Update main.py
main.py
CHANGED
@@ -1,117 +1,124 @@
+# Import necessary libraries (ensure all required imports are at the top)
 import os
 import pandas as pd
 from fastapi import FastAPI, HTTPException, Body
 from pydantic import BaseModel, Field
-from typing import List, Dict, Any
+from typing import List, Dict, Any #<-- Make sure Any is imported
 from datasets import load_dataset, Dataset, DatasetDict
 from huggingface_hub import HfApi, hf_hub_download
 from datetime import datetime, timezone
 import logging
-import uvicorn
-import random
+import uvicorn
+import random
 
+# --- Constants and Config ---
 tool_threshold = 3
 step_threshold = 5
-
-# --- Configuration ---
 HF_DATASET_ID = "agents-course/unit4-students-scores"
-
-#
-
-questions_for_api: List[Dict[str,
+
+# --- Data Structures ---
+# questions_for_api will now hold richer dictionaries
+questions_for_api: List[Dict[str, Any]] = []
 ground_truth_answers: Dict[str, str] = {}
+
 # --- Logging Setup ---
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-
+
+# --- Filtered Dataset Placeholder ---
+# Note: Making filtered_dataset global might not be ideal in larger apps,
+# but keeping it as is based on the original code.
+filtered_dataset = None
+
+# --- Modified load_questions Function ---
 def load_questions():
     global filtered_dataset
     global questions_for_api
     global ground_truth_answers
     tempo_filtered=[]
-    # Clear existing data
+    # Clear existing data
     questions_for_api.clear()
     ground_truth_answers.clear()
 
     logger.info("Starting to load and filter GAIA dataset...")
     try:
-
-
+        # Load the 'validation' split specifically if that's intended
+        dataset = load_dataset("gaia-benchmark/GAIA", "2023_level1", split='validation', trust_remote_code=True)
+        logger.info("GAIA dataset validation split loaded.")
     except Exception as e:
-        logger.error(f"Failed to load GAIA dataset: {e}", exc_info=True)
-        # Decide how to handle this: maybe raise the error or exit
         raise RuntimeError("Could not load the primary GAIA dataset.") from e
+        logger.error(f"Failed to load GAIA dataset validation split: {e}", exc_info=True)
 
-
-
+    # --- Filtering Logic (remains the same) ---
+    for question in dataset: # Iterate directly over the loaded split
+        metadata = question.get('Annotator Metadata')
 
-        if metadata:
+        if metadata:
             num_tools_str = metadata.get('Number of tools')
             num_steps_str = metadata.get('Number of steps')
 
-            # Check if both numbers exist before trying to convert
             if num_tools_str is not None and num_steps_str is not None:
                 try:
-                    # Convert values to integers for comparison
                     num_tools = int(num_tools_str)
                     num_steps = int(num_steps_str)
 
-                    # Apply the filter conditions
                     if num_tools < tool_threshold and num_steps < step_threshold:
-
-                        # logger.debug(question) # Print the matching question dictionary
-                        # logger.debug("------------------------------------------------------------------")
-                        tempo_filtered.append(question) # Add to the filtered list
-                    # else: # Optional: Handle items that don't match the filter
-                        # logger.debug(f"Skipping Task ID: {question.get('task_id', 'N/A')} - Tools: {num_tools}, Steps: {num_steps}")
+                        tempo_filtered.append(question)
                 except ValueError:
-
-
-
+                    logger.warning(f"Skipping Task ID: {question.get('task_id', 'N/A')} - Could not convert tool/step count: tools='{num_tools_str}', steps='{num_steps_str}'.")
+                # else: # Optional: Log if numbers are missing
+                #     logger.debug(f"Skipping Task ID: {question.get('task_id', 'N/A')} - Missing tool/step count in metadata.")
         else:
             logger.warning(f"Skipping Task ID: {question.get('task_id', 'N/A')} - Missing 'Annotator Metadata'.")
-            # logger.debug("------------------------------------------------------------------")
 
-
+    # Store the filtered list (optional, could process directly)
+    filtered_dataset = tempo_filtered
     logger.info(f"Found {len(filtered_dataset)} questions matching the criteria (tools < {tool_threshold}, steps < {step_threshold}).")
-    # print(filtered_dataset) # Keep this commented unless debugging
 
+    # --- Processing Logic (Modified) ---
     processed_count = 0
     for item in filtered_dataset:
         task_id = item.get('task_id')
-        question_text = item.get('Question')
+        question_text = item.get('Question') # Keep original key for now
        final_answer = item.get('Final answer')
-
-        # Validate required fields
+
+        # Validate required fields needed for processing/scoring
         if task_id and question_text and final_answer is not None:
-            # Create a copy
-            processed_item = item.copy()
-
-
-
-
-
-
-
-            # Store
+            # Create a copy to avoid modifying the original item in filtered_dataset
+            processed_item: Dict[str, Any] = item.copy()
+
+            # Remove the fields we explicitly want to exclude
+            processed_item.pop('Final answer', None)
+            processed_item.pop('Annotator Annotation', None)
+            # You could add more fields to pop here if needed later
+            # processed_item.pop('Another field to remove', None)
+
+            # Store the dictionary containing all remaining fields
             questions_for_api.append(processed_item)
-
-            #
+
+            # Store the ground truth answer separately for scoring
             ground_truth_answers[str(task_id)] = str(final_answer)
             processed_count += 1
         else:
-
+            # Log which required field was missing if possible
+            missing = [k for k, v in {'task_id': task_id, 'Question': question_text, 'Final answer': final_answer}.items() if not v and v is not None]
+            logger.warning(f"Skipping item due to missing required fields ({', '.join(missing)}): task_id={task_id}")
+
+    logger.info(f"Successfully processed {processed_count} questions into API format.")
 
     if not questions_for_api:
         logger.error("CRITICAL: No valid questions loaded after filtering. API endpoints needing questions will fail.")
-        # Depending on requirements, you might want to exit or raise an error here
         # raise RuntimeError("Failed to load mandatory question data after filtering.")
 
-# --- Pydantic Models
+# --- Pydantic Models ---
+# Keep Question simple for potential internal use or basic validation,
+# but the API will return Dict[str, Any]
 class Question(BaseModel):
     task_id: str
-
+    Question: str # Keep original casing if that's what in the data
 
+# Keep other models as they are (AnswerItem, Submission, ScoreResponse, ErrorResponse)
+# ... (rest of the Pydantic models remain the same) ...
 class AnswerItem(BaseModel):
     task_id: str
     submitted_answer: str = Field(..., description="The agent's answer for the task_id")

@@ -132,35 +139,30 @@ class ScoreResponse(BaseModel):
 class ErrorResponse(BaseModel):
     detail: str
 
+
 # --- FastAPI Application ---
 app = FastAPI(
     title="Agent Evaluation API",
     description="API to fetch questions and submit agent answers for scoring.",
 )
 
-# --- Startup Event
+# --- Startup Event ---
 @app.on_event("startup")
 async def startup_event():
-    """
-    Loads the questions when the FastAPI application starts.
-    """
     logger.info("Application startup: Loading questions...")
     try:
-        load_questions()
+        load_questions()
         if not questions_for_api:
-            logger.error("CRITICAL: No questions were loaded during startup.
-            # Depending on requirements, you might want the app to fail startup
-            # raise RuntimeError("Failed to load mandatory question data.")
+            logger.error("CRITICAL: No questions were loaded during startup.")
         else:
             logger.info(f"Successfully loaded {len(questions_for_api)} questions.")
     except Exception as e:
         logger.error(f"CRITICAL ERROR DURING STARTUP while loading questions: {e}", exc_info=True)
-        # Decide if the app should exit if loading fails
         # import sys
-        # sys.exit(1)
+        # sys.exit(1) # Consider exiting if questions are critical
 
-
-#
+# --- Helper Function (update_huggingface_dataset remains the same) ---
+# ... (update_huggingface_dataset function code) ...
 def update_huggingface_dataset(username: str, score: float):
     """Loads the dataset, updates the score if higher, and pushes back."""
     try:

@@ -242,7 +244,7 @@ def update_huggingface_dataset(username: str, score: float):
 
         updated_ds = DatasetDict({'train': Dataset.from_pandas(df)})
         logger.info(f"Dataset to push: {updated_ds}") # Log the dataset structure
-        updated_ds.push_to_hub(HF_DATASET_ID) #
+        # updated_ds.push_to_hub(HF_DATASET_ID) # Uncomment this line to enable leaderboard updates
         logger.warning("Dataset push to hub is currently commented out. Uncomment the line above to enable leaderboard updates.") # REMINDER
         logger.info("Dataset push simulated/attempted.")
         return True

@@ -254,35 +256,35 @@ def update_huggingface_dataset(username: str, score: float):
         # Re-raise the exception to be caught by the endpoint handler
         raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}")
 
-
-# --- API Endpoints ---
+# --- API Endpoints (Modified response_model) ---
 
 @app.get("/questions",
-
-
-
+         # Return a list of dictionaries with arbitrary keys/values
+         response_model=List[Dict[str, Any]],
+         summary="Get All Filtered Questions (Full Data)",
+         description="Returns the complete list of questions with all associated data (excluding answer/annotation) filtered based on criteria.")
 async def get_questions():
     """
-    Provides the list of questions that agents should answer.
+    Provides the list of questions (with extended data) that agents should answer.
     """
-    # print(f"Returning {len(questions_for_api)} questions.") # Debug log
     if not questions_for_api:
         logger.error("GET /questions requested but no questions are loaded.")
         raise HTTPException(status_code=404, detail="No questions available.")
+    # questions_for_api now contains the richer dictionaries
     return questions_for_api
 
-# --- NEW ENDPOINT ---
 @app.get("/random-question",
-
-
-
+         # Return a single dictionary with arbitrary keys/values
+         response_model=Dict[str, Any],
+         summary="Get One Random Question (Full Data)",
+         description="Returns a single random question with all associated data (excluding answer/annotation) from the available filtered set.",
         responses={
-            200: {"description": "A random question."},
+            200: {"description": "A random question with its full data."},
             404: {"model": ErrorResponse, "description": "No questions available to choose from."}
         })
 async def get_random_question():
     """
-    Provides a single, randomly selected question
+    Provides a single, randomly selected question with its extended data.
     """
     if not questions_for_api:
         logger.warning("GET /random-question requested but no questions are loaded.")

@@ -290,11 +292,11 @@ async def get_random_question():
 
     # Select and return a random question dictionary
     random_question = random.choice(questions_for_api)
-    logger.info(f"Returning random question with task_id: {random_question
+    logger.info(f"Returning random question with task_id: {random_question.get('task_id', 'N/A')}")
+    # random_question is already the richer dictionary
     return random_question
-# --- END NEW ENDPOINT ---
-
 
+# --- Submit Endpoint (remains the same, uses ground_truth_answers) ---
 @app.post("/submit",
           response_model=ScoreResponse,
           summary="Submit Agent Answers",

@@ -358,17 +360,22 @@ async def submit_answers(submission: Submission = Body(...)):
             logger.debug(f"Incorrect answer for {task_id} from {submission.username}. Submitted: '{submitted}', Expected: '{ground_truth}'")
 
 
-    # Calculate score based on valid attempts
+    # Calculate score based on valid attempts AND total number of questions available
     if valid_attempted_count == 0:
         score = 0.0
         message = f"Submission received, but no valid/matching task IDs were found in the {total_attempted_in_payload} answers provided."
         logger.warning(f"No valid answers processed for {submission.username} out of {total_attempted_in_payload} submitted.")
+    elif not ground_truth_answers: # Prevent division by zero if no questions loaded
+        score = 0.0
+        message = "Score cannot be calculated because no ground truth answers are loaded."
+        logger.error(f"Cannot calculate score for {submission.username}: ground_truth_answers is empty.")
     else:
+        # Score is based on correct answers divided by the TOTAL number of questions in the filtered set
         score = round((correct_count / len(ground_truth_answers)) * 100, 2)
-        message = f"Score calculated successfully: {correct_count}/{
+        message = f"Score calculated successfully: {correct_count}/{len(ground_truth_answers)} total questions answered correctly ({valid_attempted_count} valid tasks attempted)."
         if valid_attempted_count < total_attempted_in_payload:
             message += f" ({total_attempted_in_payload - valid_attempted_count} submitted answers had invalid or duplicate task IDs)."
-        logger.info(f"Score for {submission.username}: {score}% ({correct_count}/{valid_attempted_count})")
+        logger.info(f"Score for {submission.username}: {score}% ({correct_count}/{len(ground_truth_answers)} correct, based on {valid_attempted_count} valid attempts)")
 
 
     # Update Hugging Face dataset

@@ -401,22 +408,18 @@ async def submit_answers(submission: Submission = Body(...)):
     )
 
 # --- Run the application ---
-# This part is mainly for local development without Docker.
-# Docker uses the CMD instruction in the Dockerfile.
 if __name__ == "__main__":
     logger.info("Starting FastAPI server for local development...")
-    # Explicitly call load_questions here for local run,
-    # as the @app.on_event("startup") might not trigger reliably
-    # depending on how uvicorn is invoked directly.
     try:
-        load_questions()
+        load_questions() # Load questions before starting server
         if not questions_for_api:
             logger.error("EXITING: Cannot start server without loaded questions.")
+            # Optional: exit if questions are essential
+            # import sys
+            # sys.exit(1)
         else:
-            # Read port from environment variable for consistency, default to 8000 for local if not set
             local_port = int(os.getenv("PORT", "8000"))
             logger.info(f"Running Uvicorn locally on http://127.0.0.1:{local_port}")
-            # Note: host='127.0.0.1' is usually fine for local runs outside docker
            uvicorn.run(app, host="127.0.0.1", port=local_port, log_level="info")
     except Exception as e:
         logger.error(f"Failed to start server: {e}", exc_info=True)
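Note: the processing hunk above copies each filtered item and pops 'Final answer' (and 'Annotator Annotation') before exposing it, so the answer is kept only in ground_truth_answers. A minimal sketch of that copy-and-pop pattern, using an invented item rather than real GAIA data:

# Toy item (invented fields) illustrating the copy-and-pop logic in load_questions()
item = {"task_id": "abc123", "Question": "What is 2 + 2?", "Final answer": "4"}

processed_item = item.copy()
processed_item.pop("Final answer", None)          # excluded from the API payload
processed_item.pop("Annotator Annotation", None)  # no-op if the key is absent

questions_for_api = [processed_item]
ground_truth_answers = {str(item["task_id"]): str(item["Final answer"])}

print(questions_for_api)     # [{'task_id': 'abc123', 'Question': 'What is 2 + 2?'}]
print(ground_truth_answers)  # {'abc123': '4'}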
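Note: the scoring formula itself is unchanged in this commit; the score is the percentage of all loaded questions answered correctly, not of the questions attempted, and the new `elif not ground_truth_answers:` branch only guards that formula against an empty ground-truth map. A small illustration with made-up numbers:

# Made-up numbers illustrating what the /submit score reports
correct_count = 3            # correct answers in this submission
valid_attempted_count = 5    # answers whose task_id matched a loaded question
ground_truth_answers = {f"task-{i}": "x" for i in range(20)}  # pretend 20 questions loaded

score = round((correct_count / len(ground_truth_answers)) * 100, 2)
print(score)  # 15.0 -> 3 correct out of 20 total questions, even though only 5 were attempted

# With no questions loaded, the division above would raise ZeroDivisionError;
# that is the case the new elif branch short-circuits.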
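Note: a minimal client-side sketch of how an agent might exercise the updated endpoints. The base URL, the use of `requests`, and the `answers` field name on the submission payload are assumptions for illustration and are not defined in this diff:

# Hypothetical client sketch (assumed base URL and payload field names)
import requests

BASE_URL = "http://127.0.0.1:8000"  # assumed local dev address from the __main__ block

# Fetch one random question; the response is the full question dict
# with 'Final answer' already removed server-side.
question = requests.get(f"{BASE_URL}/random-question", timeout=30).json()
print(question["task_id"], question.get("Question", "")[:80])

# Submit answers for scoring; item shape follows the AnswerItem model shown above,
# while "username" and "answers" as the wrapper fields are assumptions.
payload = {
    "username": "example-user",  # hypothetical username
    "answers": [
        {"task_id": question["task_id"], "submitted_answer": "42"}  # placeholder answer
    ],
}
result = requests.post(f"{BASE_URL}/submit", json=payload, timeout=60)
print(result.status_code, result.json())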