Spaces:
				
			
			
	
			
			
		Paused
		
	
	
	
			
			
	
	
	
	
		
		
		Paused
		
	
		Jason
		
	committed on
		
		
					Disable HF account age requirement; submission fixes (#76)
Browse files
- submission.py +25 -72
 
    	
        submission.py
    CHANGED
    
    | 
         @@ -4,6 +4,7 @@ import sys 
     | 
|
| 4 | 
         
             
            import matplotlib
         
     | 
| 5 | 
         
             
            from agenteval.cli import SUBMISSION_METADATA_FILENAME
         
     | 
| 6 | 
         
             
            from agenteval.models import SubmissionMetadata
         
     | 
| 
         | 
|
| 7 | 
         
             
            from gradio_modal import Modal
         
     | 
| 8 | 
         | 
| 9 | 
         
             
            matplotlib.use('Agg')
         
     | 
| 
         @@ -62,6 +63,8 @@ def try_load_dataset_submission(*args, **kwargs) -> DatasetDict: # Renamed to av 
     | 
|
| 62 | 
         
             
                    return DatasetDict()
         
     | 
| 63 | 
         
             
                except ValueError: # Handles cases where dataset is empty or ill-formed
         
     | 
| 64 | 
         
             
                    return DatasetDict()
         
     | 
| 
         | 
|
| 
         | 
|
| 65 | 
         | 
| 66 | 
         
             
            def checked_upload_folder(
         
     | 
| 67 | 
         
             
                    api_hf: HfApi, # Renamed to avoid conflict with global api
         
     | 
| 
         @@ -138,17 +141,16 @@ def add_new_eval( 
     | 
|
| 138 | 
         | 
| 139 | 
         
             
                logger.debug(f"agent {agent_name}: User account age check {profile.username}")
         
     | 
| 140 | 
         
             
                try:
         
     | 
| 141 | 
         
            -
                     
     | 
| 142 | 
         
            -
                     
     | 
| 143 | 
         
            -
                     
     | 
| 144 | 
         
            -
                     
     | 
| 145 | 
         
            -
                     
     | 
| 146 | 
         
            -
             
     | 
| 147 | 
         
            -
             
     | 
| 148 | 
         
            -
             
     | 
| 149 | 
         
            -
             
     | 
| 150 | 
         
            -
             
     | 
| 151 | 
         
            -
                        )
         
     | 
| 152 | 
         
             
                except Exception as e:
         
     | 
| 153 | 
         
             
                    logger.warning(f"Error checking user account age: {e}")
         
     | 
| 154 | 
         
             
                    return (
         
     | 
| 
         @@ -271,9 +273,8 @@ def add_new_eval( 
     | 
|
| 271 | 
         
             
                contact_info["submit_time"] = submission_time.isoformat()
         
     | 
| 272 | 
         
             
                contact_info["username_auth"] = profile.username
         
     | 
| 273 | 
         
             
                contact_info["email"] = email
         
     | 
| 274 | 
         
            -
                contact_info["email_opt_in"] = email_opt_in 
     | 
| 275 | 
         
            -
                contact_info["role"] = role 
     | 
| 276 | 
         
            -
                contact_info
         
     | 
| 277 | 
         | 
| 278 | 
         
             
                logger.debug(f"agent {agent_name}: Contact info: {contact_info}")
         
     | 
| 279 | 
         
             
                if val_or_test in contact_infos:
         
     | 
| 
         @@ -299,65 +300,14 @@ def add_new_eval( 
     | 
|
| 299 | 
         
             
                    gr.update(visible=False)                            # loading_modal
         
     | 
| 300 | 
         
             
                )
         
     | 
| 301 | 
         | 
| 302 | 
         
            -
            def _deprecated_scoring_logic():
         
     | 
| 303 | 
         
            -
                # No longer triggered on eval submission. Kept for quick reference for a little while (2025). TODO delete this.
         
     | 
| 304 | 
         
            -
             
     | 
| 305 | 
         
            -
                # 3. Process and score the submission
         
     | 
| 306 | 
         
            -
                eval_result_obj = None # Define to avoid NameError
         
     | 
| 307 | 
         
            -
                try:
         
     | 
| 308 | 
         
            -
                    json_path = Path(extracted_dir) / AGENTEVAL_MANIFEST_NAME
         
     | 
| 309 | 
         
            -
                    if not json_path.exists():
         
     | 
| 310 | 
         
            -
                        return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME} in submission.")
         
     | 
| 311 | 
         | 
| 312 | 
         
            -
             
     | 
| 313 | 
         
            -
             
     | 
| 314 | 
         
            -
             
     | 
| 315 | 
         
            -
             
     | 
| 316 | 
         
            -
             
     | 
| 
         | 
|
| 317 | 
         | 
| 318 | 
         
            -
                    # Re-compute results from logs for integrity
         
     | 
| 319 | 
         
            -
                    eval_result_obj.results = process_eval_logs(extracted_dir)[0] # Assuming process_eval_logs returns a tuple/list
         
     | 
| 320 | 
         
            -
                    eval_result_obj.save_json(str(json_path)) # Save the re-processed manifest
         
     | 
| 321 | 
         
            -
             
     | 
| 322 | 
         
            -
                except Exception as e:
         
     | 
| 323 | 
         
            -
                    return format_error(f"Error scoring submission: {e}. Check manifest and log files.")
         
     | 
| 324 | 
         
            -
             
     | 
| 325 | 
         
            -
                # 4. Upload scored submission files
         
     | 
| 326 | 
         
            -
                logs_url_private_val, logs_url_public_val = None, None
         
     | 
| 327 | 
         
            -
                scored_submission_name = f"{submission_name}_scored"
         
     | 
| 328 | 
         
            -
                if not LOCAL_DEBUG:
         
     | 
| 329 | 
         
            -
                    try:
         
     | 
| 330 | 
         
            -
                        logs_url_private_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
         
     | 
| 331 | 
         
            -
                        if val_or_test == "validation" and not IS_INTERNAL: # Public copy for validation
         
     | 
| 332 | 
         
            -
                            logs_url_public_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET_PUBLIC, CONFIG_NAME, val_or_test, scored_submission_name)
         
     | 
| 333 | 
         
            -
                    except ValueError as e: return format_error(str(e))
         
     | 
| 334 | 
         
            -
                    except Exception as e: return format_error(f"Failed to upload scored submission: {e}")
         
     | 
| 335 | 
         
            -
                else: print("mock uploaded scored submission", flush=True)
         
     | 
| 336 | 
         
            -
             
     | 
| 337 | 
         
            -
             
     | 
| 338 | 
         
            -
                # Update LeaderboardSubmission with submission details
         
     | 
| 339 | 
         
            -
                eval_result_obj.submission.agent_name = agent_name
         
     | 
| 340 | 
         
            -
                eval_result_obj.submission.agent_description = agent_description
         
     | 
| 341 | 
         
            -
                eval_result_obj.submission.agent_url = agent_url
         
     | 
| 342 | 
         
            -
                eval_result_obj.submission.openness = openness
         
     | 
| 343 | 
         
            -
                eval_result_obj.submission.degree_of_control = degree_of_control
         
     | 
| 344 | 
         
            -
                eval_result_obj.submission.username = username
         
     | 
| 345 | 
         
            -
                eval_result_obj.submission.submit_time = submission_time
         
     | 
| 346 | 
         
            -
                eval_result_obj.submission.logs_url = logs_url_private_val
         
     | 
| 347 | 
         
            -
                eval_result_obj.submission.logs_url_public = logs_url_public_val
         
     | 
| 348 | 
         
            -
             
     | 
| 349 | 
         
            -
                # 5. Upload summary statistics to RESULTS_DATASET (for the leaderboard)
         
     | 
| 350 | 
         
            -
                if not LOCAL_DEBUG:
         
     | 
| 351 | 
         
            -
                    try:
         
     | 
| 352 | 
         
            -
                        upload_summary_to_hf(api, eval_result_obj, RESULTS_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
         
     | 
| 353 | 
         
            -
                    except Exception as e:
         
     | 
| 354 | 
         
            -
                        return format_error(f"Failed to upload summary results to leaderboard: {e}")
         
     | 
| 355 | 
         
            -
                else: print("mock uploaded results to lb", flush=True)
         
     | 
| 356 | 
         
            -
             
     | 
| 357 | 
         
            -
                return format_log(
         
     | 
| 358 | 
         
            -
                    f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split. "
         
     | 
| 359 | 
         
            -
                    "Please refresh the leaderboard in a few moments. It may take some time for changes to propagate."
         
     | 
| 360 | 
         
            -
                )
         
     | 
| 361 | 
         | 
| 362 | 
         
             
            openness_label_html = """
         
     | 
| 363 | 
         
             
            <div class="form-label-with-tooltip">
         
     | 
| 
         @@ -422,7 +372,10 @@ def build_page(): 
     | 
|
| 422 | 
         
             
                    with gr.Group(elem_classes="custom-form-group"):
         
     | 
| 423 | 
         
             
                        gr.HTML(value="""<h2>Agent Information</h2>""", elem_id="agent-info-label-html")
         
     | 
| 424 | 
         
             
                        gr.HTML(value="""<h3>Split</h3>""", elem_classes="form-label")
         
     | 
| 425 | 
         
            -
                        level_of_test_radio = gr.Radio([ 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 426 | 
         
             
                        gr.HTML(value="""<h3>Agent name</h3>""", elem_classes="form-label")
         
     | 
| 427 | 
         
             
                        agent_name_tb = gr.Textbox(label="This is how your agent will appear on the leaderboard. Use a clear, descriptive name (e.g., Asta Scholar QA, Perplexity Deep Research). Omit model names (e.g. GPT-4, Mistral) as they’ll be shown automatically based on your logs.")
         
     | 
| 428 | 
         
             
                        gr.HTML(value="""<h3>Agent description</h3>""", elem_classes="form-label")
         
     | 
| 
         | 
|
| 4 | 
         
             
            import matplotlib
         
     | 
| 5 | 
         
             
            from agenteval.cli import SUBMISSION_METADATA_FILENAME
         
     | 
| 6 | 
         
             
            from agenteval.models import SubmissionMetadata
         
     | 
| 7 | 
         
            +
            from datasets.exceptions import DataFilesNotFoundError
         
     | 
| 8 | 
         
             
            from gradio_modal import Modal
         
     | 
| 9 | 
         | 
| 10 | 
         
             
            matplotlib.use('Agg')
         
     | 
| 
         | 
|
| 63 | 
         
             
                    return DatasetDict()
         
     | 
| 64 | 
         
             
                except ValueError: # Handles cases where dataset is empty or ill-formed
         
     | 
| 65 | 
         
             
                    return DatasetDict()
         
     | 
| 66 | 
         
            +
                except DataFilesNotFoundError:
         
     | 
| 67 | 
         
            +
                    return DatasetDict()
         
     | 
| 68 | 
         | 
| 69 | 
         
             
            def checked_upload_folder(
         
     | 
| 70 | 
         
             
                    api_hf: HfApi, # Renamed to avoid conflict with global api
         
     | 
| 
         | 
|
| 141 | 
         | 
| 142 | 
         
             
                logger.debug(f"agent {agent_name}: User account age check {profile.username}")
         
     | 
| 143 | 
         
             
                try:
         
     | 
| 144 | 
         
            +
                    # Account age check disabled for launch.
         
     | 
| 145 | 
         
            +
                    # https://github.com/allenai/astabench-issues/issues/419
         
     | 
| 146 | 
         
            +
                    # if _is_hf_acct_too_new(submission_time, profile.username):
         
     | 
| 147 | 
         
            +
                    #     return (
         
     | 
| 148 | 
         
            +
                    #         format_error("This account is not authorized to submit here (account too new)."),  # error_message
         
     | 
| 149 | 
         
            +
                    #         gr.update(visible=True),                            # error_modal
         
     | 
| 150 | 
         
            +
                    #         gr.update(visible=False),                           # success_modal
         
     | 
| 151 | 
         
            +
                    #         gr.update(visible=False)                            # loading_modal
         
     | 
| 152 | 
         
            +
                    #     )
         
     | 
| 153 | 
         
            +
                    pass
         
     | 
| 
         | 
|
| 154 | 
         
             
                except Exception as e:
         
     | 
| 155 | 
         
             
                    logger.warning(f"Error checking user account age: {e}")
         
     | 
| 156 | 
         
             
                    return (
         
     | 
| 
         | 
|
| 273 | 
         
             
                contact_info["submit_time"] = submission_time.isoformat()
         
     | 
| 274 | 
         
             
                contact_info["username_auth"] = profile.username
         
     | 
| 275 | 
         
             
                contact_info["email"] = email
         
     | 
| 276 | 
         
            +
                contact_info["email_opt_in"] = email_opt_in
         
     | 
| 277 | 
         
            +
                contact_info["role"] = role
         
     | 
| 
         | 
|
| 278 | 
         | 
| 279 | 
         
             
                logger.debug(f"agent {agent_name}: Contact info: {contact_info}")
         
     | 
| 280 | 
         
             
                if val_or_test in contact_infos:
         
     | 
| 
         | 
|
| 300 | 
         
             
                    gr.update(visible=False)                            # loading_modal
         
     | 
| 301 | 
         
             
                )
         
     | 
| 302 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 303 | 
         | 
| 304 | 
         
            +
            def _is_hf_acct_too_new(submission_time: datetime, username: str):
         
     | 
| 305 | 
         
            +
                user_data_resp = requests.get(f"https://huggingface.co/api/users/{username}/overview")
         
     | 
| 306 | 
         
            +
                user_data_resp.raise_for_status()
         
     | 
| 307 | 
         
            +
                creation_date_str = user_data_resp.json()["createdAt"]
         
     | 
| 308 | 
         
            +
                created_at = datetime.strptime(creation_date_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
         
     | 
| 309 | 
         
            +
                return submission_time - created_at < timedelta(days=60)
         
     | 
| 310 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 311 | 
         | 
| 312 | 
         
             
            openness_label_html = """
         
     | 
| 313 | 
         
             
            <div class="form-label-with-tooltip">
         
     | 
| 
         | 
|
| 372 | 
         
             
                    with gr.Group(elem_classes="custom-form-group"):
         
     | 
| 373 | 
         
             
                        gr.HTML(value="""<h2>Agent Information</h2>""", elem_id="agent-info-label-html")
         
     | 
| 374 | 
         
             
                        gr.HTML(value="""<h3>Split</h3>""", elem_classes="form-label")
         
     | 
| 375 | 
         
            +
                        level_of_test_radio = gr.Radio(choices=[
         
     | 
| 376 | 
         
            +
                            ("Test set", "test"),
         
     | 
| 377 | 
         
            +
                            ("Validation set", "validation"),
         
     | 
| 378 | 
         
            +
                        ], elem_classes="form-label-fieldset", value="validation", label="The Test Set is used for final leaderboard rankings. The Validation Set is for development and iteration. Choose based on your evaluation goal.")
         
     | 
| 379 | 
         
             
                        gr.HTML(value="""<h3>Agent name</h3>""", elem_classes="form-label")
         
     | 
| 380 | 
         
             
                        agent_name_tb = gr.Textbox(label="This is how your agent will appear on the leaderboard. Use a clear, descriptive name (e.g., Asta Scholar QA, Perplexity Deep Research). Omit model names (e.g. GPT-4, Mistral) as they’ll be shown automatically based on your logs.")
         
     | 
| 381 | 
         
             
                        gr.HTML(value="""<h3>Agent description</h3>""", elem_classes="form-label")
         
     |