Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
|
@@ -297,9 +297,7 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
|
|
| 297 |
ds_dict = None
|
| 298 |
df = None
|
| 299 |
try:
|
| 300 |
-
|
| 301 |
-
# This might not be necessary if load_dataset handles non-existence gracefully
|
| 302 |
-
# hf_hub_download(repo_id=HF_DATASET_ID, filename="data/train-00000-of-00001.parquet", repo_type="dataset")
|
| 303 |
ds_dict = load_dataset(HF_DATASET_ID, trust_remote_code=True) # Added trust_remote_code=True if needed
|
| 304 |
logger.info("Dataset loaded successfully.")
|
| 305 |
if "train" in ds_dict:
|
|
@@ -310,22 +308,22 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
|
|
| 310 |
|
| 311 |
except Exception as load_error:
|
| 312 |
logger.warning(f"Could not load dataset '{HF_DATASET_ID}' or it's empty/new ({load_error}). Will create structure.")
|
| 313 |
-
|
| 314 |
df = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in expected_columns.items()})
|
| 315 |
|
| 316 |
-
|
| 317 |
for col, dtype in expected_columns.items():
|
| 318 |
if col not in df.columns:
|
| 319 |
logger.warning(f"Column '{col}' not found in loaded data. Adding it.")
|
| 320 |
-
|
| 321 |
df[col] = pd.Series(dtype=dtype)
|
| 322 |
|
| 323 |
-
|
| 324 |
df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0.0)
|
| 325 |
-
|
| 326 |
df['username'] = df['username'].astype(str).fillna('')
|
| 327 |
df['timestamp'] = df['timestamp'].astype(str).fillna('')
|
| 328 |
-
df['code'] = df['code'].astype(str).fillna('')
|
| 329 |
|
| 330 |
# 2. Find existing score for the user
|
| 331 |
existing_entries = df[df['username'] == username]
|
|
@@ -337,9 +335,7 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
|
|
| 337 |
max_existing_score = existing_entries['score'].max() # Already numeric
|
| 338 |
if score > max_existing_score:
|
| 339 |
logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating entry.")
|
| 340 |
-
|
| 341 |
-
df = df[df['username'] != username].copy() # Use .copy() to avoid SettingWithCopyWarning
|
| 342 |
-
# Add new entry with score and code link
|
| 343 |
new_entry = pd.DataFrame([{
|
| 344 |
'username': username,
|
| 345 |
'score': score,
|
|
@@ -366,32 +362,23 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
|
|
| 366 |
if needs_update:
|
| 367 |
logger.info(f"Preparing to push updated dataset to '{HF_DATASET_ID}'...")
|
| 368 |
|
| 369 |
-
# Ensure final DataFrame columns match the expected schema exactly before converting
|
| 370 |
-
# Select and order columns just in case
|
| 371 |
df = df[list(expected_columns.keys())]
|
| 372 |
-
# Explicitly cast types again before creating Dataset object
|
| 373 |
for col, dtype in expected_columns.items():
|
| 374 |
-
# Handle potential pandas nullable types if necessary, default to standard types
|
| 375 |
if dtype == 'str':
|
| 376 |
df[col] = df[col].astype(str).fillna('')
|
| 377 |
elif dtype == 'float':
|
| 378 |
df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0) # Ensure float conversion
|
| 379 |
-
# Add other type handling if needed
|
| 380 |
|
| 381 |
logger.info(f"Final DataFrame columns and types:\n{df.dtypes}")
|
| 382 |
logger.info(f"Sample data before push:\n{df.head().to_string()}")
|
| 383 |
|
| 384 |
-
# Create the Dataset object from the final DataFrame
|
| 385 |
updated_ds = Dataset.from_pandas(df)
|
| 386 |
-
# Wrap it in a DatasetDict (standard practice)
|
| 387 |
final_ds_dict = DatasetDict({'train': updated_ds})
|
| 388 |
|
| 389 |
logger.info(f"Dataset structure to push: {final_ds_dict}")
|
| 390 |
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
# logger.info(f"Successfully pushed updated dataset to '{HF_DATASET_ID}'.")
|
| 394 |
-
logger.warning("Dataset push to hub is currently commented out in the code. Uncomment the 'push_to_hub' line to enable leaderboard updates.")
|
| 395 |
return True
|
| 396 |
else:
|
| 397 |
logger.info("No changes needed, dataset not pushed.")
|
|
@@ -399,8 +386,7 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
|
|
| 399 |
|
| 400 |
except Exception as e:
|
| 401 |
logger.error(f"Error interacting with Hugging Face dataset '{HF_DATASET_ID}': {e}", exc_info=True)
|
| 402 |
-
|
| 403 |
-
# Adjust the exception type if not using FastAPI's HTTPException
|
| 404 |
raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}")
|
| 405 |
|
| 406 |
|
|
|
|
| 297 |
ds_dict = None
|
| 298 |
df = None
|
| 299 |
try:
|
| 300 |
+
|
|
|
|
|
|
|
| 301 |
ds_dict = load_dataset(HF_DATASET_ID, trust_remote_code=True) # Added trust_remote_code=True if needed
|
| 302 |
logger.info("Dataset loaded successfully.")
|
| 303 |
if "train" in ds_dict:
|
|
|
|
| 308 |
|
| 309 |
except Exception as load_error:
|
| 310 |
logger.warning(f"Could not load dataset '{HF_DATASET_ID}' or it's empty/new ({load_error}). Will create structure.")
|
| 311 |
+
|
| 312 |
df = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in expected_columns.items()})
|
| 313 |
|
| 314 |
+
|
| 315 |
for col, dtype in expected_columns.items():
|
| 316 |
if col not in df.columns:
|
| 317 |
logger.warning(f"Column '{col}' not found in loaded data. Adding it.")
|
| 318 |
+
|
| 319 |
df[col] = pd.Series(dtype=dtype)
|
| 320 |
|
| 321 |
+
|
| 322 |
df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0.0)
|
| 323 |
+
|
| 324 |
df['username'] = df['username'].astype(str).fillna('')
|
| 325 |
df['timestamp'] = df['timestamp'].astype(str).fillna('')
|
| 326 |
+
df['code'] = df['code'].astype(str).fillna('')
|
| 327 |
|
| 328 |
# 2. Find existing score for the user
|
| 329 |
existing_entries = df[df['username'] == username]
|
|
|
|
| 335 |
max_existing_score = existing_entries['score'].max() # Already numeric
|
| 336 |
if score > max_existing_score:
|
| 337 |
logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating entry.")
|
| 338 |
+
df = df[df['username'] != username].copy()
|
|
|
|
|
|
|
| 339 |
new_entry = pd.DataFrame([{
|
| 340 |
'username': username,
|
| 341 |
'score': score,
|
|
|
|
| 362 |
if needs_update:
|
| 363 |
logger.info(f"Preparing to push updated dataset to '{HF_DATASET_ID}'...")
|
| 364 |
|
|
|
|
|
|
|
| 365 |
df = df[list(expected_columns.keys())]
|
|
|
|
| 366 |
for col, dtype in expected_columns.items():
|
|
|
|
| 367 |
if dtype == 'str':
|
| 368 |
df[col] = df[col].astype(str).fillna('')
|
| 369 |
elif dtype == 'float':
|
| 370 |
df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0) # Ensure float conversion
|
|
|
|
| 371 |
|
| 372 |
logger.info(f"Final DataFrame columns and types:\n{df.dtypes}")
|
| 373 |
logger.info(f"Sample data before push:\n{df.head().to_string()}")
|
| 374 |
|
|
|
|
| 375 |
updated_ds = Dataset.from_pandas(df)
|
|
|
|
| 376 |
final_ds_dict = DatasetDict({'train': updated_ds})
|
| 377 |
|
| 378 |
logger.info(f"Dataset structure to push: {final_ds_dict}")
|
| 379 |
|
| 380 |
+
final_ds_dict.push_to_hub(HF_DATASET_ID)
|
| 381 |
+
logger.warning("Dataset push to hub is currently commented out in the code.")
|
|
|
|
|
|
|
| 382 |
return True
|
| 383 |
else:
|
| 384 |
logger.info("No changes needed, dataset not pushed.")
|
|
|
|
| 386 |
|
| 387 |
except Exception as e:
|
| 388 |
logger.error(f"Error interacting with Hugging Face dataset '{HF_DATASET_ID}': {e}", exc_info=True)
|
| 389 |
+
|
|
|
|
| 390 |
raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}")
|
| 391 |
|
| 392 |
|