Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -136,8 +136,20 @@ async def collect_and_store_data():
|
|
| 136 |
logger.info(f"Creating new dataset (existing not found): {e}")
|
| 137 |
combined_df = new_df
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
# Convert back to dataset and push
|
| 140 |
-
new_dataset = Dataset.from_pandas(combined_df)
|
| 141 |
new_dataset.push_to_hub(DATASET_REPO_NAME, token=HF_TOKEN, private=False)
|
| 142 |
|
| 143 |
logger.info(f"Successfully stored data for {len(results)} providers")
|
|
|
|
| 136 |
logger.info(f"Creating new dataset (existing not found): {e}")
|
| 137 |
combined_df = new_df
|
| 138 |
|
| 139 |
+
# De-duplicate by (provider, monthly_requests_int), keeping the earliest timestamp for each pair
|
| 140 |
+
combined_df['timestamp'] = pd.to_datetime(combined_df['timestamp'])
|
| 141 |
+
combined_df = combined_df.sort_values('timestamp')
|
| 142 |
+
|
| 143 |
+
# Group by provider and monthly_requests_int, keep first (earliest) occurrence
|
| 144 |
+
deduplicated_df = combined_df.groupby(['provider', 'monthly_requests_int']).first().reset_index()
|
| 145 |
+
|
| 146 |
+
# Convert timestamp back to string format
|
| 147 |
+
deduplicated_df['timestamp'] = deduplicated_df['timestamp'].dt.strftime('%Y-%m-%dT%H:%M:%S.%f%z')
|
| 148 |
+
|
| 149 |
+
logger.info(f"De-duplicated dataset: {len(combined_df)} -> {len(deduplicated_df)} records")
|
| 150 |
+
|
| 151 |
# Convert back to dataset and push
|
| 152 |
+
new_dataset = Dataset.from_pandas(deduplicated_df)
|
| 153 |
new_dataset.push_to_hub(DATASET_REPO_NAME, token=HF_TOKEN, private=False)
|
| 154 |
|
| 155 |
logger.info(f"Successfully stored data for {len(results)} providers")
|