piclets-server / init_dataset.py
Fraser's picture
UPDATE
f9201f6
raw
history blame
6.72 kB
"""
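Initialize the HuggingFace dataset structure for the Piclets Discovery game.

This script sets up the required directory structure and initial files
in the Fraser/piclets dataset repository (override the target with the
DATASET_REPO environment variable).

Usage:
    python init_dataset.py            # create and upload the initial structure
    python init_dataset.py --verify   # check that the structure already exists

Requires:
    - HF_TOKEN set as an environment variable (or as a HuggingFace Space secret)
    - Write access to the target dataset repository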
"""
import json
import os
from datetime import datetime
from huggingface_hub import HfApi, hf_hub_download
from pathlib import Path
# Configuration (read once, at import time, from the process environment)
# Target dataset repository; falls back to the default repo when unset.
DATASET_REPO = os.environ.get("DATASET_REPO", "Fraser/piclets")
# Access token used for both upload and verification; None when unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
def init_metadata_files():
    """Build the initial contents of the global metadata files.

    Returns:
        A ``(stats, leaderboard)`` tuple of plain dicts ready to be dumped
        as JSON. Both carry the same ``lastUpdated`` ISO-8601 timestamp.
    """
    # Take the timestamp once so both files, which are created by the same
    # initialization run, agree on when they were last updated.
    # NOTE(review): this is naive local time, as before; confirm whether
    # consumers of these files expect UTC.
    now_iso = datetime.now().isoformat()

    # Initial stats: every counter starts at zero.
    stats = {
        "totalPiclets": 0,
        "totalVariations": 0,
        "totalScans": 0,
        "totalUsers": 0,
        "lastUpdated": now_iso,
    }

    # Initial leaderboard (empty)
    leaderboard = {
        "topDiscoverers": [],
        "recentActivity": [],
        "lastUpdated": now_iso,
    }

    return stats, leaderboard
def create_example_files():
    """Create example user and piclet records for testing.

    Returns:
        An ``(example_user, example_piclet)`` tuple of plain dicts. All
        timestamps inside both records are the same ISO-8601 string.
    """
    # A freshly created record has, by definition, joined/been seen and been
    # created/updated at the same instant, so sample the clock only once.
    # NOTE(review): this is naive local time, as before; confirm whether
    # consumers of these records expect UTC.
    now_iso = datetime.now().isoformat()

    # Example user profile
    example_user = {
        "sub": "example_123456",
        "preferred_username": "example_user",
        "name": "Example User",
        "picture": None,
        "joinedAt": now_iso,
        "lastSeen": now_iso,
        "discoveries": [],
        "uniqueFinds": 0,
        "totalFinds": 0,
        "rarityScore": 0,
        "visibility": "public",
    }

    # Example piclet (empty canonical, ready for discoveries)
    example_piclet = {
        "canonical": None,
        "variations": [],
        "metadata": {
            "created": now_iso,
            "lastUpdated": now_iso,
        },
    }

    return example_user, example_piclet
# README uploaded to the root of the dataset repository.
_README_CONTENT = """# Piclets Discovery Dataset

This dataset stores the canonical Piclets, variations, and user profiles for the Piclets Discovery game.

## Structure

```
metadata/
  stats.json           # Global statistics
  leaderboard.json     # Top discoverers and recent activity
users/
  {sub}.json          # User profiles keyed by HuggingFace user ID
piclets/
  {normalized_name}.json  # Canonical piclet + variations
```

## Authentication

All endpoints require HuggingFace OAuth tokens:
- Frontend sends `Authorization: Bearer <token>` headers
- Server verifies via `https://huggingface.co/oauth/userinfo`
- User profiles use stable `sub` field as primary key

## API

Server endpoint: `Fraser/piclets-server` (HuggingFace Space)

See server repository for full API documentation.
"""


def _write_json(path, payload):
    """Dump *payload* to *path* as UTF-8 JSON with 2-space indentation."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


def _write_text(path, text):
    """Write *text* to *path* as UTF-8."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)


def _stage_dataset_files(staging_dir):
    """Create the initial dataset layout (metadata, users, piclets, README) under *staging_dir*."""
    # 1. Create metadata directory and files
    metadata_dir = staging_dir / "metadata"
    metadata_dir.mkdir(exist_ok=True)
    stats, leaderboard = init_metadata_files()
    _write_json(metadata_dir / "stats.json", stats)
    print("✓ Created metadata/stats.json")
    _write_json(metadata_dir / "leaderboard.json", leaderboard)
    print("✓ Created metadata/leaderboard.json")

    # 2. Create users directory. Only a placeholder file is written so the
    #    otherwise-empty directory is included in the upload.
    users_dir = staging_dir / "users"
    users_dir.mkdir(exist_ok=True)
    _write_text(users_dir / ".gitkeep", "# User profiles stored here\n")
    print("✓ Created users/ directory")

    # 3. Create piclets directory, again with just a placeholder file.
    piclets_dir = staging_dir / "piclets"
    piclets_dir.mkdir(exist_ok=True)
    _write_text(
        piclets_dir / ".gitkeep",
        "# Canonical piclets and variations stored here\n",
    )
    print("✓ Created piclets/ directory")

    # 4. Create README
    _write_text(staging_dir / "README.md", _README_CONTENT)
    print("✓ Created README.md")


def upload_initial_structure():
    """Upload the initial dataset structure to HuggingFace.

    The files are staged in a fresh temporary directory and pushed to
    ``DATASET_REPO`` in one ``upload_folder`` call using ``HF_TOKEN``.

    Returns:
        True when the upload succeeded; False when ``HF_TOKEN`` is not set
        or when staging/uploading raised (the error is printed, not raised).
    """
    # Local import, in the same way this function previously imported shutil.
    import tempfile

    if not HF_TOKEN:
        print("ERROR: HF_TOKEN environment variable not set")
        print("Please set HF_TOKEN with write access to the dataset")
        return False

    print(f"Initializing dataset: {DATASET_REPO}")
    api = HfApi()

    try:
        # A unique temporary directory avoids two hazards of a fixed
        # "temp_dataset_init" folder in the working directory: uploading
        # stale files left there earlier, and deleting a directory this
        # script did not create. It is removed on exit, success or failure.
        with tempfile.TemporaryDirectory(prefix="piclets_dataset_init_") as staging:
            staging_dir = Path(staging)
            _stage_dataset_files(staging_dir)

            # 5. Upload all files to dataset
            print(f"\nUploading to {DATASET_REPO}...")
            api.upload_folder(
                folder_path=str(staging_dir),
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message="Initialize dataset structure for Piclets Discovery",
            )
    except Exception as e:
        # Top-level boundary of the script: report the failure and signal
        # it through the return value instead of a traceback.
        print(f"\n✗ Error initializing dataset: {e}")
        return False

    print("\n✓ Dataset initialized successfully!")
    print(f"View at: https://huggingface.co/datasets/{DATASET_REPO}")
    return True
def verify_dataset_structure():
    """Verify that the dataset structure exists.

    Downloads ``metadata/stats.json`` from ``DATASET_REPO`` using
    ``HF_TOKEN`` and reads its ``totalPiclets`` counter.

    Returns:
        True when the file could be downloaded and parsed; False when
        ``HF_TOKEN`` is not set or when any step raised (the error is
        printed, not raised).
    """
    if not HF_TOKEN:
        print("ERROR: HF_TOKEN not set")
        return False

    print(f"Verifying dataset structure: {DATASET_REPO}")

    try:
        # Try to download metadata files
        stats_path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename="metadata/stats.json",
            repo_type="dataset",
            token=HF_TOKEN,
        )
        # JSON is UTF-8; do not depend on the platform's default encoding.
        with open(stats_path, encoding="utf-8") as f:
            stats = json.load(f)
        # A missing "totalPiclets" key raises KeyError, which is reported
        # below as a failed verification.
        print(f"✓ Dataset exists with {stats['totalPiclets']} piclets")
        print("✓ Structure verified")
        return True
    except Exception as e:
        # Top-level boundary of the script: report and signal via return value.
        print(f"✗ Dataset not initialized or error: {e}")
        return False
if __name__ == "__main__":
    import sys

    # Exit status is 0 on success and 1 on failure in both modes.
    if len(sys.argv) > 1 and sys.argv[1] == "--verify":
        # Verify mode
        sys.exit(0 if verify_dataset_structure() else 1)

    # Initialize mode
    print("=" * 60)
    print("Piclets Discovery Dataset Initialization")
    print("=" * 60)
    print()

    if not upload_initial_structure():
        sys.exit(1)

    print("\nNext steps:")
    # Point at the repository that was actually initialized, which may have
    # been overridden through the DATASET_REPO environment variable.
    print(f"1. Verify at https://huggingface.co/datasets/{DATASET_REPO}")
    print("2. Test with: python init_dataset.py --verify")
    print("3. Deploy piclets-server to HuggingFace Space")
    sys.exit(0)