# piclets-server / init_dataset.py
# Fraser's picture
# UPDATE
# f9201f6
"""
Initialize HuggingFace dataset structure for Piclets Discovery game
This script sets up the required directory structure and initial files
in the Fraser/piclets dataset repository.
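Usage:
python init_dataset.py            # create and upload the initial structure
python init_dataset.py --verify   # check that the dataset is already initialized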
Requires:
- HF_TOKEN environment variable or set in HuggingFace Space secrets
- Write access to Fraser/piclets dataset
"""
import json
import os
from datetime import datetime
from huggingface_hub import HfApi, hf_hub_download
from pathlib import Path
# Runtime configuration, read once from the environment at import time.
# Target dataset repository; falls back to "Fraser/piclets" when unset.
DATASET_REPO = os.environ.get("DATASET_REPO", "Fraser/piclets")
# HuggingFace access token; None when the variable is not defined.
HF_TOKEN = os.environ.get("HF_TOKEN")
def init_metadata_files():
    """Build the initial contents of the global metadata files.

    Returns:
        A ``(stats, leaderboard)`` tuple of JSON-serializable dicts: zeroed
        global counters and an empty leaderboard. Both carry the same
        ``lastUpdated`` ISO-8601 timestamp.
    """
    # Read the clock once so both files record the same initialization
    # instant rather than two values a few microseconds apart.
    now = datetime.now().isoformat()

    # Initial stats: every counter starts at zero.
    stats = {
        "totalPiclets": 0,
        "totalVariations": 0,
        "totalScans": 0,
        "totalUsers": 0,
        "lastUpdated": now,
    }

    # Initial leaderboard (empty).
    leaderboard = {
        "topDiscoverers": [],
        "recentActivity": [],
        "lastUpdated": now,
    }

    return stats, leaderboard
def create_example_files():
    """Build an example user profile and an example piclet record for testing.

    Returns:
        An ``(example_user, example_piclet)`` tuple of JSON-serializable
        dicts. The user has no discoveries and zeroed counters; the piclet
        has no canonical entry and no variations. All timestamp fields in
        both records hold the same ISO-8601 value.
    """
    # Read the clock once so joinedAt/lastSeen and created/lastUpdated are
    # exactly equal for a freshly created record.
    now = datetime.now().isoformat()

    # Example user profile.
    example_user = {
        "sub": "example_123456",
        "preferred_username": "example_user",
        "name": "Example User",
        "picture": None,
        "joinedAt": now,
        "lastSeen": now,
        "discoveries": [],
        "uniqueFinds": 0,
        "totalFinds": 0,
        "rarityScore": 0,
        "visibility": "public",
    }

    # Example piclet (empty canonical, ready for discoveries).
    example_piclet = {
        "canonical": None,
        "variations": [],
        "metadata": {
            "created": now,
            "lastUpdated": now,
        },
    }

    return example_user, example_piclet
# README uploaded to the dataset root. Kept at column 0 so the Markdown is
# written exactly as shown.
_README_CONTENT = """# Piclets Discovery Dataset
This dataset stores the canonical Piclets, variations, and user profiles for the Piclets Discovery game.
## Structure
```
metadata/
    stats.json # Global statistics
    leaderboard.json # Top discoverers and recent activity
users/
    {sub}.json # User profiles keyed by HuggingFace user ID
piclets/
    {normalized_name}.json # Canonical piclet + variations
```
## Authentication
All endpoints require HuggingFace OAuth tokens:
- Frontend sends `Authorization: Bearer <token>` headers
- Server verifies via `https://huggingface.co/oauth/userinfo`
- User profiles use stable `sub` field as primary key
## API
Server endpoint: `Fraser/piclets-server` (HuggingFace Space)
See server repository for full API documentation.
"""


def _write_json(path, payload):
    """Serialize *payload* to *path* as 2-space-indented UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


def upload_initial_structure():
    """Upload the initial dataset structure to HuggingFace.

    Stages ``metadata/stats.json``, ``metadata/leaderboard.json``,
    ``users/.gitkeep``, ``piclets/.gitkeep`` and ``README.md`` in a private
    temporary directory, then pushes the folder to ``DATASET_REPO`` as a
    single commit using ``HF_TOKEN``.

    Returns:
        True when the upload succeeded; False when ``HF_TOKEN`` is not set
        or any step raised (the error is printed, not re-raised).
    """
    import tempfile

    if not HF_TOKEN:
        print("ERROR: HF_TOKEN environment variable not set")
        print("Please set HF_TOKEN with write access to the dataset")
        return False

    print(f"Initializing dataset: {DATASET_REPO}")
    api = HfApi()

    try:
        # A unique, self-cleaning staging directory: a fixed name in the
        # working directory could pick up (and later delete) unrelated files
        # that already live there.
        with tempfile.TemporaryDirectory(prefix="piclets_dataset_init_") as staging:
            staging_dir = Path(staging)

            # 1. Create metadata directory and files
            metadata_dir = staging_dir / "metadata"
            metadata_dir.mkdir(exist_ok=True)
            stats, leaderboard = init_metadata_files()
            _write_json(metadata_dir / "stats.json", stats)
            print("✓ Created metadata/stats.json")
            _write_json(metadata_dir / "leaderboard.json", leaderboard)
            print("✓ Created metadata/leaderboard.json")

            # 2. Create users directory; the placeholder file keeps the
            #    otherwise empty directory in the repository.
            users_dir = staging_dir / "users"
            users_dir.mkdir(exist_ok=True)
            (users_dir / ".gitkeep").write_text(
                "# User profiles stored here\n", encoding="utf-8"
            )
            print("✓ Created users/ directory")

            # 3. Create piclets directory (same placeholder approach)
            piclets_dir = staging_dir / "piclets"
            piclets_dir.mkdir(exist_ok=True)
            (piclets_dir / ".gitkeep").write_text(
                "# Canonical piclets and variations stored here\n",
                encoding="utf-8",
            )
            print("✓ Created piclets/ directory")

            # 4. Create README
            (staging_dir / "README.md").write_text(
                _README_CONTENT, encoding="utf-8"
            )
            print("✓ Created README.md")

            # 5. Upload all files to dataset
            print(f"\nUploading to {DATASET_REPO}...")
            api.upload_folder(
                folder_path=str(staging_dir),
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message="Initialize dataset structure for Piclets Discovery",
            )

        print("\n✓ Dataset initialized successfully!")
        print(f"View at: https://huggingface.co/datasets/{DATASET_REPO}")
        return True
    except Exception as e:
        # Top-level boundary of the script: report the failure and signal it
        # through the return value so the caller can set the exit code.
        print(f"\n✗ Error initializing dataset: {e}")
        return False
def verify_dataset_structure():
    """Verify that the dataset structure exists.

    Downloads ``metadata/stats.json`` from ``DATASET_REPO`` using
    ``HF_TOKEN`` and reads its ``totalPiclets`` counter.

    Returns:
        True when the file was downloaded, parsed as JSON and contained
        ``totalPiclets``; False when ``HF_TOKEN`` is not set or any of those
        steps raised (the error is printed, not re-raised).
    """
    if not HF_TOKEN:
        print("ERROR: HF_TOKEN not set")
        return False

    print(f"Verifying dataset structure: {DATASET_REPO}")
    try:
        # Try to download the metadata file; a missing repo or file raises.
        stats_path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename="metadata/stats.json",
            repo_type="dataset",
            token=HF_TOKEN,
        )
        with open(stats_path, encoding="utf-8") as f:
            stats = json.load(f)
        print(f"✓ Dataset exists with {stats['totalPiclets']} piclets")
        print("✓ Structure verified")
        return True
    except Exception as e:
        # Top-level boundary of the script: download, parse and missing-key
        # failures are all reported as "not initialized".
        print(f"✗ Dataset not initialized or error: {e}")
        return False
if __name__ == "__main__":
    # Script entry point. With "--verify" as the first argument the dataset
    # is only checked; otherwise it is initialized. The exit status is 0 on
    # success and 1 on failure in both modes.
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == "--verify":
        # Verify mode
        sys.exit(0 if verify_dataset_structure() else 1)

    # Initialize mode
    print("=" * 60)
    print("Piclets Discovery Dataset Initialization")
    print("=" * 60)
    print()

    if not upload_initial_structure():
        sys.exit(1)

    print("\nNext steps:")
    # Point at the repository that was actually initialized, which may have
    # been overridden through the DATASET_REPO environment variable.
    print(f"1. Verify at https://huggingface.co/datasets/{DATASET_REPO}")
    print("2. Test with: python init_dataset.py --verify")
    print("3. Deploy piclets-server to HuggingFace Space")
    sys.exit(0)