""" Initialize HuggingFace dataset structure for Piclets Discovery game This script sets up the required directory structure and initial files in the Fraser/piclets dataset repository. Usage: python init_dataset.py Requires: - HF_TOKEN environment variable or set in HuggingFace Space secrets - Write access to Fraser/piclets dataset """ import json import os from datetime import datetime from huggingface_hub import HfApi, hf_hub_download from pathlib import Path # Configuration DATASET_REPO = os.getenv("DATASET_REPO", "Fraser/piclets") HF_TOKEN = os.getenv("HF_TOKEN") def init_metadata_files(): """Initialize global metadata files""" # Initial stats stats = { "totalPiclets": 0, "totalVariations": 0, "totalScans": 0, "totalUsers": 0, "lastUpdated": datetime.now().isoformat() } # Initial leaderboard (empty) leaderboard = { "topDiscoverers": [], "recentActivity": [], "lastUpdated": datetime.now().isoformat() } return stats, leaderboard def create_example_files(): """Create example user and piclet files for testing""" # Example user profile example_user = { "sub": "example_123456", "preferred_username": "example_user", "name": "Example User", "picture": None, "joinedAt": datetime.now().isoformat(), "lastSeen": datetime.now().isoformat(), "discoveries": [], "uniqueFinds": 0, "totalFinds": 0, "rarityScore": 0, "visibility": "public" } # Example piclet (empty canonical, ready for discoveries) example_piclet = { "canonical": None, "variations": [], "metadata": { "created": datetime.now().isoformat(), "lastUpdated": datetime.now().isoformat() } } return example_user, example_piclet def upload_initial_structure(): """Upload initial dataset structure to HuggingFace""" if not HF_TOKEN: print("ERROR: HF_TOKEN environment variable not set") print("Please set HF_TOKEN with write access to the dataset") return False print(f"Initializing dataset: {DATASET_REPO}") api = HfApi() # Create temporary directory for files temp_dir = Path("temp_dataset_init") temp_dir.mkdir(exist_ok=True) try: # 1. Create metadata directory and files metadata_dir = temp_dir / "metadata" metadata_dir.mkdir(exist_ok=True) stats, leaderboard = init_metadata_files() with open(metadata_dir / "stats.json", "w") as f: json.dump(stats, f, indent=2) print("✓ Created metadata/stats.json") with open(metadata_dir / "leaderboard.json", "w") as f: json.dump(leaderboard, f, indent=2) print("✓ Created metadata/leaderboard.json") # 2. Create users directory with example users_dir = temp_dir / "users" users_dir.mkdir(exist_ok=True) example_user, _ = create_example_files() with open(users_dir / ".gitkeep", "w") as f: f.write("# User profiles stored here\n") print("✓ Created users/ directory") # 3. Create piclets directory with example piclets_dir = temp_dir / "piclets" piclets_dir.mkdir(exist_ok=True) with open(piclets_dir / ".gitkeep", "w") as f: f.write("# Canonical piclets and variations stored here\n") print("✓ Created piclets/ directory") # 4. Create README readme_content = """# Piclets Discovery Dataset This dataset stores the canonical Piclets, variations, and user profiles for the Piclets Discovery game. ## Structure ``` metadata/ stats.json # Global statistics leaderboard.json # Top discoverers and recent activity users/ {sub}.json # User profiles keyed by HuggingFace user ID piclets/ {normalized_name}.json # Canonical piclet + variations ``` ## Authentication All endpoints require HuggingFace OAuth tokens: - Frontend sends `Authorization: Bearer ` headers - Server verifies via `https://huggingface.co/oauth/userinfo` - User profiles use stable `sub` field as primary key ## API Server endpoint: `Fraser/piclets-server` (HuggingFace Space) See server repository for full API documentation. """ with open(temp_dir / "README.md", "w") as f: f.write(readme_content) print("✓ Created README.md") # 5. Upload all files to dataset print(f"\nUploading to {DATASET_REPO}...") api.upload_folder( folder_path=str(temp_dir), repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN, commit_message="Initialize dataset structure for Piclets Discovery" ) print(f"\n✓ Dataset initialized successfully!") print(f"View at: https://huggingface.co/datasets/{DATASET_REPO}") return True except Exception as e: print(f"\n✗ Error initializing dataset: {e}") return False finally: # Cleanup temp directory import shutil if temp_dir.exists(): shutil.rmtree(temp_dir) def verify_dataset_structure(): """Verify that dataset structure exists""" if not HF_TOKEN: print("ERROR: HF_TOKEN not set") return False print(f"Verifying dataset structure: {DATASET_REPO}") try: # Try to download metadata files stats_path = hf_hub_download( repo_id=DATASET_REPO, filename="metadata/stats.json", repo_type="dataset", token=HF_TOKEN ) with open(stats_path) as f: stats = json.load(f) print(f"✓ Dataset exists with {stats['totalPiclets']} piclets") print(f"✓ Structure verified") return True except Exception as e: print(f"✗ Dataset not initialized or error: {e}") return False if __name__ == "__main__": import sys if len(sys.argv) > 1 and sys.argv[1] == "--verify": # Verify mode if verify_dataset_structure(): sys.exit(0) else: sys.exit(1) else: # Initialize mode print("=" * 60) print("Piclets Discovery Dataset Initialization") print("=" * 60) print() if upload_initial_structure(): print("\nNext steps:") print("1. Verify at https://huggingface.co/datasets/Fraser/piclets") print("2. Test with: python init_dataset.py --verify") print("3. Deploy piclets-server to HuggingFace Space") sys.exit(0) else: sys.exit(1)