File size: 4,857 Bytes
19b19f0
 
109031b
19b19f0
 
 
 
 
 
 
 
 
 
 
 
 
 
109031b
19b19f0
 
109031b
19b19f0
109031b
19b19f0
 
 
 
5a6251e
 
19b19f0
384c439
 
 
109031b
 
 
5a6251e
 
 
 
 
 
 
 
109031b
 
5a6251e
 
384c439
5a6251e
109031b
5a6251e
 
 
109031b
5a6251e
 
 
 
384c439
5a6251e
109031b
19b19f0
384c439
19b19f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python3
"""
Helper script to download the full fine-tuned model files at build time for Hugging Face Spaces
"""

import os
import sys
import subprocess
import logging
from pathlib import Path

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MAIN_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft"
LOCAL_MODEL_PATH = "./model"

def download_model():
    """Download the full fine-tuned model files to the local directory.

    Downloads each file in ``required_files`` from ``MAIN_MODEL_ID`` into
    ``LOCAL_MODEL_PATH``, skipping (with a warning) any file the repository
    does not contain.

    Returns:
        bool: True if the download loop completed without an exception
        (even if some optional files were missing), False on any error.
    """
    try:
        logger.info(f"Downloading full fine-tuned model from {MAIN_MODEL_ID}")
        
        # Create local directory if it doesn't exist
        os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)
        
        # Deferred import so a missing dependency surfaces as a logged
        # error from this function instead of failing at module import.
        from huggingface_hub import hf_hub_download, list_repo_files
        
        # List all files in the repository
        all_files = list_repo_files(MAIN_MODEL_ID)
        
        # BUG FIX: the original condition
        # `not "/" in f or f.startswith("int4/") == False`
        # kept every file outside the "int4/" subfolder, contradicting the
        # stated intent ("not in subfolders"). Top-level files are exactly
        # those whose path contains no "/".
        main_files = [f for f in all_files if "/" not in f]
        logger.info(f"Found {len(main_files)} files in main repository")
        
        # Download each required file
        required_files = [
            "config.json",
            "pytorch_model.bin", 
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "generation_config.json",
            "chat_template.jinja"
        ]
        
        downloaded_count = 0
        for file_name in required_files:
            if file_name in all_files:
                logger.info(f"Downloading {file_name}...")
                # NOTE: `local_dir_use_symlinks` was dropped — it is
                # deprecated and ignored in huggingface_hub >= 0.23;
                # passing `local_dir` already yields real files.
                hf_hub_download(
                    repo_id=MAIN_MODEL_ID,
                    filename=file_name,
                    local_dir=LOCAL_MODEL_PATH
                )
                logger.info(f"Downloaded {file_name}")
                downloaded_count += 1
            else:
                logger.warning(f"File {file_name} not found in main repository")
        
        logger.info(f"Downloaded {downloaded_count} out of {len(required_files)} required files")
        logger.info(f"Model downloaded successfully to {LOCAL_MODEL_PATH}")
        return True
        
    except Exception as e:
        logger.error(f"Error downloading model: {e}")
        return False

def check_model_files():
    """Report whether all required model files exist under LOCAL_MODEL_PATH.

    Returns:
        bool: True when every required file is present on disk,
        False (after logging the missing names) otherwise.
    """
    required_files = [
        "config.json",
        "pytorch_model.bin",
        "tokenizer.json",
        "tokenizer_config.json"
    ]
    
    # Collect the required files that are absent from the local directory.
    missing_files = [
        name for name in required_files
        if not os.path.exists(os.path.join(LOCAL_MODEL_PATH, name))
    ]
    
    if missing_files:
        logger.error(f"Missing model files: {missing_files}")
        return False
    
    logger.info("All required model files found")
    return True

def verify_model_integrity():
    """Sanity-check the downloaded files by loading tokenizer and config.

    Returns:
        bool: True when both the tokenizer and the model config load
        cleanly from LOCAL_MODEL_PATH, False on any failure.
    """
    try:
        # Deferred import: transformers is only needed when verification runs.
        from transformers import AutoConfig, AutoTokenizer
        
        # Loading the tokenizer exercises the tokenizer files on disk.
        AutoTokenizer.from_pretrained(LOCAL_MODEL_PATH)
        logger.info("Tokenizer loaded successfully from local files")
        
        # Loading the config exercises config.json.
        AutoConfig.from_pretrained(LOCAL_MODEL_PATH)
        logger.info("Model config loaded successfully from local files")
        
        return True
        
    except Exception as e:
        logger.error(f"Error verifying model integrity: {e}")
        return False

def main():
    """Ensure the model is present and valid, downloading it if necessary.

    Returns:
        bool: True when the model files exist and pass verification,
        False when download or verification fails.
    """
    logger.info("Starting model download for Hugging Face Space...")
    
    # Fast path: reuse files already on disk if they pass verification.
    if check_model_files():
        logger.info("Model files already exist, verifying integrity...")
        if verify_model_integrity():
            logger.info("Model files verified successfully")
            return True
        logger.warning("Model files exist but failed integrity check, re-downloading...")
    
    # Slow path: fetch the model from the Hub.
    if not download_model():
        logger.error("Model download failed")
        return False
    
    logger.info("Model download completed successfully")
    
    # Re-check presence and integrity of what was just downloaded.
    if check_model_files() and verify_model_integrity():
        logger.info("Model download and verification completed successfully")
        return True
    
    logger.error("Model download completed but verification failed")
    return False

if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)