#!/usr/bin/env python3
"""
Helper script to download the full fine-tuned model files at build time for Hugging Face Spaces
"""
import os
import sys
import subprocess
import logging
from pathlib import Path
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Model configuration
MAIN_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft"
LOCAL_MODEL_PATH = "./model"
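# The repo ID and target directory are hard-coded for this Space; point these at a
# different repository or path if the model or layout changes.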


def download_model():
    """Download the full fine-tuned model files to the local directory."""
    try:
        logger.info(f"Downloading full fine-tuned model from {MAIN_MODEL_ID}")

        # Create the local directory if it doesn't exist
        os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)

        # Use huggingface_hub to download the model files
        from huggingface_hub import hf_hub_download, list_repo_files

        # List all files in the repository
        all_files = list_repo_files(MAIN_MODEL_ID)

        # Skip the int4/ subfolder; everything else belongs to the main repository
        main_files = [f for f in all_files if not f.startswith("int4/")]
        logger.info(f"Found {len(main_files)} files in main repository")

        # Download each required file
        # NOTE: this list assumes the repo ships a single pytorch_model.bin at the
        # root (not sharded weights or *.safetensors); adjust it if the layout differs.
        required_files = [
            "config.json",
            "pytorch_model.bin",
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "generation_config.json",
            "chat_template.jinja",
        ]

        downloaded_count = 0
        for file_name in required_files:
            if file_name in all_files:
                logger.info(f"Downloading {file_name}...")
                hf_hub_download(
                    repo_id=MAIN_MODEL_ID,
                    filename=file_name,
                    local_dir=LOCAL_MODEL_PATH,
                    # Ignored by recent huggingface_hub releases; kept for older versions
                    local_dir_use_symlinks=False,
                )
                logger.info(f"Downloaded {file_name}")
                downloaded_count += 1
            else:
                logger.warning(f"File {file_name} not found in main repository")

        logger.info(f"Downloaded {downloaded_count} out of {len(required_files)} required files")
        logger.info(f"Model downloaded successfully to {LOCAL_MODEL_PATH}")
        return True
    except Exception as e:
        logger.error(f"Error downloading model: {e}")
        return False


def check_model_files():
    """Check if required model files exist"""
    required_files = [
        "config.json",
        "pytorch_model.bin",
        "tokenizer.json",
        "tokenizer_config.json",
    ]

    missing_files = []
    for file in required_files:
        file_path = os.path.join(LOCAL_MODEL_PATH, file)
        if not os.path.exists(file_path):
            missing_files.append(file)

    if missing_files:
        logger.error(f"Missing model files: {missing_files}")
        return False

    logger.info("All required model files found")
    return True


def verify_model_integrity():
    """Verify that the downloaded model files are valid"""
    try:
        # Try to load the tokenizer to verify it's working
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_PATH)
        logger.info("Tokenizer loaded successfully from local files")

        # Try to load the model config
        from transformers import AutoConfig
        config = AutoConfig.from_pretrained(LOCAL_MODEL_PATH)
        logger.info("Model config loaded successfully from local files")

        return True
    except Exception as e:
        logger.error(f"Error verifying model integrity: {e}")
        return False


def main():
    """Main function to download model at build time"""
    logger.info("Starting model download for Hugging Face Space...")

    # Check if model files already exist
    if check_model_files():
        logger.info("Model files already exist, verifying integrity...")
        if verify_model_integrity():
            logger.info("Model files verified successfully")
            return True
        else:
            logger.warning("Model files exist but failed integrity check, re-downloading...")

    # Download the model
    if download_model():
        logger.info("Model download completed successfully")

        # Verify the downloaded files
        if check_model_files() and verify_model_integrity():
            logger.info("Model download and verification completed successfully")
            return True
        else:
            logger.error("Model download completed but verification failed")
            return False
    else:
        logger.error("Model download failed")
        return False


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
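
# Example build-time invocation (a sketch; the actual Space build configuration is
# not part of this file):
#   RUN python download_model.py        # e.g. in a Docker Space's Dockerfile
# so the model files are cached locally before the app starts.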