#!/usr/bin/env python3
"""
Advanced helper script to download the int4 model files using HfFileSystem
"""

import os
import sys
import logging
from pathlib import Path

from tqdm import tqdm

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MAIN_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft"
INT4_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft/int4"
LOCAL_MODEL_PATH = "./int4"


def get_file_info(fs, repo_path):
    """Get detailed information about files in the repository"""
    try:
        files = fs.ls(repo_path, detail=True)
        return [f for f in files if f['type'] == 'file']
    except Exception as e:
        logger.error(f"Error listing files in {repo_path}: {e}")
        return []


def download_with_progress(fs, remote_path, local_path, file_size):
    """Download a file with a progress bar"""
    try:
        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        # Download with progress bar
        with tqdm(total=file_size, unit='B', unit_scale=True,
                  desc=os.path.basename(local_path)) as pbar:
            with fs.open(remote_path, 'rb') as remote_file:
                with open(local_path, 'wb') as local_file:
                    chunk_size = 8192
                    while True:
                        chunk = remote_file.read(chunk_size)
                        if not chunk:
                            break
                        local_file.write(chunk)
                        pbar.update(len(chunk))

        return True
    except Exception as e:
        logger.error(f"Error downloading {remote_path}: {e}")
        return False


def download_model_advanced():
    """Download the int4 model files using advanced HfFileSystem features"""
    try:
        logger.info(f"Downloading int4 model from {INT4_MODEL_ID}")

        # Create local directory if it doesn't exist
        os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)

        # Use HfFileSystem for downloading
        from huggingface_hub import HfFileSystem

        # Initialize the file system
        fs = HfFileSystem()

        # Check if the repository path exists
        if not fs.exists(INT4_MODEL_ID):
            logger.error(f"Repository path {INT4_MODEL_ID} does not exist")
            return False

        # Get file information
        files = get_file_info(fs, INT4_MODEL_ID)
        if not files:
            logger.error("No files found in repository")
            return False

        # Filter essential model files
        essential_files = [
            'config.json',
            'pytorch_model.bin',
            'tokenizer.json',
            'tokenizer_config.json',
            'special_tokens_map.json',
            'generation_config.json'
        ]

        files_to_download = []
        for file_info in files:
            file_name = os.path.basename(file_info['name'])
            if file_name in essential_files:
                files_to_download.append(file_info)

        logger.info(f"Found {len(files_to_download)} essential files to download")

        # Download each file
        successful_downloads = 0
        for file_info in files_to_download:
            file_path = file_info['name']
            file_name = os.path.basename(file_path)
            local_file_path = os.path.join(LOCAL_MODEL_PATH, file_name)
            file_size = file_info.get('size', 0)

            logger.info(f"Downloading {file_name} ({file_size} bytes)...")

            # Download the file with progress
            if download_with_progress(fs, file_path, local_file_path, file_size):
                successful_downloads += 1
                logger.info(f"Successfully downloaded {file_name}")
            else:
                logger.error(f"Failed to download {file_name}")

        logger.info(f"Downloaded {successful_downloads}/{len(files_to_download)} files")
        return successful_downloads == len(files_to_download)

    except Exception as e:
        logger.error(f"Error downloading model: {e}")
        return False


def verify_download_advanced():
    """Advanced verification of downloaded model files"""
    try:
        logger.info("Verifying downloaded model files...")

        # Expected file sizes: (min_size, max_size) in bytes (approximate)
        expected_files = {
            "config.json": (1000, 10000),
            "pytorch_model.bin": (1000000, 5000000000),  # Should be several MB or more
            "tokenizer.json": (10000, 1000000),          # Should be several KB
            "tokenizer_config.json": (100, 10000),
            "special_tokens_map.json": (100, 10000),
            "generation_config.json": (100, 10000)
        }

        verification_results = []
        for file_name, (min_size, max_size) in expected_files.items():
            file_path = os.path.join(LOCAL_MODEL_PATH, file_name)
            if os.path.exists(file_path):
                actual_size = os.path.getsize(file_path)
                if min_size <= actual_size <= max_size:
                    logger.info(f"✅ {file_name} verified ({actual_size} bytes)")
                    verification_results.append(True)
                else:
                    logger.warning(f"⚠️ {file_name} size unexpected ({actual_size} bytes)")
                    verification_results.append(False)
            else:
                logger.error(f"❌ Missing {file_name}")
                verification_results.append(False)

        success_rate = sum(verification_results) / len(verification_results)
        logger.info(
            f"Verification complete: {sum(verification_results)}/{len(verification_results)} files valid"
        )

        return success_rate >= 0.8  # Allow 20% tolerance
    except Exception as e:
        logger.error(f"Error verifying files: {e}")
        return False


def check_model_files():
    """Check if required model files exist"""
    required_files = [
        "config.json",
        "pytorch_model.bin",
        "tokenizer.json",
        "tokenizer_config.json"
    ]

    missing_files = []
    for file in required_files:
        file_path = os.path.join(LOCAL_MODEL_PATH, file)
        if not os.path.exists(file_path):
            missing_files.append(file)

    if missing_files:
        logger.error(f"Missing model files: {missing_files}")
        return False

    logger.info("All required model files found")
    return True


def main():
    """Main function to download the model at build time"""
    logger.info("Starting advanced model download for Hugging Face Space...")

    # Check if model files already exist
    if check_model_files():
        logger.info("Model files already exist, skipping download")
        return True

    # Download the model using the advanced method
    if download_model_advanced():
        # Verify the download
        if verify_download_advanced():
            logger.info("Model download and verification completed successfully")
            return True
        else:
            logger.error("Model verification failed")
            return False
    else:
        logger.error("Model download failed")
        return False


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)