Spaces:
Running
Running
#!/usr/bin/env python3
"""
Quick test for the training fix
"""
import os
import sys
import traceback
# Add project root to path so the project-local `config` and `src` packages
# imported by main() resolve when this file is run directly as a script.
project_root = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, project_root)
def main() -> int:
    """Smoke-test the H100 lightweight training setup.

    Sets three CUDA/PyTorch environment variables, then loads the
    lightweight config, builds a ``SmolLM3Model`` from it and asks the model
    for its training arguments, printing a status line after each step.

    Returns:
        0 when every step completes; 1 when any step raises. The error and
        its traceback are printed instead of being propagated.
    """
    print("🔧 Testing H100 Lightweight Training Fix")
    print("=" * 50)

    # Set environment variables to fix mixed precision issues
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
    os.environ["TORCH_USE_CUDA_DSA"] = "1"
    print("✅ Environment variables set")

    try:
        # Project imports stay inside the try so that a missing or broken
        # module is reported as a failed check (exit code 1), and so they
        # run only after the environment variables above are in place.
        from config.train_smollm3_h100_lightweight import SmolLM3ConfigH100Lightweight

        # Test configuration
        config = SmolLM3ConfigH100Lightweight()
        print(f"✅ Configuration loaded: fp16={config.fp16}, bf16={config.bf16}")

        # Test model loading. The original note says this does not load the
        # full model -- TODO confirm against SmolLM3Model's constructor.
        from src.model import SmolLM3Model

        # Create model instance
        model = SmolLM3Model(
            model_name="HuggingFaceTB/SmolLM3-3B",
            max_seq_length=4096,
            config=config,
        )
        print(f"✅ Model dtype: {model.torch_dtype}")
        print(f"✅ Model device map: {model.device_map}")

        # Test training arguments
        training_args = model.get_training_arguments("/tmp/test")
        print(f"✅ Training args: fp16={training_args.fp16}, bf16={training_args.bf16}")

        print("\n🎉 All tests passed!")
        print("You can now run the training with:")
        print(" ./launch.sh")
    except Exception as e:
        # Top-level boundary of a diagnostic script: report everything and
        # turn the failure into a non-zero exit status.
        print(f"❌ Error: {e}")
        traceback.print_exc()
        return 1

    return 0
if __name__ == "__main__":
    # sys.exit rather than the bare exit(): exit() is a helper injected by
    # the `site` module and is not available in every interpreter setup.
    sys.exit(main())