#!/usr/bin/env python3
"""
Gradio Interface for SmolLM3/GPT-OSS Fine-tuning Pipeline

This app mirrors the core flow of launch.sh with a click-and-run UI.

Tokens are read from environment variables:
- HF_WRITE_TOKEN (required)
- HF_READ_TOKEN (optional; used to switch the Trackio Space token after training)

Key steps (configurable via UI):
1) Optional HF Dataset repo setup for Trackio
2) Optional Trackio Space deployment
3) Training (SmolLM3 or GPT-OSS)
4) Push trained model to the HF Hub
5) Optional switch of the Trackio HF_TOKEN to the read token

This reuses the existing scripts in scripts/ and config/ to avoid code duplication.
"""

from __future__ import annotations

import os
import sys
import time
import json
import shlex
import traceback
import importlib.util
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, Generator, Optional, Tuple

# Third-party
try:
    import gradio as gr  # type: ignore
except Exception as _e:
    raise RuntimeError(
        "Gradio is required. Please install it first: pip install gradio"
    ) from _e

# --------------------------------------------------------------------------------------
# Utilities
# --------------------------------------------------------------------------------------

PROJECT_ROOT = Path(__file__).resolve().parent


def mask_token(token: Optional[str]) -> str:
    if not token:
        return "<not set>"
    token = str(token)
    if len(token) <= 8:
        return "*" * len(token)
    return f"{token[:4]}****{token[-4:]}"
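
# e.g. mask_token("hf_abcd1234efgh5678") -> "hf_a****5678" (illustrative value)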

def get_python() -> str:
    return sys.executable or "python"


def get_username_from_token(token: str) -> Optional[str]:
    try:
        from huggingface_hub import HfApi  # type: ignore

        api = HfApi(token=token)
        info = api.whoami()
        if isinstance(info, dict):
            return info.get("name") or info.get("username")
        if isinstance(info, str):
            return info
    except Exception:
        return None
    return None
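
# Note: for user access tokens, HfApi.whoami() returns a dict whose "name" key is the
# account name used below to build repo ids like "<name>/<repo>".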

def detect_nvidia_driver() -> Tuple[bool, str]:
    """Detect NVIDIA driver/GPU presence with multiple strategies.

    Returns (available, human_message).
    """
    # 1) Try torch CUDA
    try:
        import torch  # type: ignore

        if torch.cuda.is_available():
            try:
                num = torch.cuda.device_count()
                names = [torch.cuda.get_device_name(i) for i in range(num)]
                return True, f"NVIDIA GPU detected: {', '.join(names)}"
            except Exception:
                return True, "NVIDIA GPU detected (torch.cuda available)"
    except Exception:
        pass

    # 2) Try NVML via pynvml
    try:
        import pynvml  # type: ignore

        try:
            pynvml.nvmlInit()
            cnt = pynvml.nvmlDeviceGetCount()
            names = []
            for i in range(cnt):
                h = pynvml.nvmlDeviceGetHandleByIndex(i)
                names.append(pynvml.nvmlDeviceGetName(h).decode("utf-8", errors="ignore"))
            drv = pynvml.nvmlSystemGetDriverVersion().decode("utf-8", errors="ignore")
            pynvml.nvmlShutdown()
            if cnt > 0:
                return True, f"NVIDIA driver {drv}; GPUs: {', '.join(names)}"
        except Exception:
            pass
    except Exception:
        pass

    # 3) Try nvidia-smi
    try:
        import subprocess

        res = subprocess.run(["nvidia-smi", "-L"], capture_output=True, text=True, timeout=3)
        if res.returncode == 0 and res.stdout.strip():
            return True, res.stdout.strip().splitlines()[0]
    except Exception:
        pass

    return False, "No NVIDIA driver/GPU detected"
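
# Illustrative results: (True, "NVIDIA driver 550.54; GPUs: NVIDIA H100 PCIe") on a GPU
# box, or (False, "No NVIDIA driver/GPU detected") on CPU-only hardware.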

def duplicate_space_hint() -> str:
    space_id = os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID")
    if space_id:
        space_url = f"https://huggingface.co/spaces/{space_id}"
        dup_url = f"{space_url}?duplicate=true"
        return (
            f"ℹ️ No NVIDIA driver detected. If you're on Hugging Face Spaces, "
            f"please duplicate this Space to GPU hardware: [Duplicate this Space]({dup_url})."
        )
    return (
        "ℹ️ No NVIDIA driver detected. To enable training, run on a machine with an NVIDIA GPU/driver "
        "or duplicate this Space on Hugging Face with GPU hardware."
    )


def markdown_links_to_html(text: str) -> str:
    """Convert simple Markdown links [text](url) to HTML anchors for UI rendering."""
    try:
        return re.sub(
            r"\[([^\]]+)\]\(([^)]+)\)",
            r'<a href="\2" target="_blank" rel="noopener noreferrer">\1</a>',
            text,
        )
    except Exception:
        return text
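
# e.g. markdown_links_to_html("[Docs](https://hf.co)") ->
#      '<a href="https://hf.co" target="_blank" rel="noopener noreferrer">Docs</a>'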

def _write_generated_config(filename: str, content: str) -> Path:
    """Write a generated config under config/ and return the full path."""
    cfg_dir = PROJECT_ROOT / "config"
    cfg_dir.mkdir(parents=True, exist_ok=True)
    path = cfg_dir / filename
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
    return path
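
# e.g. _write_generated_config("_generated_demo.py", "config = None") writes
# config/_generated_demo.py under PROJECT_ROOT and returns its Path
# ("_generated_demo.py" is a hypothetical name; the generators below use fixed names).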

def generate_medical_o1_config_file(
    dataset_config: str,
    system_message: Optional[str],
    developer_message: Optional[str],
    num_train_epochs: float,
    batch_size: int,
    gradient_accumulation_steps: int,
    learning_rate: float,
    max_seq_length: int,
) -> Path:
    """Create a GPT-OSS Medical o1 SFT config file from user inputs."""
    # Sanitize quotes in messages
    def _q(s: Optional[str]) -> str:
        if s is None or s == "":
            return "None"
        return repr(s)

    # NOTE: braces in the dict literals below are doubled so the f-string emits them
    # verbatim into the generated file instead of treating them as replacement fields.
    py = f"""
from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig

config = GPTOSSEnhancedCustomConfig(
    dataset_name="FreedomIntelligence/medical-o1-reasoning-SFT",
    dataset_config={repr(dataset_config)},
    dataset_split="train",
    dataset_format="medical_o1_sft",
    # Field mapping and prefixes
    input_field="Question",
    target_field="Response",
    question_field="Question",
    reasoning_field="Complex_CoT",
    response_field="Response",
    reason_prefix="Reasoning: ",
    answer_prefix="Final Answer: ",
    # Optional context
    system_message={_q(system_message)},
    developer_message={_q(developer_message)},
    # Training hyperparameters
    num_train_epochs={num_train_epochs},
    batch_size={batch_size},
    gradient_accumulation_steps={gradient_accumulation_steps},
    learning_rate={learning_rate},
    min_lr=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.03,
    # Sequence length
    max_seq_length={max_seq_length},
    # Precision & performance
    fp16=False,
    bf16=True,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=2,
    group_by_length=True,
    remove_unused_columns=True,
    # LoRA & quantization
    use_lora=True,
    lora_config={{
        "r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.05,
        "target_modules": "all-linear",
        "target_parameters": [
            "7.mlp.experts.gate_up_proj",
            "7.mlp.experts.down_proj",
            "15.mlp.experts.gate_up_proj",
            "15.mlp.experts.down_proj",
            "23.mlp.experts.gate_up_proj",
            "23.mlp.experts.down_proj",
        ],
        "bias": "none",
        "task_type": "CAUSAL_LM",
    }},
    use_quantization=True,
    quantization_config={{
        "dequantize": True,
        "load_in_4bit": False,
    }},
    # Logging & evaluation
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)
"""
    return _write_generated_config("_generated_gpt_oss_medical_o1_sft.py", py)
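
# Hypothetical call (illustrative values):
#   generate_medical_o1_config_file(
#       dataset_config="en", system_message=None, developer_message=None,
#       num_train_epochs=1.0, batch_size=4, gradient_accumulation_steps=4,
#       learning_rate=2e-4, max_seq_length=2048,
#   )
# writes config/_generated_gpt_oss_medical_o1_sft.py exposing a ready-to-import `config`.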

def generate_gpt_oss_custom_config_file(
    dataset_name: str,
    dataset_split: str,
    dataset_format: str,
    input_field: str,
    target_field: Optional[str],
    system_message: Optional[str],
    developer_message: Optional[str],
    model_identity: Optional[str],
    max_samples: Optional[int],
    min_length: int,
    max_length: Optional[int],
    num_train_epochs: float,
    batch_size: int,
    gradient_accumulation_steps: int,
    learning_rate: float,
    min_lr: float,
    weight_decay: float,
    warmup_ratio: float,
    max_seq_length: int,
    lora_r: int,
    lora_alpha: int,
    lora_dropout: float,
    mixed_precision: str,  # "bf16" | "fp16" | "fp32"
    num_workers: int,
    quantization_type: str,  # "mxfp4" | "bnb4" | "none"
    max_grad_norm: float,
    logging_steps: int,
    eval_steps: int,
    save_steps: int,
) -> Path:
    # Precision flags
    if mixed_precision.lower() == "bf16":
        fp16_flag = False
        bf16_flag = True
    elif mixed_precision.lower() == "fp16":
        fp16_flag = True
        bf16_flag = False
    else:
        fp16_flag = False
        bf16_flag = False

    # Quantization flags/config
    if quantization_type == "mxfp4":
        use_quant = True
        quant_cfg = '{"dequantize": True, "load_in_4bit": False}'
    elif quantization_type == "bnb4":
        use_quant = True
        quant_cfg = '{"dequantize": False, "load_in_4bit": True, "bnb_4bit_compute_dtype": "bfloat16", "bnb_4bit_use_double_quant": True, "bnb_4bit_quant_type": "nf4"}'
    else:
        use_quant = False
        quant_cfg = '{"dequantize": False, "load_in_4bit": False}'

    def _q(s: Optional[str]) -> str:
        if s is None or s == "":
            return "None"
        return repr(s)

    py = f"""
from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig

config = GPTOSSEnhancedCustomConfig(
    # Dataset
    dataset_name={repr(dataset_name)},
    dataset_split={repr(dataset_split)},
    dataset_format={repr(dataset_format)},
    input_field={repr(input_field)},
    target_field={repr(target_field)} if {repr(target_field)} != 'None' else None,
    system_message={_q(system_message)},
    developer_message={_q(developer_message)},
    max_samples={repr(max_samples)} if {repr(max_samples)} != 'None' else None,
    min_length={min_length},
    max_length={repr(max_length)} if {repr(max_length)} != 'None' else None,
    # Training hyperparameters
    num_train_epochs={num_train_epochs},
    batch_size={batch_size},
    gradient_accumulation_steps={gradient_accumulation_steps},
    learning_rate={learning_rate},
    min_lr={min_lr},
    weight_decay={weight_decay},
    warmup_ratio={warmup_ratio},
    max_grad_norm={max_grad_norm},
    # Model
    max_seq_length={max_seq_length},
    # Precision
    fp16={str(fp16_flag)},
    bf16={str(bf16_flag)},
    # LoRA
    lora_config={{
        "r": {lora_r},
        "lora_alpha": {lora_alpha},
        "lora_dropout": {lora_dropout},
        "target_modules": "all-linear",
        "bias": "none",
        "task_type": "CAUSAL_LM",
    }},
    # Quantization
    use_quantization={str(use_quant)},
    quantization_config={quant_cfg},
    # Performance
    dataloader_num_workers={num_workers},
    dataloader_pin_memory=True,
    group_by_length=True,
    # Logging & eval
    logging_steps={logging_steps},
    eval_steps={eval_steps},
    save_steps={save_steps},
    # Chat template (Harmony)
    chat_template_kwargs={{
        "add_generation_prompt": True,
        "tokenize": False,
        "auto_insert_role": True,
        "reasoning_effort": "medium",
        "model_identity": {_q(model_identity) if _q(model_identity) != 'None' else repr('You are GPT-Tonic, a large language model trained by TonicAI.')},
        "builtin_tools": [],
    }},
)
"""
    return _write_generated_config("_generated_gpt_oss_custom.py", py)
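
# Hypothetical call (illustrative values) producing config/_generated_gpt_oss_custom.py:
#   generate_gpt_oss_custom_config_file(
#       dataset_name="legmlai/openhermes-fr", dataset_split="train",
#       dataset_format="openhermes_fr", input_field="prompt",
#       target_field="accepted_completion", system_message=None, developer_message=None,
#       model_identity=None, max_samples=None, min_length=10, max_length=None,
#       num_train_epochs=1.0, batch_size=4, gradient_accumulation_steps=4,
#       learning_rate=2e-4, min_lr=2e-5, weight_decay=0.01, warmup_ratio=0.03,
#       max_seq_length=2048, lora_r=16, lora_alpha=32, lora_dropout=0.05,
#       mixed_precision="bf16", num_workers=4, quantization_type="mxfp4",
#       max_grad_norm=1.0, logging_steps=10, eval_steps=100, save_steps=500,
#   )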

def generate_smollm3_custom_config_file(
    model_name: str,
    dataset_name: Optional[str],
    max_seq_length: int,
    batch_size: int,
    gradient_accumulation_steps: int,
    learning_rate: float,
    save_steps: int,
    eval_steps: int,
    logging_steps: int,
    filter_bad_entries: bool,
    input_field: str,
    target_field: str,
    sample_size: Optional[int],
    sample_seed: int,
    trainer_type: str,
) -> Path:
    # Create a subclass to include dataset fields, mirroring the other configs
    def _bool(b: bool) -> str:
        return "True" if b else "False"

    ds_section = """
    # HF Dataset configuration
    dataset_name={}
    dataset_split="train"
    input_field={}
    target_field={}
    filter_bad_entries={}
    bad_entry_field="bad_entry"
    sample_size={}
    sample_seed={}
""".format(
        repr(dataset_name) if dataset_name else "None",
        repr(input_field),
        repr(target_field),
        _bool(filter_bad_entries),
        repr(sample_size) if sample_size is not None else "None",
        sample_seed,
    )

    py = f"""
from dataclasses import dataclass
from typing import Optional

from config.train_smollm3 import SmolLM3Config


@dataclass
class SmolLM3GeneratedConfig(SmolLM3Config):
{ds_section}

config = SmolLM3GeneratedConfig(
    trainer_type={repr(trainer_type.lower())},
    model_name={repr(model_name)},
    max_seq_length={max_seq_length},
    use_flash_attention=True,
    use_gradient_checkpointing=True,
    batch_size={batch_size},
    gradient_accumulation_steps={gradient_accumulation_steps},
    learning_rate={learning_rate},
    weight_decay=0.01,
    warmup_steps=100,
    max_iters=None,
    eval_interval={eval_steps},
    log_interval={logging_steps},
    save_interval={save_steps},
    optimizer="adamw",
    beta1=0.9,
    beta2=0.95,
    eps=1e-8,
    scheduler="cosine",
    min_lr=1e-6,
    fp16=True,
    bf16=False,
    save_steps={save_steps},
    eval_steps={eval_steps},
    logging_steps={logging_steps},
    save_total_limit=3,
    eval_strategy="steps",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)
"""
    return _write_generated_config("_generated_smollm3_custom.py", py)
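
# Hypothetical call (illustrative values) producing config/_generated_smollm3_custom.py:
#   generate_smollm3_custom_config_file(
#       model_name="HuggingFaceTB/SmolLM3-3B", dataset_name="legmlai/openhermes-fr",
#       max_seq_length=4096, batch_size=2, gradient_accumulation_steps=8,
#       learning_rate=5e-6, save_steps=500, eval_steps=100, logging_steps=10,
#       filter_bad_entries=False, input_field="prompt", target_field="completion",
#       sample_size=None, sample_seed=42, trainer_type="sft",
#   )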

def generate_smollm3_long_context_config_file(
    model_name: str,
    dataset_name: Optional[str],
    input_field: str,
    target_field: str,
    filter_bad_entries: bool,
    sample_size: Optional[int],
    sample_seed: int,
    max_seq_length: int,
    batch_size: int,
    gradient_accumulation_steps: int,
    learning_rate: float,
    warmup_steps: int,
    max_iters: int,
    save_steps: int,
    eval_steps: int,
    logging_steps: int,
    use_chat_template: bool,
    no_think_system_message: bool,
    trainer_type: str,
) -> Path:
    """Create a SmolLM3 long-context config file with optional dataset fields."""
    def _bool(b: bool) -> str:
        return "True" if b else "False"

    ds_section = """
    # HF Dataset configuration
    dataset_name={}
    dataset_split="train"
    input_field={}
    target_field={}
    filter_bad_entries={}
    bad_entry_field="bad_entry"
    sample_size={}
    sample_seed={}
""".format(
        repr(dataset_name) if dataset_name else "None",
        repr(input_field),
        repr(target_field),
        _bool(filter_bad_entries),
        repr(sample_size) if sample_size is not None else "None",
        sample_seed,
    )

    py = f"""
from dataclasses import dataclass
from typing import Optional

from config.train_smollm3 import SmolLM3Config


@dataclass
class SmolLM3LongContextGeneratedConfig(SmolLM3Config):
{ds_section}

config = SmolLM3LongContextGeneratedConfig(
    trainer_type={repr(trainer_type.lower())},
    model_name={repr(model_name)},
    max_seq_length={max_seq_length},
    use_flash_attention=True,
    use_gradient_checkpointing=True,
    batch_size={batch_size},
    gradient_accumulation_steps={gradient_accumulation_steps},
    learning_rate={learning_rate},
    weight_decay=0.01,
    warmup_steps={warmup_steps},
    max_iters={max_iters},
    fp16=True,
    bf16=False,
    save_steps={save_steps},
    eval_steps={eval_steps},
    logging_steps={logging_steps},
    save_total_limit=3,
    eval_strategy="steps",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    use_chat_template={_bool(use_chat_template)},
    chat_template_kwargs={{
        "add_generation_prompt": True,
        "no_think_system_message": {_bool(no_think_system_message)}
    }}
)
"""
    return _write_generated_config("_generated_smollm3_long_context.py", py)

def ensure_dataset_repo(username: str, dataset_name: str, token: str) -> Tuple[str, bool, str]:
    """Create or ensure a dataset repo exists. Returns (repo_id, created_or_exists, message)."""
    from huggingface_hub import create_repo  # type: ignore

    repo_id = f"{username}/{dataset_name}"
    try:
        create_repo(repo_id=repo_id, repo_type="dataset", token=token, exist_ok=True, private=False)
        return repo_id, True, f"Dataset repo ready: {repo_id}"
    except Exception as e:
        return repo_id, False, f"Failed to create dataset repo {repo_id}: {e}"
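
# create_repo(..., exist_ok=True) makes this idempotent: re-running against an existing
# dataset repo succeeds instead of raising.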

def import_config_object(config_path: Path) -> Optional[Any]:
    """Import a config file and return its 'config' object if present, else None."""
    try:
        spec = importlib.util.spec_from_file_location("config_module", str(config_path))
        if not spec or not spec.loader:
            return None
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)  # type: ignore
        if hasattr(module, "config"):
            return getattr(module, "config")
        return None
    except Exception:
        return None

def run_command_stream(args: list[str], env: Dict[str, str], cwd: Optional[Path] = None) -> Generator[str, None, int]:
    """Run a command and yield stdout/stderr lines as they arrive. Returns the exit code at the end."""
    import subprocess

    yield f"$ {' '.join(shlex.quote(a) for a in ([get_python()] + args))}"
    process = subprocess.Popen(
        [get_python()] + args,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        env=env,
        cwd=str(cwd or PROJECT_ROOT),
        bufsize=1,
        universal_newlines=True,
    )
    assert process.stdout is not None
    for line in iter(process.stdout.readline, ""):
        yield line.rstrip()
    process.stdout.close()
    code = process.wait()
    yield f"[exit_code={code}]"
    return code
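
# Assumed usage sketch ("scripts/demo.py" is a placeholder path):
#   for line in run_command_stream(["scripts/demo.py", "--flag"], os.environ.copy()):
#       print(line)  # the final yielded line is always "[exit_code=<n>]"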

# --------------------------------------------------------------------------------------
# Configuration Mappings (mirror launch.sh)
# --------------------------------------------------------------------------------------

SMOL_CONFIGS = {
    "Basic Training": {
        "config_file": "config/train_smollm3.py",
        "default_model": "HuggingFaceTB/SmolLM3-3B",
    },
    "H100 Lightweight (Rapid)": {
        "config_file": "config/train_smollm3_h100_lightweight.py",
        "default_model": "HuggingFaceTB/SmolLM3-3B",
    },
    "A100 Large Scale": {
        "config_file": "config/train_smollm3_openhermes_fr_a100_large.py",
        "default_model": "HuggingFaceTB/SmolLM3-3B",
    },
    "Multiple Passes": {
        "config_file": "config/train_smollm3_openhermes_fr_a100_multiple_passes.py",
        "default_model": "HuggingFaceTB/SmolLM3-3B",
    },
}

GPT_OSS_CONFIGS = {
    "GPT-OSS Basic Training": {
        "config_file": "config/train_gpt_oss_basic.py",
        "default_model": "openai/gpt-oss-20b",
    },
    "GPT-OSS H100 Optimized": {
        "config_file": "config/train_gpt_oss_h100_optimized.py",
        "default_model": "openai/gpt-oss-20b",
    },
    "GPT-OSS Multilingual Reasoning": {
        "config_file": "config/train_gpt_oss_multilingual_reasoning.py",
        "default_model": "openai/gpt-oss-20b",
    },
    "GPT-OSS Memory Optimized": {
        "config_file": "config/train_gpt_oss_memory_optimized.py",
        "default_model": "openai/gpt-oss-20b",
    },
    "GPT-OSS OpenHermes-FR (Recommended)": {
        "config_file": "config/train_gpt_oss_openhermes_fr.py",
        "default_model": "openai/gpt-oss-20b",
    },
    "GPT-OSS OpenHermes-FR Memory Optimized": {
        "config_file": "config/train_gpt_oss_openhermes_fr_memory_optimized.py",
        "default_model": "openai/gpt-oss-20b",
    },
    # Custom dataset and medical SFT can be added later as advanced UI panels
}


def get_config_map(family: str) -> Dict[str, Dict[str, str]]:
    return SMOL_CONFIGS if family == "SmolLM3" else GPT_OSS_CONFIGS

# --------------------------------------------------------------------------------------
# Pipeline Orchestration
# --------------------------------------------------------------------------------------

@dataclass  # required: start_pipeline constructs PipelineInputs with keyword arguments
class PipelineInputs:
    model_family: str
    config_choice: str
    trainer_type: str  # "SFT" | "DPO"
    monitoring_mode: str  # "both" | "trackio" | "dataset" | "none"
    experiment_name: str
    repo_short: str
    author_name: str
    model_description: str
    trackio_space_name: Optional[str]
    deploy_trackio_space: bool
    create_dataset_repo: bool
    push_to_hub: bool
    switch_to_read_after: bool
    scheduler_override: Optional[str]
    min_lr: Optional[float]
    min_lr_rate: Optional[float]
    # Optional override config path generated from the Advanced tab
    override_config_path: Optional[str] = None

def make_defaults(model_family: str) -> Tuple[str, str]:
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    family_slug = "gpt-oss" if model_family == "GPT-OSS" else "smollm3"
    exp = f"smolfactory-{family_slug}_{ts}"
    repo_short = f"smolfactory-{datetime.now().strftime('%Y%m%d')}"
    return exp, repo_short
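
# e.g. make_defaults("GPT-OSS") ->
#   ("smolfactory-gpt-oss_20250102_120000", "smolfactory-20250102")  # timestamps illustrative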

def run_pipeline(params: PipelineInputs) -> Generator[str, None, None]:
    # Tokens from environment
    write_token = os.environ.get("HF_WRITE_TOKEN") or os.environ.get("HF_TOKEN")
    read_token = os.environ.get("HF_READ_TOKEN")
    if not write_token:
        yield "❌ HF_WRITE_TOKEN (or HF_TOKEN) is not set in the environment."
        return

    # Resolve username
    username = get_username_from_token(write_token) or os.environ.get("HF_USERNAME")
    if not username:
        yield "❌ Could not resolve Hugging Face username from token."
        return
    yield f"✅ Authenticated as: {username}"

    # Compute Trackio URL if applicable
    trackio_url: Optional[str] = None
    if params.monitoring_mode != "none" and params.trackio_space_name:
        trackio_url = f"https://huggingface.co/spaces/{username}/{params.trackio_space_name}"
        yield f"Trackio Space URL: {trackio_url}"

    # Decide the Space deploy token per monitoring mode
    space_deploy_token = write_token if params.monitoring_mode in ("both", "trackio") else (read_token or write_token)

    # Dataset repo setup
    dataset_repo = f"{username}/trackio-experiments"
    if params.create_dataset_repo and params.monitoring_mode != "none":
        yield f"Creating/ensuring dataset repo exists: {dataset_repo}"
        rid, ok, msg = ensure_dataset_repo(username, "trackio-experiments", write_token)
        yield ("✅ " if ok else "⚠️ ") + msg
        dataset_repo = rid

    # Resolve config file and model name (allow override from the Advanced tab)
    conf_map = get_config_map(params.model_family)
    if params.override_config_path:
        config_file = Path(params.override_config_path)
        if not config_file.exists():
            yield f"❌ Generated config file not found: {config_file}"
            return
        # Best effort to infer the base model from the generated config
        cfg_obj = import_config_object(config_file)
        base_model_fallback = getattr(cfg_obj, "model_name", None) or (
            conf_map.get(params.config_choice, {}).get("default_model", "")
        )
    else:
        if params.config_choice not in conf_map:
            yield f"❌ Unknown config choice: {params.config_choice}"
            return
        config_file = PROJECT_ROOT / conf_map[params.config_choice]["config_file"]
        base_model_fallback = conf_map[params.config_choice]["default_model"]
        if not config_file.exists():
            yield f"❌ Config file not found: {config_file}"
            return
        cfg_obj = import_config_object(config_file)

    base_model = getattr(cfg_obj, "model_name", base_model_fallback) if cfg_obj else base_model_fallback
    dataset_name = getattr(cfg_obj, "dataset_name", None) if cfg_obj else None
    batch_size = getattr(cfg_obj, "batch_size", None) if cfg_obj else None
    learning_rate = getattr(cfg_obj, "learning_rate", None) if cfg_obj else None
    max_seq_length = getattr(cfg_obj, "max_seq_length", None) if cfg_obj else None

    # Prepare env for subprocesses
    env = os.environ.copy()
    env["HF_TOKEN"] = write_token
    env["HUGGING_FACE_HUB_TOKEN"] = write_token
    env["HF_USERNAME"] = username
    env["TRACKIO_DATASET_REPO"] = dataset_repo
    env["MONITORING_MODE"] = params.monitoring_mode

    # Optional Trackio Space deployment
    if params.deploy_trackio_space and params.monitoring_mode != "none" and params.trackio_space_name:
        yield f"\n=== Deploying Trackio Space: {params.trackio_space_name} ==="
        # deploy_trackio_space.py expects: space_name, token, git_email, git_name, dataset_repo
        args = [
            str(PROJECT_ROOT / "scripts/trackio_tonic/deploy_trackio_space.py"),
            params.trackio_space_name,
            space_deploy_token,
            f"{username}@users.noreply.hf.co",
            username,
            dataset_repo,
        ]
        for line in run_command_stream(args, env, cwd=PROJECT_ROOT / "scripts/trackio_tonic"):
            yield line

    # Dataset setup and Trackio configuration (mirror launch.sh) when monitoring is enabled
    if params.monitoring_mode != "none":
        # Ensure HF Dataset structure
        yield f"\n=== Setting up HF Dataset: {dataset_repo} ==="
        ds_args = [
            str(PROJECT_ROOT / "scripts/dataset_tonic/setup_hf_dataset.py"),
            write_token,
        ]
        for line in run_command_stream(ds_args, env, cwd=PROJECT_ROOT / "scripts/dataset_tonic"):
            yield line

        # Configure Trackio Space
        yield f"\n=== Configuring Trackio Space ({params.trackio_space_name or 'N/A'}) ==="
        conf_args = [str(PROJECT_ROOT / "scripts/trackio_tonic/configure_trackio.py")]
        # Use the Space deploy token (READ for dataset-only; WRITE otherwise)
        conf_env = env.copy()
        conf_env["HF_TOKEN"] = space_deploy_token
        conf_env["HUGGING_FACE_HUB_TOKEN"] = space_deploy_token
        for line in run_command_stream(conf_args, conf_env, cwd=PROJECT_ROOT / "scripts/trackio_tonic"):
            yield line

    # Training output directory
    out_dir = PROJECT_ROOT / "outputs" / f"{params.experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    out_dir.mkdir(parents=True, exist_ok=True)
    yield f"\nOutput directory: {out_dir}"

    # Scheduler overrides (GPT-OSS only)
    if params.model_family == "GPT-OSS" and params.scheduler_override:
        env["GPT_OSS_SCHEDULER"] = params.scheduler_override
        if params.min_lr is not None:
            env["GPT_OSS_MIN_LR"] = str(params.min_lr)
        if params.min_lr_rate is not None:
            env["GPT_OSS_MIN_LR_RATE"] = str(params.min_lr_rate)

    # Start training
    yield f"\n=== Starting Training ({params.model_family}) ==="
    train_script = "scripts/training/train_gpt_oss.py" if params.model_family == "GPT-OSS" else "scripts/training/train.py"
    args = [
        str(PROJECT_ROOT / train_script),
        "--config", str(config_file),
        "--experiment-name", params.experiment_name,
        "--output-dir", str(out_dir),
        "--trackio-url", trackio_url or "",
        "--trainer-type", params.trainer_type.lower(),
    ]

    # Stream training logs
    train_failed = False
    for line in run_command_stream(args, env):
        yield line
        if line.strip().startswith("[exit_code=") and not line.strip().endswith("[exit_code=0]"):
            train_failed = True
    if train_failed:
        yield "❌ Training failed. Aborting remaining steps."
        return

    # Push to Hub
    if params.push_to_hub:
        yield "\n=== Pushing Model to Hugging Face Hub ==="
        repo_name = f"{username}/{params.repo_short}"
        push_script = (
            "scripts/model_tonic/push_gpt_oss_to_huggingface.py"
            if params.model_family == "GPT-OSS"
            else "scripts/model_tonic/push_to_huggingface.py"
        )
        push_args = [
            str(PROJECT_ROOT / push_script),
            str(out_dir),
            repo_name,
            "--token", write_token,
            "--trackio-url", trackio_url or "",
            "--experiment-name", params.experiment_name,
            "--dataset-repo", dataset_repo,
            "--author-name", params.author_name or username,
            "--model-description", params.model_description,
            "--training-config-type", params.config_choice,
            "--model-name", base_model,
        ]
        if dataset_name:
            push_args += ["--dataset-name", str(dataset_name)]
        if batch_size is not None:
            push_args += ["--batch-size", str(batch_size)]
        if learning_rate is not None:
            push_args += ["--learning-rate", str(learning_rate)]
        if max_seq_length is not None:
            push_args += ["--max-seq-length", str(max_seq_length)]
        push_args += ["--trainer-type", params.trainer_type]
        for line in run_command_stream(push_args, env):
            yield line

    # Switch the Space token to read-only (security)
    if params.switch_to_read_after and params.monitoring_mode in ("both", "trackio") and params.trackio_space_name and read_token:
        yield "\n=== Switching Trackio Space HF_TOKEN to READ token ==="
        space_id = f"{username}/{params.trackio_space_name}"
        sw_args = [
            str(PROJECT_ROOT / "scripts/trackio_tonic/switch_to_read_token.py"),
            space_id,
            read_token,
            write_token,
        ]
        for line in run_command_stream(sw_args, env, cwd=PROJECT_ROOT / "scripts/trackio_tonic"):
            yield line
    elif params.switch_to_read_after and not read_token:
        yield "⚠️ HF_READ_TOKEN not set; skipping token switch."

    # Final summary
    yield "\n🎉 Pipeline completed."
    if params.monitoring_mode != "none" and trackio_url:
        yield f"Trackio: {trackio_url}"
    yield f"Model repo (if pushed): https://huggingface.co/{username}/{params.repo_short}"
    yield f"Outputs: {out_dir}"

# --------------------------------------------------------------------------------------
# Gradio UI
# --------------------------------------------------------------------------------------

MODEL_FAMILIES = ["SmolLM3", "GPT-OSS"]
TRAINER_CHOICES = ["SFT", "DPO"]
MONITORING_CHOICES = ["both", "trackio", "dataset", "none"]
SCHEDULER_CHOICES = [None, "linear", "cosine", "cosine_with_min_lr", "constant"]


def ui_defaults(family: str) -> Tuple[str, str, str, str]:
    exp, repo_short = make_defaults(family)
    default_desc = (
        "A fine-tuned GPT-OSS-20B model optimized for multilingual reasoning and instruction following."
        if family == "GPT-OSS"
        else "A fine-tuned SmolLM3-3B model optimized for instruction following and French language tasks."
    )
    trackio_space_name = f"trackio-monitoring-{datetime.now().strftime('%Y%m%d')}"
    return exp, repo_short, default_desc, trackio_space_name

title_md = """
# 🙋🏻‍♂️ Welcome to 🌟Tonic's 🤏🏻🏭 SmolFactory!
"""

howto_md = """
### How to use

To get started: duplicate the Space, select a model family and a configuration, then click Run.
"""

joinus_md = """
### Join us:

🌟TeamTonic🌟 is always making cool demos! Join our active builders' 🛠️ community 👻 [](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface: [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to 🌟 [Build Tonic](https://git.tonic-ai.com/contribute). 🤗 Big thanks to Yuvi Sharma and all the folks at Hugging Face for the community grant 🤗
"""

# Load inline SVG to render before the Join Us section
try:
    _OUTPUT_SVG_HTML = (PROJECT_ROOT / "docs" / "output.svg").read_text(encoding="utf-8")
except Exception:
    _OUTPUT_SVG_HTML = ""

def on_family_change(family: str):
    """Update the UI when the model family changes.

    - Refresh available prebuilt configuration choices
    - Reset defaults (experiment name, repo short name, description, Space name)
    - Reveal the next step (trainer type)
    """
    confs = list(get_config_map(family).keys())
    exp, repo_short, desc, space = ui_defaults(family)
    # Initial dataset information placeholder until a specific config is chosen
    training_md = (
        f"Select a training configuration for {family} to see details (dataset, batch size, etc.)."
    )
    # Update objects:
    return (
        gr.update(choices=confs, value=(confs[0] if confs else None)),
        exp,
        repo_short,
        desc,
        space,
        training_md,
        gr.update(choices=[], value=None),
        gr.update(visible=True),   # show step 2 (trainer)
        gr.update(visible=True),   # show step 3 immediately (default monitoring 'dataset')
        gr.update(visible=True),   # show step 4 immediately so users see configs
        gr.update(visible=False),  # GPT-OSS advanced group hidden until enabled
        gr.update(visible=False),  # SmolLM3 advanced group hidden until enabled
    )

def on_config_change(family: str, config_choice: str):
    """When a prebuilt configuration is selected, update dataset info and helpful details.

    Also auto-fill the advanced fields with defaults from the selected config.
    """
    if not config_choice:
        return (
            "",
            gr.update(choices=[], value=None),
            # Advanced fields (GPT-OSS)
            "", "train", "openhermes_fr", "prompt", "accepted_completion", "", "", "",
            None, 10, None, 1.0, 4, 4, 2e-4, 2e-5, 0.01, 0.03,
            2048, 16, 32, 0.05, "bf16", 4, "mxfp4", 1.0, 10, 100, 500,
            # GPT-OSS Medical o1 SFT defaults
            "default", "", "", 1.0, 4, 4, 2e-4, 2048,
            # Advanced fields (SmolLM3)
            "HuggingFaceTB/SmolLM3-3B", None, "prompt", "completion", False, None, 42,
            4096, 2, 8, 5e-6, 500, 100, 10,
        )

    conf_map = get_config_map(family)
    cfg_path = PROJECT_ROOT / conf_map[config_choice]["config_file"]
    cfg_obj = import_config_object(cfg_path)
    dataset_name = getattr(cfg_obj, "dataset_name", None) if cfg_obj else None
    batch_size = getattr(cfg_obj, "batch_size", None) if cfg_obj else None
    learning_rate = getattr(cfg_obj, "learning_rate", None) if cfg_obj else None
    max_seq_length = getattr(cfg_obj, "max_seq_length", None) if cfg_obj else None
    base_model = conf_map[config_choice]["default_model"]

    md_lines = [
        f"**Configuration**: {config_choice}",
        f"**Base model**: {base_model}",
    ]
    if dataset_name:
        md_lines.append(f"**Dataset**: `{dataset_name}`")
    if batch_size is not None:
        md_lines.append(f"**Batch size**: {batch_size}")
    if learning_rate is not None:
        md_lines.append(f"**Learning rate**: {learning_rate}")
    if max_seq_length is not None:
        md_lines.append(f"**Max seq length**: {max_seq_length}")
    training_md = "\n".join(md_lines)

    # Dataset selection (allow custom values but prefill with the config's dataset if any)
    ds_choices = [dataset_name] if dataset_name else []

    # Defaults for Advanced (GPT-OSS)
    adv_dataset_name = dataset_name or ("HuggingFaceH4/Multilingual-Thinking" if family == "GPT-OSS" else "")
    adv_dataset_split = getattr(cfg_obj, "dataset_split", "train") if cfg_obj else "train"
    # Infer dataset_format heuristically
    if family == "GPT-OSS":
        adv_dataset_format = getattr(cfg_obj, "dataset_format", None) or (
            "messages" if getattr(cfg_obj, "input_field", "") == "messages" else "openhermes_fr"
        )
        adv_input_field = getattr(cfg_obj, "input_field", "prompt")
        adv_target_field = getattr(cfg_obj, "target_field", "accepted_completion") or ""
        adv_num_train_epochs = float(getattr(cfg_obj, "num_train_epochs", 1.0)) if cfg_obj and hasattr(cfg_obj, "num_train_epochs") else 1.0
        adv_batch_size = int(getattr(cfg_obj, "batch_size", 4) or 4)
        adv_gas = int(getattr(cfg_obj, "gradient_accumulation_steps", 4) or 4)
        adv_lr = float(getattr(cfg_obj, "learning_rate", 2e-4) or 2e-4)
        adv_min_lr = float(getattr(cfg_obj, "min_lr", 2e-5) or 2e-5)
        adv_wd = float(getattr(cfg_obj, "weight_decay", 0.01) or 0.01)
        adv_warmup = float(getattr(cfg_obj, "warmup_ratio", 0.03) or 0.03)
        adv_msl = int(getattr(cfg_obj, "max_seq_length", 2048) or 2048)
        lora_cfg = getattr(cfg_obj, "lora_config", {}) or {}
        adv_lora_r = int(lora_cfg.get("r", 16))
        adv_lora_alpha = int(lora_cfg.get("lora_alpha", 32))
        adv_lora_dropout = float(lora_cfg.get("lora_dropout", 0.05))
        adv_mixed_precision = "bf16" if getattr(cfg_obj, "bf16", True) else ("fp16" if getattr(cfg_obj, "fp16", False) else "fp32")
        adv_num_workers = int(getattr(cfg_obj, "dataloader_num_workers", 4) or 4)
        qcfg = getattr(cfg_obj, "quantization_config", {}) or {}
        if qcfg.get("load_in_4bit", False):
            adv_quantization_type = "bnb4"
        elif qcfg.get("dequantize", False):
            adv_quantization_type = "mxfp4"
        else:
            adv_quantization_type = "none"
        adv_mgn = float(getattr(cfg_obj, "max_grad_norm", 1.0) or 1.0)
        adv_log = int(getattr(cfg_obj, "logging_steps", 10) or 10)
        adv_eval = int(getattr(cfg_obj, "eval_steps", 100) or 100)
        adv_save = int(getattr(cfg_obj, "save_steps", 500) or 500)
    else:
        # SmolLM3 defaults for Advanced
        adv_dataset_format = "openhermes_fr"
        adv_input_field = getattr(cfg_obj, "input_field", "prompt") if cfg_obj else "prompt"
        adv_target_field = getattr(cfg_obj, "target_field", "completion") if cfg_obj else "completion"
        adv_num_train_epochs = 1.0
        adv_batch_size = int(getattr(cfg_obj, "batch_size", 2) or 2)
        adv_gas = int(getattr(cfg_obj, "gradient_accumulation_steps", 8) or 8)
        adv_lr = float(getattr(cfg_obj, "learning_rate", 5e-6) or 5e-6)
        adv_min_lr = float(getattr(cfg_obj, "min_lr", 1e-6) or 1e-6)
        adv_wd = float(getattr(cfg_obj, "weight_decay", 0.01) or 0.01)
        adv_warmup = float(getattr(cfg_obj, "warmup_steps", 100) or 100)  # SmolLM3 uses steps
        adv_msl = int(getattr(cfg_obj, "max_seq_length", 4096) or 4096)
        adv_lora_r = 16
        adv_lora_alpha = 32
        adv_lora_dropout = 0.05
        adv_mixed_precision = "fp16" if getattr(cfg_obj, "fp16", True) else ("bf16" if getattr(cfg_obj, "bf16", False) else "fp32")
        adv_num_workers = int(getattr(cfg_obj, "dataloader_num_workers", 4) or 4)
        adv_quantization_type = "none"
        adv_mgn = float(getattr(cfg_obj, "max_grad_norm", 1.0) or 1.0)
        adv_log = int(getattr(cfg_obj, "logging_steps", 10) or 10)
        adv_eval = int(getattr(cfg_obj, "eval_steps", 100) or 100)
        adv_save = int(getattr(cfg_obj, "save_steps", 500) or 500)

    # SmolLM3 advanced model/dataset
    adv_sm_model_name = getattr(cfg_obj, "model_name", "HuggingFaceTB/SmolLM3-3B") if cfg_obj else "HuggingFaceTB/SmolLM3-3B"
    adv_sm_dataset_name = dataset_name if family == "SmolLM3" else None
    adv_sm_input_field = adv_input_field
    adv_sm_target_field = adv_target_field
    adv_sm_filter_bad = bool(getattr(cfg_obj, "filter_bad_entries", False)) if cfg_obj else False
    adv_sm_sample_size = getattr(cfg_obj, "sample_size", None)
    adv_sm_sample_seed = getattr(cfg_obj, "sample_seed", 42)

    return (
        training_md,
        gr.update(choices=ds_choices, value=(dataset_name or None)),
        # Advanced (GPT-OSS)
        adv_dataset_name,
        adv_dataset_split,
        adv_dataset_format,
        adv_input_field,
        adv_target_field,
        getattr(cfg_obj, "system_message", None) if cfg_obj else "",
        getattr(cfg_obj, "developer_message", None) if cfg_obj else "",
        getattr(cfg_obj, "chat_template_kwargs", {}).get("model_identity") if cfg_obj and getattr(cfg_obj, "chat_template_kwargs", None) else "",
        getattr(cfg_obj, "max_samples", None) if cfg_obj else None,
        int(getattr(cfg_obj, "min_length", 10) or 10) if cfg_obj else 10,
        getattr(cfg_obj, "max_length", None) if cfg_obj else None,
        adv_num_train_epochs,
        adv_batch_size,
        adv_gas,
        adv_lr,
        adv_min_lr,
        adv_wd,
        adv_warmup,
        adv_msl,
        adv_lora_r,
        adv_lora_alpha,
        adv_lora_dropout,
        adv_mixed_precision,
        adv_num_workers,
        adv_quantization_type,
        adv_mgn,
        adv_log,
        adv_eval,
        adv_save,
        # GPT-OSS Medical o1 SFT defaults
        "default",
        "",
        "",
        1.0,
        4,
        4,
        2e-4,
        2048,
        # Advanced (SmolLM3)
        adv_sm_model_name,
        adv_sm_dataset_name,
        adv_sm_input_field,
        adv_sm_target_field,
        adv_sm_filter_bad,
        adv_sm_sample_size,
        adv_sm_sample_seed,
        # SmolLM3 training overrides
        int(getattr(cfg_obj, "max_seq_length", 4096) or 4096) if family == "SmolLM3" else 4096,
        int(getattr(cfg_obj, "batch_size", 2) or 2) if family == "SmolLM3" else 2,
        int(getattr(cfg_obj, "gradient_accumulation_steps", 8) or 8) if family == "SmolLM3" else 8,
        float(getattr(cfg_obj, "learning_rate", 5e-6) or 5e-6) if family == "SmolLM3" else 5e-6,
        int(getattr(cfg_obj, "save_steps", 500) or 500) if family == "SmolLM3" else 500,
        int(getattr(cfg_obj, "eval_steps", 100) or 100) if family == "SmolLM3" else 100,
        int(getattr(cfg_obj, "logging_steps", 10) or 10) if family == "SmolLM3" else 10,
    )

def on_trainer_selected(_: str):
    """Reveal the monitoring step once a trainer type is chosen."""
    return gr.update(visible=True)


def on_monitoring_change(mode: str):
    """Reveal the configuration/details step and adjust Trackio-related visibility by mode."""
    show_trackio = mode in ("both", "trackio")
    show_dataset_repo = mode != "none"
    return (
        gr.update(visible=True),
        gr.update(visible=show_trackio),       # Trackio Space name
        gr.update(visible=show_trackio),       # deploy Trackio Space
        gr.update(visible=show_dataset_repo),  # create dataset repo
    )

def start_pipeline(
    model_family: str,
    config_choice: str,
    trainer_type: str,
    monitoring_mode: str,
    experiment_name: str,
    repo_short: str,
    author_name: str,
    model_description: str,
    trackio_space_name: str,
    deploy_trackio_space: bool,
    create_dataset_repo: bool,
    push_to_hub: bool,
    switch_to_read_after: bool,
    scheduler_override: Optional[str],
    min_lr: Optional[float],
    min_lr_rate: Optional[float],
) -> Generator[str, None, None]:
    try:
        params = PipelineInputs(
            model_family=model_family,
            config_choice=config_choice,
            trainer_type=trainer_type,
            monitoring_mode=monitoring_mode,
            experiment_name=experiment_name,
            repo_short=repo_short,
            author_name=author_name,
            model_description=model_description,
            trackio_space_name=trackio_space_name or None,
            deploy_trackio_space=deploy_trackio_space,
            create_dataset_repo=create_dataset_repo,
            push_to_hub=push_to_hub,
            switch_to_read_after=switch_to_read_after,
            scheduler_override=(scheduler_override or None),
            min_lr=min_lr,
            min_lr_rate=min_lr_rate,
        )
        # Show token presence
        write_token = os.environ.get("HF_WRITE_TOKEN") or os.environ.get("HF_TOKEN")
        read_token = os.environ.get("HF_READ_TOKEN")
        yield f"HF_WRITE_TOKEN: {mask_token(write_token)}"
        yield f"HF_READ_TOKEN: {mask_token(read_token)}"
        # Run the orchestrated pipeline
        for line in run_pipeline(params):
            yield line
            # Small delay for smoother streaming
            time.sleep(0.01)
    except Exception as e:
        yield f"❌ Error: {e}"
        tb = traceback.format_exc(limit=2)
        yield tb
| with gr.Blocks(title="SmolLM3 / GPT-OSS Fine-tuning Pipeline") as demo: | |
| # GPU/driver detection banner | |
| has_gpu, gpu_msg = detect_nvidia_driver() | |
| if has_gpu: | |
| gr.HTML( | |
| f""" | |
| <div style="background-color: rgba(59, 130, 246, 0.1); border: 1px solid rgba(59, 130, 246, 0.3); border-radius: 8px; padding: 12px; margin-bottom: 16px; text-align: center;"> | |
| <p style="color: rgb(59, 130, 246); margin: 0; font-size: 14px; font-weight: 600;"> | |
| ✅ NVIDIA GPU ready — {gpu_msg} | |
| </p> | |
| <p style="color: rgb(59, 130, 246); margin: 6px 0 0; font-size: 12px;"> | |
| Reads tokens from environment: <code>HF_WRITE_TOKEN</code> (required), <code>HF_READ_TOKEN</code> (optional) | |
| </p> | |
| <p style="color: rgb(59, 130, 246); margin: 4px 0 0; font-size: 12px;"> | |
| Select a config and run training; optionally deploy Trackio and push to Hub | |
| </p> | |
| </div> | |
| """ | |
| ) | |
| gr.Markdown(title_md) | |
| gr.Markdown(howto_md) | |
| if _OUTPUT_SVG_HTML: | |
| gr.HTML(_OUTPUT_SVG_HTML) | |
| gr.Markdown(joinus_md) | |
| else: | |
| hint_html = markdown_links_to_html(duplicate_space_hint()) | |
| gr.HTML( | |
| f""" | |
| <div style="background-color: rgba(245, 158, 11, 0.1); border: 1px solid rgba(245, 158, 11, 0.3); border-radius: 8px; padding: 12px; margin-bottom: 16px; text-align: center;"> | |
| <p style="color: rgb(234, 88, 12); margin: 0; font-size: 14px; font-weight: 600;"> | |
| ⚠️ No NVIDIA GPU/driver detected — training requires a GPU runtime | |
| </p> | |
| <p style="color: rgb(234, 88, 12); margin: 6px 0 0; font-size: 12px;"> | |
| {hint_html} | |
| </p> | |
| <p style="color: rgb(234, 88, 12); margin: 4px 0 0; font-size: 12px;"> | |
| Reads tokens from environment: <code>HF_WRITE_TOKEN</code> (required), <code>HF_READ_TOKEN</code> (optional) | |
| </p> | |
| <p style="color: rgb(234, 88, 12); margin: 4px 0 0; font-size: 12px;"> | |
| You can still configure and push, but training requires a GPU runtime. | |
| </p> | |
| </div> | |
| """ | |
| ) | |
| gr.Markdown(title_md) | |
| gr.Markdown(howto_md) | |
| if _OUTPUT_SVG_HTML: | |
| gr.HTML(_OUTPUT_SVG_HTML) | |
| gr.Markdown(joinus_md) | |
| # --- Progressive interface -------------------------------------------------------- | |
| gr.Markdown("### Configure your run in simple steps") | |
| # Step 1: Model family | |
| with gr.Group(): | |
| model_family = gr.Dropdown(choices=MODEL_FAMILIES, value="SmolLM3", label="1) Model family") | |
| # Step 2: Trainer (revealed after family) | |
| step2_group = gr.Group(visible=False) | |
| with step2_group: | |
| trainer_type = gr.Radio(choices=TRAINER_CHOICES, value="SFT", label="2) Trainer type") | |
| # Step 3: Monitoring (revealed after trainer) | |
| step3_group = gr.Group(visible=False) | |
| with step3_group: | |
| monitoring_mode = gr.Dropdown(choices=MONITORING_CHOICES, value="dataset", label="3) Monitoring mode") | |
| # Step 4: Config & details (revealed after monitoring) | |
| step4_group = gr.Group(visible=False) | |
| with step4_group: | |
| # Defaults based on initial family selection | |
| exp_default, repo_default, desc_default, trackio_space_default = ui_defaults("SmolLM3") | |
| config_choice = gr.Dropdown( | |
| choices=list(get_config_map("SmolLM3").keys()), | |
| value="Basic Training", | |
| label="4) Training configuration", | |
| ) | |
| with gr.Tabs(): | |
| with gr.Tab("Overview"): | |
| training_info = gr.Markdown("Select a training configuration to see details.") | |
| dataset_choice = gr.Dropdown( | |
| choices=[], | |
| value=None, | |
| allow_custom_value=True, | |
| label="Dataset (from config; optional)", | |
| ) | |
| with gr.Row(): | |
| experiment_name = gr.Textbox(value=exp_default, label="Experiment name") | |
| repo_short = gr.Textbox(value=repo_default, label="Model repo (short name)") | |
| with gr.Row(): | |
| author_name = gr.Textbox(value=os.environ.get("HF_USERNAME", ""), label="Author name") | |
| model_description = gr.Textbox(value=desc_default, label="Model description") | |
| trackio_space_name = gr.Textbox( | |
| value=trackio_space_default, | |
| label="Trackio Space name (used when monitoring != none)", | |
| visible=False, | |
| ) | |
| deploy_trackio_space = gr.Checkbox(value=True, label="Deploy Trackio Space", visible=False) | |
| create_dataset_repo = gr.Checkbox(value=True, label="Create/ensure HF Dataset repo", visible=True) | |
| with gr.Row(): | |
| push_to_hub = gr.Checkbox(value=True, label="Push model to Hugging Face Hub") | |
| switch_to_read_after = gr.Checkbox(value=True, label="Switch Space token to READ after training") | |
| with gr.Tab("Advanced"): | |
| # GPT-OSS specific scheduler overrides | |
| advanced_enabled = gr.Checkbox(value=False, label="Use advanced overrides (generate config)") | |
| # Family-specific advanced groups | |
| gpt_oss_advanced_group = gr.Group(visible=False) | |
| with gpt_oss_advanced_group: | |
| gr.Markdown("Advanced configuration for GPT-OSS") | |
| adv_gpt_mode = gr.Radio( | |
| choices=["custom", "medical_o1_sft"], | |
| value="custom", | |
| label="Advanced mode", | |
| ) | |
| # --- GPT-OSS Custom advanced controls --- | |
| gpt_oss_custom_group = gr.Group(visible=True) | |
| with gpt_oss_custom_group: | |
| with gr.Accordion("Dataset", open=True): | |
| adv_dataset_name = gr.Textbox(value="", label="Dataset name") | |
| with gr.Row(): | |
| adv_dataset_split = gr.Textbox(value="train", label="Dataset split") | |
| adv_dataset_format = gr.Dropdown( | |
| choices=["openhermes_fr", "messages", "text"], | |
| value="openhermes_fr", | |
| label="Dataset format", | |
| ) | |
| with gr.Row(): | |
| adv_input_field = gr.Textbox(value="prompt", label="Input field") | |
| adv_target_field = gr.Textbox(value="accepted_completion", label="Target field (optional)") | |
| with gr.Row(): | |
| adv_system_message = gr.Textbox(value="", label="System message (optional)") | |
| adv_developer_message = gr.Textbox(value="", label="Developer message (optional)") | |
| adv_model_identity = gr.Textbox(value="", label="Model identity (optional)") | |
| with gr.Row(): | |
| adv_max_samples = gr.Number(value=None, precision=0, label="Max samples (optional)") | |
| adv_min_length = gr.Number(value=10, precision=0, label="Min length") | |
| adv_max_length = gr.Number(value=None, precision=0, label="Max length (optional)") | |
| with gr.Accordion("Training", open=True): | |
| with gr.Row(): | |
| adv_num_train_epochs = gr.Number(value=1.0, precision=2, label="Epochs") | |
| adv_batch_size = gr.Number(value=4, precision=0, label="Batch size") | |
| adv_gradient_accumulation_steps = gr.Number(value=4, precision=0, label="Grad accumulation") | |
| with gr.Row(): | |
| adv_learning_rate = gr.Number(value=2e-4, precision=6, label="Learning rate") | |
| adv_min_lr_num = gr.Number(value=2e-5, precision=6, label="Min LR") | |
| adv_weight_decay = gr.Number(value=0.01, precision=6, label="Weight decay") | |
| adv_warmup_ratio = gr.Number(value=0.03, precision=3, label="Warmup ratio") | |
| adv_max_seq_length = gr.Number(value=2048, precision=0, label="Max seq length") | |
| with gr.Accordion("LoRA & Quantization", open=False): | |
| with gr.Row(): | |
| adv_lora_r = gr.Number(value=16, precision=0, label="LoRA r") | |
| adv_lora_alpha = gr.Number(value=32, precision=0, label="LoRA alpha") | |
| adv_lora_dropout = gr.Number(value=0.05, precision=3, label="LoRA dropout") | |
| with gr.Row(): | |
| adv_mixed_precision = gr.Dropdown(choices=["bf16", "fp16", "fp32"], value="bf16", label="Mixed precision") | |
| adv_num_workers = gr.Number(value=4, precision=0, label="Data workers") | |
| adv_quantization_type = gr.Dropdown(choices=["mxfp4", "bnb4", "none"], value="mxfp4", label="Quantization") | |
| adv_max_grad_norm = gr.Number(value=1.0, precision=3, label="Max grad norm") | |
| with gr.Accordion("Eval & Logging", open=False): | |
| with gr.Row(): | |
| adv_logging_steps = gr.Number(value=10, precision=0, label="Logging steps") | |
| adv_eval_steps = gr.Number(value=100, precision=0, label="Eval steps") | |
| adv_save_steps = gr.Number(value=500, precision=0, label="Save steps") | |
| with gr.Accordion("Scheduler (GPT-OSS only)", open=False): | |
| scheduler_override = gr.Dropdown( | |
| choices=[c for c in SCHEDULER_CHOICES if c is not None], | |
| value=None, | |
| allow_custom_value=True, | |
| label="Scheduler override", | |
| ) | |
| with gr.Row(): | |
| min_lr = gr.Number(value=None, precision=6, label="min_lr (cosine_with_min_lr)") | |
| min_lr_rate = gr.Number(value=None, precision=6, label="min_lr_rate (cosine_with_min_lr)") | |
| # --- GPT-OSS Medical o1 SFT controls --- | |
| gpt_oss_medical_group = gr.Group(visible=False) | |
| with gpt_oss_medical_group: | |
| gr.Markdown("Build a Medical o1 SFT configuration (dataset fixed to FreedomIntelligence/medical-o1-reasoning-SFT)") | |
| with gr.Accordion("Dataset", open=True): | |
| adv_med_dataset_config = gr.Textbox(value="default", label="Dataset config (subset)") | |
| with gr.Accordion("Context (optional)", open=False): | |
| with gr.Row(): | |
| adv_med_system_message = gr.Textbox(value="", label="System message") | |
| adv_med_developer_message = gr.Textbox(value="", label="Developer message") | |
| with gr.Accordion("Training", open=True): | |
| with gr.Row(): | |
| adv_med_num_train_epochs = gr.Number(value=1.0, precision=2, label="Epochs") | |
| adv_med_batch_size = gr.Number(value=4, precision=0, label="Batch size") | |
| adv_med_gradient_accumulation_steps = gr.Number(value=4, precision=0, label="Grad accumulation") | |
| with gr.Row(): | |
| adv_med_learning_rate = gr.Number(value=2e-4, precision=6, label="Learning rate") | |
| adv_med_max_seq_length = gr.Number(value=2048, precision=0, label="Max seq length") | |
| smollm3_advanced_group = gr.Group(visible=False) | |
| with smollm3_advanced_group: | |
| gr.Markdown("Advanced configuration for SmolLM3") | |
| adv_sm_mode = gr.Radio( | |
| choices=["custom", "long_context"], | |
| value="custom", | |
| label="Advanced mode", | |
| ) | |
| # --- SmolLM3 Custom --- | |
| sm_custom_group = gr.Group(visible=True) | |
| with sm_custom_group: | |
| with gr.Accordion("Dataset", open=True): | |
| adv_sm_model_name = gr.Textbox(value="HuggingFaceTB/SmolLM3-3B", label="Model name") | |
| adv_sm_dataset_name = gr.Textbox(value="", label="Dataset name (optional)") | |
| with gr.Row(): | |
| adv_sm_input_field = gr.Textbox(value="prompt", label="Input field") | |
| adv_sm_target_field = gr.Textbox(value="completion", label="Target field") | |
| with gr.Row(): | |
| adv_sm_filter_bad_entries = gr.Checkbox(value=False, label="Filter bad entries") | |
| adv_sm_sample_size = gr.Number(value=None, precision=0, label="Sample size (optional)") | |
| adv_sm_sample_seed = gr.Number(value=42, precision=0, label="Sample seed") | |
| with gr.Accordion("Training", open=True): | |
| with gr.Row(): | |
| adv_sm_max_seq_length = gr.Number(value=4096, precision=0, label="Max seq length") | |
| adv_sm_batch_size = gr.Number(value=2, precision=0, label="Batch size") | |
| adv_sm_gas = gr.Number(value=8, precision=0, label="Grad accumulation") | |
| adv_sm_learning_rate = gr.Number(value=5e-6, precision=6, label="Learning rate") | |
| with gr.Row(): | |
| adv_sm_save_steps = gr.Number(value=500, precision=0, label="Save steps") | |
| adv_sm_eval_steps = gr.Number(value=100, precision=0, label="Eval steps") | |
| adv_sm_logging_steps = gr.Number(value=10, precision=0, label="Logging steps") | |
| # --- SmolLM3 Long-Context --- | |
| sm_long_group = gr.Group(visible=False) | |
| with sm_long_group: | |
| gr.Markdown("Generate a Long-Context SmolLM3 config") | |
| with gr.Accordion("Dataset", open=True): | |
| adv_sm_lc_model_name = gr.Textbox(value="HuggingFaceTB/SmolLM3-3B", label="Model name") | |
| adv_sm_lc_dataset_name = gr.Textbox(value="", label="Dataset name (optional)") | |
| with gr.Row(): | |
| adv_sm_lc_input_field = gr.Textbox(value="prompt", label="Input field") | |
| adv_sm_lc_target_field = gr.Textbox(value="completion", label="Target field") | |
| with gr.Row(): | |
| adv_sm_lc_filter_bad_entries = gr.Checkbox(value=False, label="Filter bad entries") | |
| adv_sm_lc_sample_size = gr.Number(value=None, precision=0, label="Sample size (optional)") | |
| adv_sm_lc_sample_seed = gr.Number(value=42, precision=0, label="Sample seed") | |
| with gr.Accordion("Training", open=True): | |
| with gr.Row(): | |
| adv_sm_lc_max_seq_length = gr.Number(value=131072, precision=0, label="Max seq length (up to 131072)") | |
| adv_sm_lc_batch_size = gr.Number(value=1, precision=0, label="Batch size") | |
| adv_sm_lc_gas = gr.Number(value=8, precision=0, label="Grad accumulation") | |
| adv_sm_lc_learning_rate = gr.Number(value=1e-5, precision=6, label="Learning rate") | |
| with gr.Row(): | |
| adv_sm_lc_warmup_steps = gr.Number(value=200, precision=0, label="Warmup steps") | |
| adv_sm_lc_max_iters = gr.Number(value=500, precision=0, label="Max iters") | |
| with gr.Row(): | |
| adv_sm_lc_save_steps = gr.Number(value=100, precision=0, label="Save steps") | |
| adv_sm_lc_eval_steps = gr.Number(value=50, precision=0, label="Eval steps") | |
| adv_sm_lc_logging_steps = gr.Number(value=10, precision=0, label="Logging steps") | |
| with gr.Accordion("Chat Template", open=False): | |
| with gr.Row(): | |
| adv_sm_lc_use_chat_template = gr.Checkbox(value=True, label="Use chat template") | |
| adv_sm_lc_no_think_system_message = gr.Checkbox(value=True, label="No-think system message") | |
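| # Toggle between SmolLM3 Custom and Long-Context modes | |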
| def _toggle_sm_mode(mode: str): | |
| return ( | |
| gr.update(visible=mode == "custom"), | |
| gr.update(visible=mode == "long_context"), | |
| ) | |
| adv_sm_mode.change( | |
| _toggle_sm_mode, | |
| inputs=[adv_sm_mode], | |
| outputs=[sm_custom_group, sm_long_group], | |
| ) | |
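| # Show/hide the per-family advanced groups when the Advanced toggle or model family changes | |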
| def _toggle_advanced(enable: bool, family_val: str): | |
| return ( | |
| gr.update(visible=enable and family_val == "GPT-OSS"), | |
| gr.update(visible=enable and family_val == "SmolLM3"), | |
| ) | |
| advanced_enabled.change( | |
| _toggle_advanced, | |
| inputs=[advanced_enabled, model_family], | |
| outputs=[gpt_oss_advanced_group, smollm3_advanced_group], | |
| ) | |
| # Toggle between GPT-OSS Custom and Medical modes | |
| def _toggle_gpt_oss_mode(mode: str): | |
| return ( | |
| gr.update(visible=mode == "custom"), | |
| gr.update(visible=mode == "medical_o1_sft"), | |
| ) | |
| adv_gpt_mode.change( | |
| _toggle_gpt_oss_mode, | |
| inputs=[adv_gpt_mode], | |
| outputs=[gpt_oss_custom_group, gpt_oss_medical_group], | |
| ) | |
| # Final action & logs | |
| start_btn = gr.Button("Run Pipeline", variant="primary") | |
| logs = gr.Textbox(value="", label="Logs", lines=20) | |
| # --- Events --------------------------------------------------------------------- | |
| model_family.change( | |
| on_family_change, | |
| inputs=model_family, | |
| outputs=[ | |
| config_choice, | |
| experiment_name, | |
| repo_short, | |
| model_description, | |
| trackio_space_name, | |
| training_info, | |
| dataset_choice, | |
| step2_group, | |
| step3_group, | |
| step4_group, | |
| gpt_oss_advanced_group, # show advanced for GPT-OSS | |
| smollm3_advanced_group, # show advanced for SmolLM3 | |
| ], | |
| ) | |
| trainer_type.change(on_trainer_selected, inputs=trainer_type, outputs=step3_group) | |
| monitoring_mode.change( | |
| on_monitoring_change, | |
| inputs=monitoring_mode, | |
| outputs=[step4_group, trackio_space_name, deploy_trackio_space, create_dataset_repo], | |
| ) | |
| config_choice.change( | |
| on_config_change, | |
| inputs=[model_family, config_choice], | |
| outputs=[ | |
| training_info, | |
| dataset_choice, | |
| # Advanced (GPT-OSS) outputs | |
| adv_dataset_name, | |
| adv_dataset_split, | |
| adv_dataset_format, | |
| adv_input_field, | |
| adv_target_field, | |
| adv_system_message, | |
| adv_developer_message, | |
| adv_model_identity, | |
| adv_max_samples, | |
| adv_min_length, | |
| adv_max_length, | |
| adv_num_train_epochs, | |
| adv_batch_size, | |
| adv_gradient_accumulation_steps, | |
| adv_learning_rate, | |
| adv_min_lr_num, | |
| adv_weight_decay, | |
| adv_warmup_ratio, | |
| adv_max_seq_length, | |
| adv_lora_r, | |
| adv_lora_alpha, | |
| adv_lora_dropout, | |
| adv_mixed_precision, | |
| adv_num_workers, | |
| adv_quantization_type, | |
| adv_max_grad_norm, | |
| adv_logging_steps, | |
| adv_eval_steps, | |
| adv_save_steps, | |
| # GPT-OSS Medical o1 SFT outputs (prefill defaults) | |
| adv_med_dataset_config, | |
| adv_med_system_message, | |
| adv_med_developer_message, | |
| adv_med_num_train_epochs, | |
| adv_med_batch_size, | |
| adv_med_gradient_accumulation_steps, | |
| adv_med_learning_rate, | |
| adv_med_max_seq_length, | |
| # Advanced (SmolLM3) | |
| adv_sm_model_name, | |
| adv_sm_dataset_name, | |
| adv_sm_input_field, | |
| adv_sm_target_field, | |
| adv_sm_filter_bad_entries, | |
| adv_sm_sample_size, | |
| adv_sm_sample_seed, | |
| adv_sm_max_seq_length, | |
| adv_sm_batch_size, | |
| adv_sm_gas, | |
| adv_sm_learning_rate, | |
| adv_sm_save_steps, | |
| adv_sm_eval_steps, | |
| adv_sm_logging_steps, | |
| ], | |
| ) | |
| # Keep Advanced dataset fields in sync when user selects a different dataset | |
| def _sync_dataset_fields(ds_value: Optional[str]): | |
| ds_text = ds_value or "" | |
| return ds_text, ds_text | |
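| # e.g. _sync_dataset_fields("user/my-dataset") -> ("user/my-dataset", "user/my-dataset") | |
| # ("user/my-dataset" is a hypothetical repo id, shown for illustration only) | |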
| dataset_choice.change( | |
| _sync_dataset_fields, | |
| inputs=[dataset_choice], | |
| outputs=[adv_dataset_name, adv_sm_dataset_name], | |
| ) | |
| def _start_with_overrides( | |
| model_family_v, | |
| config_choice_v, | |
| trainer_type_v, | |
| monitoring_mode_v, | |
| experiment_name_v, | |
| repo_short_v, | |
| author_name_v, | |
| model_description_v, | |
| trackio_space_name_v, | |
| deploy_trackio_space_v, | |
| create_dataset_repo_v, | |
| push_to_hub_v, | |
| switch_to_read_after_v, | |
| scheduler_override_v, | |
| min_lr_v, | |
| min_lr_rate_v, | |
| advanced_enabled_v, | |
| adv_gpt_mode_v, | |
| # GPT-OSS advanced | |
| adv_dataset_name_v, | |
| adv_dataset_split_v, | |
| adv_dataset_format_v, | |
| adv_input_field_v, | |
| adv_target_field_v, | |
| adv_system_message_v, | |
| adv_developer_message_v, | |
| adv_model_identity_v, | |
| adv_max_samples_v, | |
| adv_min_length_v, | |
| adv_max_length_v, | |
| adv_num_train_epochs_v, | |
| adv_batch_size_v, | |
| adv_gas_v, | |
| adv_lr_v, | |
| adv_min_lr_num_v, | |
| adv_wd_v, | |
| adv_warmup_ratio_v, | |
| adv_max_seq_length_v, | |
| adv_lora_r_v, | |
| adv_lora_alpha_v, | |
| adv_lora_dropout_v, | |
| adv_mixed_precision_v, | |
| adv_num_workers_v, | |
| adv_quantization_type_v, | |
| adv_max_grad_norm_v, | |
| adv_logging_steps_v, | |
| adv_eval_steps_v, | |
| adv_save_steps_v, | |
| # GPT-OSS Medical o1 SFT | |
| adv_med_dataset_config_v, | |
| adv_med_system_message_v, | |
| adv_med_developer_message_v, | |
| adv_med_num_train_epochs_v, | |
| adv_med_batch_size_v, | |
| adv_med_gradient_accumulation_steps_v, | |
| adv_med_learning_rate_v, | |
| adv_med_max_seq_length_v, | |
| # SmolLM3 advanced | |
| adv_sm_mode_v, | |
| adv_sm_model_name_v, | |
| adv_sm_dataset_name_v, | |
| adv_sm_input_field_v, | |
| adv_sm_target_field_v, | |
| adv_sm_filter_bad_entries_v, | |
| adv_sm_sample_size_v, | |
| adv_sm_sample_seed_v, | |
| adv_sm_max_seq_length_v, | |
| adv_sm_batch_size_v, | |
| adv_sm_gas_v, | |
| adv_sm_learning_rate_v, | |
| adv_sm_save_steps_v, | |
| adv_sm_eval_steps_v, | |
| adv_sm_logging_steps_v, | |
| # SmolLM3 long context | |
| adv_sm_lc_model_name_v, | |
| adv_sm_lc_dataset_name_v, | |
| adv_sm_lc_input_field_v, | |
| adv_sm_lc_target_field_v, | |
| adv_sm_lc_filter_bad_entries_v, | |
| adv_sm_lc_sample_size_v, | |
| adv_sm_lc_sample_seed_v, | |
| adv_sm_lc_max_seq_length_v, | |
| adv_sm_lc_batch_size_v, | |
| adv_sm_lc_gas_v, | |
| adv_sm_lc_learning_rate_v, | |
| adv_sm_lc_warmup_steps_v, | |
| adv_sm_lc_max_iters_v, | |
| adv_sm_lc_save_steps_v, | |
| adv_sm_lc_eval_steps_v, | |
| adv_sm_lc_logging_steps_v, | |
| adv_sm_lc_use_chat_template_v, | |
| adv_sm_lc_no_think_system_message_v, | |
| ): | |
| # If advanced overrides enabled, generate a config file and pass its path | |
| override_path: Optional[str] = None | |
| if advanced_enabled_v: | |
| try: | |
| if model_family_v == "GPT-OSS": | |
| if str(adv_gpt_mode_v) == "medical_o1_sft": | |
| cfg_path = generate_medical_o1_config_file( | |
| dataset_config=str(adv_med_dataset_config_v or "default"), | |
| system_message=(str(adv_med_system_message_v) if adv_med_system_message_v else None), | |
| developer_message=(str(adv_med_developer_message_v) if adv_med_developer_message_v else None), | |
| num_train_epochs=float(adv_med_num_train_epochs_v or 1.0), | |
| batch_size=int(adv_med_batch_size_v or 4), | |
| gradient_accumulation_steps=int(adv_med_gradient_accumulation_steps_v or 4), | |
| learning_rate=float(adv_med_learning_rate_v or 2e-4), | |
| max_seq_length=int(adv_med_max_seq_length_v or 2048), | |
| ) | |
| else: | |
| cfg_path = generate_gpt_oss_custom_config_file( | |
| dataset_name=str(adv_dataset_name_v or ""), | |
| dataset_split=str(adv_dataset_split_v or "train"), | |
| dataset_format=str(adv_dataset_format_v or "openhermes_fr"), | |
| input_field=str(adv_input_field_v or "prompt"), | |
| target_field=(str(adv_target_field_v) if adv_target_field_v else None), | |
| system_message=(str(adv_system_message_v) if adv_system_message_v else None), | |
| developer_message=(str(adv_developer_message_v) if adv_developer_message_v else None), | |
| model_identity=(str(adv_model_identity_v) if adv_model_identity_v else None), | |
| max_samples=(int(adv_max_samples_v) if adv_max_samples_v else None), | |
| min_length=int(adv_min_length_v or 10), | |
| max_length=(int(adv_max_length_v) if adv_max_length_v else None), | |
| num_train_epochs=float(adv_num_train_epochs_v or 1.0), | |
| batch_size=int(adv_batch_size_v or 4), | |
| gradient_accumulation_steps=int(adv_gas_v or 4), | |
| learning_rate=float(adv_lr_v or 2e-4), | |
| min_lr=float(adv_min_lr_num_v or 2e-5), | |
| weight_decay=float(adv_wd_v or 0.01), | |
| warmup_ratio=float(adv_warmup_ratio_v or 0.03), | |
| max_seq_length=int(adv_max_seq_length_v or 2048), | |
| lora_r=int(adv_lora_r_v or 16), | |
| lora_alpha=int(adv_lora_alpha_v or 32), | |
| lora_dropout=float(adv_lora_dropout_v or 0.05), | |
| mixed_precision=str(adv_mixed_precision_v or "bf16"), | |
| num_workers=int(adv_num_workers_v or 4), | |
| quantization_type=str(adv_quantization_type_v or "mxfp4"), | |
| max_grad_norm=float(adv_max_grad_norm_v or 1.0), | |
| logging_steps=int(adv_logging_steps_v or 10), | |
| eval_steps=int(adv_eval_steps_v or 100), | |
| save_steps=int(adv_save_steps_v or 500), | |
| ) | |
| else: | |
| if str(adv_sm_mode_v) == "long_context": | |
| cfg_path = generate_smollm3_long_context_config_file( | |
| model_name=str(adv_sm_lc_model_name_v or "HuggingFaceTB/SmolLM3-3B"), | |
| dataset_name=(str(adv_sm_lc_dataset_name_v) if adv_sm_lc_dataset_name_v else None), | |
| input_field=str(adv_sm_lc_input_field_v or "prompt"), | |
| target_field=str(adv_sm_lc_target_field_v or "completion"), | |
| filter_bad_entries=bool(adv_sm_lc_filter_bad_entries_v), | |
| sample_size=(int(adv_sm_lc_sample_size_v) if adv_sm_lc_sample_size_v else None), | |
| sample_seed=int(adv_sm_lc_sample_seed_v or 42), | |
| max_seq_length=int(adv_sm_lc_max_seq_length_v or 131072), | |
| batch_size=int(adv_sm_lc_batch_size_v or 1), | |
| gradient_accumulation_steps=int(adv_sm_lc_gas_v or 8), | |
| learning_rate=float(adv_sm_lc_learning_rate_v or 1e-5), | |
| warmup_steps=int(adv_sm_lc_warmup_steps_v or 200), | |
| max_iters=int(adv_sm_lc_max_iters_v or 500), | |
| save_steps=int(adv_sm_lc_save_steps_v or 100), | |
| eval_steps=int(adv_sm_lc_eval_steps_v or 50), | |
| logging_steps=int(adv_sm_lc_logging_steps_v or 10), | |
| use_chat_template=bool(adv_sm_lc_use_chat_template_v), | |
| no_think_system_message=bool(adv_sm_lc_no_think_system_message_v), | |
| trainer_type=str(trainer_type_v).lower(), | |
| ) | |
| else: | |
| cfg_path = generate_smollm3_custom_config_file( | |
| model_name=str(adv_sm_model_name_v or "HuggingFaceTB/SmolLM3-3B"), | |
| dataset_name=(str(adv_sm_dataset_name_v) if adv_sm_dataset_name_v else None), | |
| max_seq_length=int(adv_sm_max_seq_length_v or 4096), | |
| batch_size=int(adv_sm_batch_size_v or 2), | |
| gradient_accumulation_steps=int(adv_sm_gas_v or 8), | |
| learning_rate=float(adv_sm_learning_rate_v or 5e-6), | |
| save_steps=int(adv_sm_save_steps_v or 500), | |
| eval_steps=int(adv_sm_eval_steps_v or 100), | |
| logging_steps=int(adv_sm_logging_steps_v or 10), | |
| filter_bad_entries=bool(adv_sm_filter_bad_entries_v), | |
| input_field=str(adv_sm_input_field_v or "prompt"), | |
| target_field=str(adv_sm_target_field_v or "completion"), | |
| sample_size=(int(adv_sm_sample_size_v) if adv_sm_sample_size_v else None), | |
| sample_seed=int(adv_sm_sample_seed_v or 42), | |
| trainer_type=str(trainer_type_v).lower(), | |
| ) | |
| override_path = str(cfg_path) | |
| except Exception as e: | |
| # Surface the error in the streamed logs and stop early. | |
| yield f"❌ Failed to generate advanced config: {e}" | |
| return | |
| def _gen(): | |
| params = PipelineInputs( | |
| model_family=model_family_v, | |
| config_choice=config_choice_v, | |
| trainer_type=trainer_type_v, | |
| monitoring_mode=monitoring_mode_v, | |
| experiment_name=experiment_name_v, | |
| repo_short=repo_short_v, | |
| author_name=author_name_v, | |
| model_description=model_description_v, | |
| trackio_space_name=trackio_space_name_v or None, | |
| deploy_trackio_space=bool(deploy_trackio_space_v), | |
| create_dataset_repo=bool(create_dataset_repo_v), | |
| push_to_hub=bool(push_to_hub_v), | |
| switch_to_read_after=bool(switch_to_read_after_v), | |
| scheduler_override=(scheduler_override_v or None), | |
| min_lr=min_lr_v, | |
| min_lr_rate=min_lr_rate_v, | |
| override_config_path=override_path, | |
| ) | |
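| # Resolve tokens from the environment; HF_TOKEN serves as a fallback write token. | |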
| write_token = os.environ.get("HF_WRITE_TOKEN") or os.environ.get("HF_TOKEN") | |
| read_token = os.environ.get("HF_READ_TOKEN") | |
| yield f"HF_WRITE_TOKEN: {mask_token(write_token)}" | |
| yield f"HF_READ_TOKEN: {mask_token(read_token)}" | |
| for line in run_pipeline(params): | |
| yield line | |
| time.sleep(0.01) | |
| # Delegate to the inner generator so this handler is itself a generator | |
| # function and Gradio streams each log line as it is yielded. | |
| yield from _gen() | |
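| # Wire the primary action; the handler's yields stream into the logs textbox. | |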
| start_btn.click( | |
| _start_with_overrides, | |
| inputs=[ | |
| model_family, | |
| config_choice, | |
| trainer_type, | |
| monitoring_mode, | |
| experiment_name, | |
| repo_short, | |
| author_name, | |
| model_description, | |
| trackio_space_name, | |
| deploy_trackio_space, | |
| create_dataset_repo, | |
| push_to_hub, | |
| switch_to_read_after, | |
| scheduler_override, | |
| min_lr, | |
| min_lr_rate, | |
| advanced_enabled, | |
| adv_gpt_mode, | |
| # GPT-OSS advanced | |
| adv_dataset_name, | |
| adv_dataset_split, | |
| adv_dataset_format, | |
| adv_input_field, | |
| adv_target_field, | |
| adv_system_message, | |
| adv_developer_message, | |
| adv_model_identity, | |
| adv_max_samples, | |
| adv_min_length, | |
| adv_max_length, | |
| adv_num_train_epochs, | |
| adv_batch_size, | |
| adv_gradient_accumulation_steps, | |
| adv_learning_rate, | |
| adv_min_lr_num, | |
| adv_weight_decay, | |
| adv_warmup_ratio, | |
| adv_max_seq_length, | |
| adv_lora_r, | |
| adv_lora_alpha, | |
| adv_lora_dropout, | |
| adv_mixed_precision, | |
| adv_num_workers, | |
| adv_quantization_type, | |
| adv_max_grad_norm, | |
| adv_logging_steps, | |
| adv_eval_steps, | |
| adv_save_steps, | |
| # GPT-OSS Medical o1 SFT | |
| adv_med_dataset_config, | |
| adv_med_system_message, | |
| adv_med_developer_message, | |
| adv_med_num_train_epochs, | |
| adv_med_batch_size, | |
| adv_med_gradient_accumulation_steps, | |
| adv_med_learning_rate, | |
| adv_med_max_seq_length, | |
| # SmolLM3 advanced | |
| adv_sm_mode, | |
| adv_sm_model_name, | |
| adv_sm_dataset_name, | |
| adv_sm_input_field, | |
| adv_sm_target_field, | |
| adv_sm_filter_bad_entries, | |
| adv_sm_sample_size, | |
| adv_sm_sample_seed, | |
| adv_sm_max_seq_length, | |
| adv_sm_batch_size, | |
| adv_sm_gas, | |
| adv_sm_learning_rate, | |
| adv_sm_save_steps, | |
| adv_sm_eval_steps, | |
| adv_sm_logging_steps, | |
| # SmolLM3 long context | |
| adv_sm_lc_model_name, | |
| adv_sm_lc_dataset_name, | |
| adv_sm_lc_input_field, | |
| adv_sm_lc_target_field, | |
| adv_sm_lc_filter_bad_entries, | |
| adv_sm_lc_sample_size, | |
| adv_sm_lc_sample_seed, | |
| adv_sm_lc_max_seq_length, | |
| adv_sm_lc_batch_size, | |
| adv_sm_lc_gas, | |
| adv_sm_lc_learning_rate, | |
| adv_sm_lc_warmup_steps, | |
| adv_sm_lc_max_iters, | |
| adv_sm_lc_save_steps, | |
| adv_sm_lc_eval_steps, | |
| adv_sm_lc_logging_steps, | |
| adv_sm_lc_use_chat_template, | |
| adv_sm_lc_no_think_system_message, | |
| ], | |
| outputs=[logs], | |
| ) | |
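| # NOTE: the `inputs` order above must match the positional parameters of | |
| # _start_with_overrides exactly; Gradio maps components to arguments by position. | |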
| if __name__ == "__main__": | |
| # Optional: allow setting server parameters via env | |
| server_port = int(os.environ.get("INTERFACE_PORT", "7860")) | |
| server_name = os.environ.get("INTERFACE_HOST", "0.0.0.0") | |
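| # Example (hypothetical values): INTERFACE_HOST=127.0.0.1 INTERFACE_PORT=7861 python app.py | |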
| demo.queue().launch(server_name=server_name, server_port=server_port, mcp_server=True) | |