# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "torch>=2.0.0",
#     "transformers>=4.50.0",
#     "datasets>=2.14.0",
#     "trl>=0.12.0",
#     "peft>=0.7.0",
#     "accelerate>=0.25.0",
#     "bitsandbytes>=0.41.0",
#     "trackio",
#     "huggingface_hub",
# ]
# ///
"""
LoRA Fine-tuning Script: Add Tool Calling to Synthia-S1-27b

This script fine-tunes Tesslate/Synthia-S1-27b with LoRA using the
nvidia/Nemotron-Agentic-v1 tool_calling dataset.

Usage:
    # With uv (recommended)
    uv run train_tool_calling.py

    # Or with pip
    pip install torch transformers datasets trl peft accelerate bitsandbytes trackio
    python train_tool_calling.py

Hardware Requirements:
    - Minimum: 1x A100 80GB or 2x A10G 24GB
    - Recommended: 1x A100 80GB for fastest training
"""

import os
import json

from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
import torch
import trackio
from huggingface_hub import hf_hub_download, HfApi, create_repo

# ============================================================================
# CONFIGURATION - Modify these values as needed
# ============================================================================

# Model configuration
BASE_MODEL = "Tesslate/Synthia-S1-27b"
OUTPUT_MODEL = "Synthia-S1-27b-tool-calling"  # Will be pushed as Codyfederer/Synthia-S1-27b-tool-calling

# Dataset configuration
DATASET_NAME = "nvidia/Nemotron-Agentic-v1"
DATASET_SPLIT = "tool_calling"
MAX_SAMPLES = None  # Set to a number (e.g., 10000) to limit dataset size for testing

# Training hyperparameters
NUM_EPOCHS = 1              # 1 epoch is often sufficient for large datasets
MAX_SEQ_LENGTH = 4096       # Adjust based on your GPU memory
BATCH_SIZE = 1              # Per-device batch size
GRADIENT_ACCUMULATION = 16  # Effective batch size = BATCH_SIZE * GRADIENT_ACCUMULATION
LEARNING_RATE = 2e-4
WARMUP_RATIO = 0.03

# LoRA configuration
LORA_R = 64       # LoRA rank - higher = more capacity but more memory
LORA_ALPHA = 128  # LoRA alpha - typically 2x rank
LORA_DROPOUT = 0.05

# Quantization (4-bit for memory efficiency)
USE_4BIT = False  # Using BF16 LoRA on H100 for better quality

# Tokenized dataset caching
TOKENIZED_DATASET_REPO = "Codyfederer/synthia-tool-calling-tokenized"
SAVE_TOKENIZED = True             # Save tokenized dataset to Hub for reuse
TOKENIZED_DATASET_PRIVATE = True  # Make tokenized dataset private
LOAD_TOKENIZED_IF_EXISTS = True   # Skip tokenization if it already exists on the Hub

# Hub configuration
PUSH_TO_HUB = True
HUB_PRIVATE = False  # Set to True for private model

# ============================================================================
# TRAINING SCRIPT
# ============================================================================


def tokenize_conversation(example, tokenizer, max_length):
    """
    Tokenize a conversation using the model's chat template.

    Returns input_ids, attention_mask, and labels for causal LM training.
    """
    messages = example["messages"]

    # Apply chat template to get the full text
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )

    # Tokenize the text
    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        padding=False,        # We'll pad later in the data collator
        return_tensors=None,  # Return lists, not tensors
    )

    # For causal LM, labels are the same as input_ids (shifted internally by the model)
    tokenized["labels"] = tokenized["input_ids"].copy()

    return tokenized


def main():
    print("=" * 60)
    print("Tool Calling Fine-tuning for Synthia-S1-27b")
    print("=" * 60)

    # Initialize Trackio for monitoring
    trackio.init(project="synthia-tool-calling")

    # Get HF username for hub_model_id
    from huggingface_hub import whoami
    try:
        username = whoami()["name"]
        hub_model_id = f"{username}/{OUTPUT_MODEL}"
        print(f"Will push to: {hub_model_id}")
    except Exception as e:
        print(f"Warning: Not logged in to HF Hub ({e})")
        print("Model will be saved locally only. Run 'huggingface-cli login' to enable Hub push.")
        hub_model_id = OUTPUT_MODEL
        global PUSH_TO_HUB
        PUSH_TO_HUB = False

    # -------------------------------------------------------------------------
    # Load Tokenizer FIRST (needed for tokenization)
    # -------------------------------------------------------------------------
    print(f"\nLoading tokenizer from {BASE_MODEL}...")
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL,
        trust_remote_code=True,
        padding_side="right",
    )

    # Ensure pad token is set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    print(f"Vocab size: {len(tokenizer):,}")

    # -------------------------------------------------------------------------
    # Try to Load Pre-tokenized Dataset from Hub
    # -------------------------------------------------------------------------
    train_dataset = None
    eval_dataset = None

    if LOAD_TOKENIZED_IF_EXISTS:
        print(f"\nChecking for pre-tokenized dataset: {TOKENIZED_DATASET_REPO}")
        try:
            from datasets import load_dataset as hf_load_dataset

            # Try to load the tokenized dataset
            tokenized_ds = hf_load_dataset(TOKENIZED_DATASET_REPO)

            # Check if it has the required columns (input_ids, attention_mask)
            if "train" in tokenized_ds and "input_ids" in tokenized_ds["train"].column_names:
                print("  Found pre-tokenized dataset with input_ids!")
                train_dataset = tokenized_ds["train"]
                eval_dataset = tokenized_ds.get("test", tokenized_ds.get("validation"))
                print(f"  Train samples: {len(train_dataset):,}")
                if eval_dataset:
                    print(f"  Eval samples: {len(eval_dataset):,}")
            else:
                print("  Dataset exists but is not tokenized (no input_ids column)")
                print("  Will re-tokenize and save...")
        except Exception as e:
            print(f"  Could not load pre-tokenized dataset: {e}")
            print("  Will tokenize from scratch...")

    # -------------------------------------------------------------------------
    # Load and Tokenize Dataset (if not loaded from Hub)
    # -------------------------------------------------------------------------
    if train_dataset is None:
        print(f"\nLoading dataset: {DATASET_NAME} ({DATASET_SPLIT} split)...")

        # Download the JSONL file directly from the dataset repo
        jsonl_file = f"data/{DATASET_SPLIT}.jsonl"
        print(f"Downloading {jsonl_file}...")
        local_path = hf_hub_download(
            repo_id=DATASET_NAME,
            filename=jsonl_file,
            repo_type="dataset",
        )
        print(f"Downloaded to: {local_path}")

        # Load JSONL manually to handle schema inconsistencies
        print("Loading and processing JSONL file...")
        processed_examples = []
        skipped = 0
        with open(local_path, "r", encoding="utf-8") as f:
            for line_num, line in enumerate(f):
                if line_num % 50000 == 0:
                    print(f"  Processed {line_num:,} lines...")
                try:
                    example = json.loads(line.strip())
                    messages = example.get("messages", [])

                    # Convert messages to a consistent format
                    formatted_messages = []
                    for msg in messages:
                        role = msg.get("role", "user")
                        content = msg.get("content", "")

                        # Content may be a plain string, a dict, or a list of parts;
                        # flatten everything to a single string
                        if isinstance(content, list):
                            # For tool calls, content is often a list of dicts
                            parts = []
                            for item in content:
                                if isinstance(item, dict):
                                    if "text" in item:
                                        parts.append(item["text"])
                                    else:
                                        parts.append(json.dumps(item))
                                else:
                                    parts.append(str(item))
                            content = "\n".join(parts) if parts else ""
                        elif isinstance(content, dict):
                            content = json.dumps(content)
                        elif content is None:
                            content = ""
                        else:
                            content = str(content)

                        formatted_messages.append({"role": role, "content": content})

                    # Ensure proper role alternation for the chat template:
                    # merge consecutive messages with the same role and handle tool messages
                    if formatted_messages:
                        merged_messages = []
                        for msg in formatted_messages:
                            role = msg["role"]
                            content = msg["content"]

                            # Map the tool role to user (tool output is provided to the model like user input)
                            if role == "tool":
                                role = "user"
                                content = f"[Tool Result]\n{content}"

                            # If same role as previous, merge content
                            if merged_messages and merged_messages[-1]["role"] == role:
                                merged_messages[-1]["content"] += f"\n\n{content}"
                            else:
                                merged_messages.append({"role": role, "content": content})

                        # Ensure the conversation starts with a user turn
                        if merged_messages and merged_messages[0]["role"] != "user":
                            # Prepend a placeholder user message if it starts with assistant
                            merged_messages.insert(0, {"role": "user", "content": "[Start]"})

                        processed_examples.append({"messages": merged_messages})
                except Exception as e:
                    skipped += 1
                    if skipped < 5:
                        print(f"  Warning: Skipped line {line_num}: {e}")

        print(f"Loaded {len(processed_examples):,} examples (skipped {skipped})")

        # Create dataset from processed examples
        dataset = Dataset.from_list(processed_examples)
        print(f"Dataset size: {len(dataset):,} examples")

        if MAX_SAMPLES and len(dataset) > MAX_SAMPLES:
            dataset = dataset.shuffle(seed=42).select(range(MAX_SAMPLES))
            print(f"Limited to {MAX_SAMPLES:,} samples for training")

        # Create train/eval split
        split_dataset = dataset.train_test_split(test_size=0.02, seed=42)
        train_dataset = split_dataset["train"]
        eval_dataset = split_dataset["test"]

        print(f"Train samples: {len(train_dataset):,}")
        print(f"Eval samples: {len(eval_dataset):,}")
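        # Optional sanity check (a sketch, commented out by default): render one
        # formatted conversation through the chat template before tokenizing, to
        # confirm tool results and role merging look the way you expect.
        # print(tokenizer.apply_chat_template(
        #     train_dataset[0]["messages"], tokenize=False, add_generation_prompt=False))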
        # -------------------------------------------------------------------------
        # TOKENIZE the dataset (this is the key step!)
        # -------------------------------------------------------------------------
        print(f"\nTokenizing dataset with max_length={MAX_SEQ_LENGTH}...")
        print("This may take a while for large datasets...")

        # Tokenize train dataset
        train_dataset = train_dataset.map(
            lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
            remove_columns=["messages"],  # Remove text, keep only tokens
            num_proc=4,                   # Parallelize
            desc="Tokenizing train",
        )

        # Tokenize eval dataset
        eval_dataset = eval_dataset.map(
            lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
            remove_columns=["messages"],
            num_proc=4,
            desc="Tokenizing eval",
        )

        print("Tokenization complete!")
        print(f"Train dataset columns: {train_dataset.column_names}")
        print(f"Sample input_ids length: {len(train_dataset[0]['input_ids'])}")

        # Save TOKENIZED dataset to Hub for reuse
        if SAVE_TOKENIZED:
            print(f"\nSaving TOKENIZED dataset to Hub: {TOKENIZED_DATASET_REPO}")
            try:
                # Create the repo if it doesn't exist (private!)
                api = HfApi()
                try:
                    create_repo(
                        TOKENIZED_DATASET_REPO,
                        repo_type="dataset",
                        private=TOKENIZED_DATASET_PRIVATE,
                        exist_ok=True,
                    )
                    print(f"  Created/verified repo (private={TOKENIZED_DATASET_PRIVATE})")

                    # Try to update visibility if the repo already exists
                    if TOKENIZED_DATASET_PRIVATE:
                        try:
                            api.update_repo_visibility(
                                TOKENIZED_DATASET_REPO,
                                repo_type="dataset",
                                private=True,
                            )
                            print("  Ensured repo is private")
                        except Exception:
                            pass  # Ignore if already private or no permission
                except Exception as e:
                    print(f"  Repo creation note: {e}")

                # Reset format to ensure data is serializable (not torch tensors)
                train_dataset.reset_format()
                eval_dataset.reset_format()

                # Verify the data looks correct before pushing
                print("  Verifying tokenized data...")
                print(f"  Train columns: {train_dataset.column_names}")
                print(f"  Sample input_ids type: {type(train_dataset[0]['input_ids'])}")
                print(f"  Sample input_ids length: {len(train_dataset[0]['input_ids'])}")
                print(f"  First 10 tokens: {train_dataset[0]['input_ids'][:10]}")

                # Push tokenized datasets to Hub (private is set at repo creation)
                print(f"  Pushing train split ({len(train_dataset):,} examples)...")
                train_dataset.push_to_hub(
                    TOKENIZED_DATASET_REPO,
                    split="train",
                )
                print(f"  Pushing test split ({len(eval_dataset):,} examples)...")
                eval_dataset.push_to_hub(
                    TOKENIZED_DATASET_REPO,
                    split="test",
                )

                print(f"  SUCCESS! Saved TOKENIZED data to: https://huggingface.co/datasets/{TOKENIZED_DATASET_REPO}")
                print(f"  Columns saved: {train_dataset.column_names}")
                print(f"  Dataset is private: {TOKENIZED_DATASET_PRIVATE}")

                # Verify the upload by trying to load it back
                print("  Verifying upload...")
                try:
                    from datasets import load_dataset as verify_load

                    verify_ds = verify_load(TOKENIZED_DATASET_REPO, split="train", streaming=True)
                    sample = next(iter(verify_ds))
                    if "input_ids" in sample:
                        print(f"  VERIFIED: Dataset contains input_ids with {len(sample['input_ids'])} tokens")
                    else:
                        print(f"  WARNING: Dataset uploaded but input_ids not found in columns: {list(sample.keys())}")
                except Exception as ve:
                    print(f"  Could not verify upload: {ve}")
            except Exception as e:
                print(f"  ERROR saving to Hub: {e}")
                import traceback
                traceback.print_exc()
                print("  Continuing with training anyway...")
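    # Rough memory math (an estimate, not a measurement): a 27B model in bf16 needs
    # ~54 GB for weights alone (27e9 params * 2 bytes). LoRA at r=64 over the attention
    # and MLP projections adds on the order of a few hundred million trainable params,
    # so optimizer state stays small, but activations at 4k context still rely on
    # gradient checkpointing to fit on a single 80 GB card.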
    # -------------------------------------------------------------------------
    # Load Model with Quantization
    # -------------------------------------------------------------------------
    print(f"\nLoading model: {BASE_MODEL}...")

    if USE_4BIT:
        print("Using 4-bit quantization (QLoRA)")
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )
    else:
        bnb_config = None

    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="sdpa",  # Use PyTorch's Scaled Dot Product Attention
    )

    if USE_4BIT:
        model = prepare_model_for_kbit_training(model)

    print(f"Model loaded. Parameters: {model.num_parameters():,}")

    # -------------------------------------------------------------------------
    # Configure LoRA
    # -------------------------------------------------------------------------
    print(f"\nConfiguring LoRA (r={LORA_R}, alpha={LORA_ALPHA})...")

    # Target modules for the Gemma 3 architecture
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",  # Attention
        "gate_proj", "up_proj", "down_proj",     # MLP
    ]

    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=target_modules,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # -------------------------------------------------------------------------
    # Training Configuration
    # -------------------------------------------------------------------------
    print("\nConfiguring training...")
    training_args = SFTConfig(
        output_dir=f"./{OUTPUT_MODEL}",
        # Training params
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION,
        # Optimizer
        learning_rate=LEARNING_RATE,
        lr_scheduler_type="cosine",
        warmup_ratio=WARMUP_RATIO,
        weight_decay=0.01,
        optim="adamw_torch",
        # Memory optimization
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        max_grad_norm=1.0,
        # Sequence length
        max_length=MAX_SEQ_LENGTH,
        packing=False,  # Disable packing for tool calling (preserve conversation structure)
        # Evaluation
        eval_strategy="steps",
        eval_steps=500,
        # Saving
        save_strategy="steps",
        save_steps=500,
        save_total_limit=3,
        # Hub
        push_to_hub=PUSH_TO_HUB,
        hub_model_id=hub_model_id if PUSH_TO_HUB else None,
        hub_strategy="checkpoint",
        hub_private_repo=HUB_PRIVATE,
        # Logging
        logging_steps=10,
        report_to="trackio",
        run_name=f"lora-r{LORA_R}-lr{LEARNING_RATE}",
        # Performance
        bf16=True,
        dataloader_num_workers=4,
        dataloader_pin_memory=True,
        # Reproducibility
        seed=42,
    )

    # -------------------------------------------------------------------------
    # Initialize Trainer
    # -------------------------------------------------------------------------
    print("\nInitializing trainer...")

    # Create data collator for padding pre-tokenized data
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # Causal LM, not masked LM
    )

    # Check if dataset is pre-tokenized
    is_pretokenized = "input_ids" in train_dataset.column_names
    print(f"Dataset is pre-tokenized: {is_pretokenized}")
    print(f"Dataset columns: {train_dataset.column_names}")

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        processing_class=tokenizer,
        data_collator=data_collator,
    )

    # -------------------------------------------------------------------------
    # Train!
    # -------------------------------------------------------------------------
    print("\n" + "=" * 60)
    print("Starting training...")
    print("=" * 60 + "\n")

    trainer.train()

    # -------------------------------------------------------------------------
    # Save Final Model
    # -------------------------------------------------------------------------
    print("\nSaving final model...")
    trainer.save_model()

    if PUSH_TO_HUB:
        print(f"Pushing to Hub: {hub_model_id}")
        trainer.push_to_hub()
        print(f"\n✅ Model available at: https://huggingface.co/{hub_model_id}")
    else:
        print(f"Model saved locally to: ./{OUTPUT_MODEL}")

    print("\n" + "=" * 60)
    print("Training complete!")
    print("=" * 60)


if __name__ == "__main__":
    main()
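# ----------------------------------------------------------------------------
# Example: loading the trained adapter for inference. A minimal sketch, not
# executed by this script; it assumes the adapter was pushed to
# "<username>/Synthia-S1-27b-tool-calling" (replace with your actual repo id).
# ----------------------------------------------------------------------------
#
#   import torch
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   from peft import PeftModel
#
#   base = AutoModelForCausalLM.from_pretrained(
#       "Tesslate/Synthia-S1-27b", torch_dtype=torch.bfloat16, device_map="auto"
#   )
#   tokenizer = AutoTokenizer.from_pretrained("Tesslate/Synthia-S1-27b")
#   model = PeftModel.from_pretrained(base, "<username>/Synthia-S1-27b-tool-calling")
#
#   messages = [{"role": "user", "content": "What's the weather in Tokyo? Use the weather tool."}]
#   inputs = tokenizer.apply_chat_template(
#       messages, add_generation_prompt=True, return_tensors="pt"
#   ).to(model.device)
#   print(tokenizer.decode(model.generate(inputs, max_new_tokens=256)[0]))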