""" |
|
|
LoRA Fine-tuning Script: Add Tool Calling to Synthia-S1-27b |
|
|
|
|
|
This script fine-tunes Tesslate/Synthia-S1-27b with LoRA using the |
|
|
nvidia/Nemotron-Agentic-v1 tool_calling dataset. |
|
|
|
|
|
Usage: |
|
|
# With uv (recommended) |
|
|
uv run train_tool_calling.py |
|
|
|
|
|
# Or with pip |
|
|
pip install torch transformers datasets trl peft accelerate bitsandbytes trackio |
|
|
python train_tool_calling.py |
|
|
|
|
|
Hardware Requirements: |
|
|
- Minimum: 1x A100 80GB or 2x A10G 24GB |
|
|
- Recommended: 1x A100 80GB for fastest training |
|
|
""" |

import json

import torch
import trackio
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi, create_repo, hf_hub_download
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
)
from trl import SFTConfig, SFTTrainer

# --- Configuration ---

# Model
BASE_MODEL = "Tesslate/Synthia-S1-27b"
OUTPUT_MODEL = "Synthia-S1-27b-tool-calling"

# Dataset
DATASET_NAME = "nvidia/Nemotron-Agentic-v1"
DATASET_SPLIT = "tool_calling"
MAX_SAMPLES = None  # set an int to subsample for quick runs

# Training
NUM_EPOCHS = 1
MAX_SEQ_LENGTH = 4096
BATCH_SIZE = 1
GRADIENT_ACCUMULATION = 16  # effective batch size: 1 * 16 = 16 per device
LEARNING_RATE = 2e-4
WARMUP_RATIO = 0.03

# LoRA
LORA_R = 64
LORA_ALPHA = 128
LORA_DROPOUT = 0.05

# Quantization (QLoRA); False keeps the base model in bf16
USE_4BIT = False

# Tokenized-dataset caching on the Hub
TOKENIZED_DATASET_REPO = "Codyfederer/synthia-tool-calling-tokenized"
SAVE_TOKENIZED = True
TOKENIZED_DATASET_PRIVATE = True
LOAD_TOKENIZED_IF_EXISTS = True

# Hub upload of the trained adapter
PUSH_TO_HUB = True
HUB_PRIVATE = False


def tokenize_conversation(example, tokenizer, max_length):
    """
    Tokenize a conversation using the model's chat template.
    Returns input_ids, attention_mask, and labels for causal LM training.
    """
    messages = example["messages"]

    # Render the full conversation as text; no generation prompt is appended
    # because we train on the complete dialogue.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )

    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )

    # Labels mirror input_ids, so loss is computed over every token, user
    # turns included (see the optional masking sketch below).
    tokenized["labels"] = tokenized["input_ids"].copy()

    return tokenized
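

# Optional refinement, not wired in by default: compute loss only on assistant
# tokens. A sketch under stated assumptions -- the helper name is made up here,
# and the incremental-prefix trick assumes the chat template renders
# messages[0..i] as a strict extension of messages[0..i-1], which holds for
# most templates but is worth verifying for this model.
def tokenize_assistant_only(example, tokenizer, max_length):
    messages = example["messages"]
    input_ids, labels = [], []
    for i, msg in enumerate(messages):
        prefix_ids = tokenizer.apply_chat_template(
            messages[: i + 1], tokenize=True, add_generation_prompt=False
        )
        new_ids = prefix_ids[len(input_ids):]  # tokens contributed by msg
        input_ids = prefix_ids
        if msg["role"] == "assistant":
            labels.extend(new_ids)
        else:
            labels.extend([-100] * len(new_ids))  # excluded from the loss
    input_ids, labels = input_ids[:max_length], labels[:max_length]
    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        "labels": labels,
    }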


def main():
    global PUSH_TO_HUB  # may be disabled below if no Hub login is found

    print("=" * 60)
    print("Tool Calling Fine-tuning for Synthia-S1-27b")
    print("=" * 60)

    # Experiment tracking
    trackio.init(project="synthia-tool-calling")

    # Resolve the Hub repo id up front; fall back to local-only saving.
    from huggingface_hub import whoami
    try:
        username = whoami()["name"]
        hub_model_id = f"{username}/{OUTPUT_MODEL}"
        print(f"Will push to: {hub_model_id}")
    except Exception as e:
        print(f"Warning: Not logged in to HF Hub ({e})")
        print("Model will be saved locally only. Run 'huggingface-cli login' to enable Hub push.")
        hub_model_id = OUTPUT_MODEL
        PUSH_TO_HUB = False

print(f"\nLoading tokenizer from {BASE_MODEL}...") |
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained( |
|
|
BASE_MODEL, |
|
|
trust_remote_code=True, |
|
|
padding_side="right", |
|
|
) |
|
|
|
|
|
|
|
|
if tokenizer.pad_token is None: |
|
|
tokenizer.pad_token = tokenizer.eos_token |
|
|
tokenizer.pad_token_id = tokenizer.eos_token_id |
|
|
|
|
|
print(f"Vocab size: {len(tokenizer):,}") |

    # --- Dataset: reuse a pre-tokenized copy from the Hub if available ---
    train_dataset = None
    eval_dataset = None

    if LOAD_TOKENIZED_IF_EXISTS:
        print(f"\nChecking for pre-tokenized dataset: {TOKENIZED_DATASET_REPO}")
        try:
            # load_dataset is already imported at module level.
            tokenized_ds = load_dataset(TOKENIZED_DATASET_REPO)

            if "train" in tokenized_ds and "input_ids" in tokenized_ds["train"].column_names:
                print("  Found pre-tokenized dataset with input_ids!")
                train_dataset = tokenized_ds["train"]
                eval_dataset = tokenized_ds.get("test", tokenized_ds.get("validation"))
                print(f"  Train samples: {len(train_dataset):,}")
                if eval_dataset:
                    print(f"  Eval samples: {len(eval_dataset):,}")
            else:
                print("  Dataset exists but is not tokenized (no input_ids column)")
                print("  Will re-tokenize and save...")
        except Exception as e:
            print(f"  Could not load pre-tokenized dataset: {e}")
            print("  Will tokenize from scratch...")

    # --- Otherwise: download, clean, and tokenize from scratch ---
    if train_dataset is None:
        print(f"\nLoading dataset: {DATASET_NAME} ({DATASET_SPLIT} split)...")

        jsonl_file = f"data/{DATASET_SPLIT}.jsonl"
        print(f"Downloading {jsonl_file}...")

        local_path = hf_hub_download(
            repo_id=DATASET_NAME,
            filename=jsonl_file,
            repo_type="dataset",
        )
        print(f"Downloaded to: {local_path}")

print("Loading and processing JSONL file...") |
|
|
processed_examples = [] |
|
|
skipped = 0 |
|
|
|
|
|
with open(local_path, 'r', encoding='utf-8') as f: |
|
|
for line_num, line in enumerate(f): |
|
|
if line_num % 50000 == 0: |
|
|
print(f" Processed {line_num:,} lines...") |
|
|
try: |
|
|
example = json.loads(line.strip()) |
|
|
messages = example.get("messages", []) |
|
|
|
|
|
|
|
|
formatted_messages = [] |
|
|
for msg in messages: |
|
|
role = msg.get("role", "user") |
|
|
content = msg.get("content", "") |
|
|
|
|
|
|
|
|
if isinstance(content, list): |
|
|
|
|
|
parts = [] |
|
|
for item in content: |
|
|
if isinstance(item, dict): |
|
|
if "text" in item: |
|
|
parts.append(item["text"]) |
|
|
else: |
|
|
parts.append(json.dumps(item)) |
|
|
else: |
|
|
parts.append(str(item)) |
|
|
content = "\n".join(parts) if parts else "" |
|
|
elif isinstance(content, dict): |
|
|
content = json.dumps(content) |
|
|
elif content is None: |
|
|
content = "" |
|
|
else: |
|
|
content = str(content) |
|
|
|
|
|
formatted_messages.append({ |
|
|
"role": role, |
|
|
"content": content |
|
|
}) |
|
|
|
|
|
|
|
|
|
|
|
if formatted_messages: |
|
|
merged_messages = [] |
|
|
for msg in formatted_messages: |
|
|
role = msg["role"] |
|
|
content = msg["content"] |
|
|
|
|
|
|
|
|
if role == "tool": |
|
|
role = "user" |
|
|
content = f"[Tool Result]\n{content}" |
|
|
|
|
|
|
|
|
if merged_messages and merged_messages[-1]["role"] == role: |
|
|
merged_messages[-1]["content"] += f"\n\n{content}" |
|
|
else: |
|
|
merged_messages.append({"role": role, "content": content}) |
|
|
|
|
|
|
|
|
if merged_messages and merged_messages[0]["role"] != "user": |
|
|
|
|
|
merged_messages.insert(0, {"role": "user", "content": "[Start]"}) |
|
|
|
|
|
processed_examples.append({"messages": merged_messages}) |
|
|
|
|
|
except Exception as e: |
|
|
skipped += 1 |
|
|
if skipped < 5: |
|
|
print(f" Warning: Skipped line {line_num}: {e}") |
|
|
|
|
|
print(f"Loaded {len(processed_examples):,} examples (skipped {skipped})") |

        dataset = Dataset.from_list(processed_examples)
        print(f"Dataset size: {len(dataset):,} examples")

        if MAX_SAMPLES and len(dataset) > MAX_SAMPLES:
            dataset = dataset.shuffle(seed=42).select(range(MAX_SAMPLES))
            print(f"Limited to {MAX_SAMPLES:,} samples for training")

        # Hold out 2% of the data for evaluation.
        split_dataset = dataset.train_test_split(test_size=0.02, seed=42)
        train_dataset = split_dataset["train"]
        eval_dataset = split_dataset["test"]

        print(f"Train samples: {len(train_dataset):,}")
        print(f"Eval samples: {len(eval_dataset):,}")

print(f"\nTokenizing dataset with max_length={MAX_SEQ_LENGTH}...") |
|
|
print("This may take a while for large datasets...") |
|
|
|
|
|
|
|
|
train_dataset = train_dataset.map( |
|
|
lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH), |
|
|
remove_columns=["messages"], |
|
|
num_proc=4, |
|
|
desc="Tokenizing train", |
|
|
) |
|
|
|
|
|
|
|
|
eval_dataset = eval_dataset.map( |
|
|
lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH), |
|
|
remove_columns=["messages"], |
|
|
num_proc=4, |
|
|
desc="Tokenizing eval", |
|
|
) |
|
|
|
|
|
print(f"Tokenization complete!") |
|
|
print(f"Train dataset columns: {train_dataset.column_names}") |
|
|
print(f"Sample input_ids length: {len(train_dataset[0]['input_ids'])}") |
|
|
|
|
|
|
|
|
        # --- Cache the tokenized splits on the Hub for future runs ---
        if SAVE_TOKENIZED:
            print(f"\nSaving TOKENIZED dataset to Hub: {TOKENIZED_DATASET_REPO}")
            try:
                api = HfApi()
                try:
                    create_repo(
                        TOKENIZED_DATASET_REPO,
                        repo_type="dataset",
                        private=TOKENIZED_DATASET_PRIVATE,
                        exist_ok=True,
                    )
                    print(f"  Created/verified repo (private={TOKENIZED_DATASET_PRIVATE})")

                    # create_repo(exist_ok=True) leaves an existing repo's
                    # visibility untouched, so enforce privacy explicitly.
                    if TOKENIZED_DATASET_PRIVATE:
                        try:
                            api.update_repo_visibility(
                                TOKENIZED_DATASET_REPO,
                                repo_type="dataset",
                                private=True,
                            )
                            print("  Ensured repo is private")
                        except Exception:
                            pass
                except Exception as e:
                    print(f"  Repo creation note: {e}")

                # Clear any format transforms before pushing raw columns.
                train_dataset.reset_format()
                eval_dataset.reset_format()

                print("  Verifying tokenized data...")
                print(f"  Train columns: {train_dataset.column_names}")
                print(f"  Sample input_ids type: {type(train_dataset[0]['input_ids'])}")
                print(f"  Sample input_ids length: {len(train_dataset[0]['input_ids'])}")
                print(f"  First 10 tokens: {train_dataset[0]['input_ids'][:10]}")

                print(f"  Pushing train split ({len(train_dataset):,} examples)...")
                train_dataset.push_to_hub(
                    TOKENIZED_DATASET_REPO,
                    split="train",
                )
                print(f"  Pushing test split ({len(eval_dataset):,} examples)...")
                eval_dataset.push_to_hub(
                    TOKENIZED_DATASET_REPO,
                    split="test",
                )
                print(f"  SUCCESS! Saved TOKENIZED data to: https://huggingface.co/datasets/{TOKENIZED_DATASET_REPO}")
                print(f"  Columns saved: {train_dataset.column_names}")
                print(f"  Dataset is private: {TOKENIZED_DATASET_PRIVATE}")

                # Round-trip check: stream one sample back from the Hub and
                # confirm the input_ids column survived the upload.
                print("  Verifying upload...")
                try:
                    verify_ds = load_dataset(TOKENIZED_DATASET_REPO, split="train", streaming=True)
                    sample = next(iter(verify_ds))
                    if "input_ids" in sample:
                        print(f"  VERIFIED: Dataset contains input_ids with {len(sample['input_ids'])} tokens")
                    else:
                        print(f"  WARNING: Dataset uploaded but input_ids not found in columns: {list(sample.keys())}")
                except Exception as ve:
                    print(f"  Could not verify upload: {ve}")

            except Exception as e:
                print(f"  ERROR saving to Hub: {e}")
                import traceback
                traceback.print_exc()
                print("  Continuing with training anyway...")

print(f"\nLoading model: {BASE_MODEL}...") |
|
|
|
|
|
if USE_4BIT: |
|
|
print("Using 4-bit quantization (QLoRA)") |
|
|
bnb_config = BitsAndBytesConfig( |
|
|
load_in_4bit=True, |
|
|
bnb_4bit_quant_type="nf4", |
|
|
bnb_4bit_compute_dtype=torch.bfloat16, |
|
|
bnb_4bit_use_double_quant=True, |
|
|
) |
|
|
else: |
|
|
bnb_config = None |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
BASE_MODEL, |
|
|
quantization_config=bnb_config, |
|
|
device_map="auto", |
|
|
trust_remote_code=True, |
|
|
torch_dtype=torch.bfloat16, |
|
|
attn_implementation="sdpa", |
|
|
) |
|
|
|
|
|
if USE_4BIT: |
|
|
model = prepare_model_for_kbit_training(model) |
|
|
|
|
|
print(f"Model loaded. Parameters: {model.num_parameters():,}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print(f"\nConfiguring LoRA (r={LORA_R}, alpha={LORA_ALPHA})...") |
|
|
|
|
|
|
|
|
target_modules = [ |
|
|
"q_proj", "k_proj", "v_proj", "o_proj", |
|
|
"gate_proj", "up_proj", "down_proj", |
|
|
] |
|
|
|
|
|
lora_config = LoraConfig( |
|
|
r=LORA_R, |
|
|
lora_alpha=LORA_ALPHA, |
|
|
lora_dropout=LORA_DROPOUT, |
|
|
target_modules=target_modules, |
|
|
bias="none", |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
|
|
|
model = get_peft_model(model, lora_config) |
|
|
model.print_trainable_parameters() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("\nConfiguring training...") |
|
|
|
|
|
training_args = SFTConfig( |
|
|
output_dir=f"./{OUTPUT_MODEL}", |
|
|
|
|
|
|
|
|
num_train_epochs=NUM_EPOCHS, |
|
|
per_device_train_batch_size=BATCH_SIZE, |
|
|
per_device_eval_batch_size=BATCH_SIZE, |
|
|
gradient_accumulation_steps=GRADIENT_ACCUMULATION, |
|
|
|
|
|
|
|
|
learning_rate=LEARNING_RATE, |
|
|
lr_scheduler_type="cosine", |
|
|
warmup_ratio=WARMUP_RATIO, |
|
|
weight_decay=0.01, |
|
|
optim="adamw_torch", |
|
|
|
|
|
|
|
|
gradient_checkpointing=True, |
|
|
gradient_checkpointing_kwargs={"use_reentrant": False}, |
|
|
max_grad_norm=1.0, |
|
|
|
|
|
|
|
|
max_length=MAX_SEQ_LENGTH, |
|
|
packing=False, |
|
|
|
|
|
|
|
|
eval_strategy="steps", |
|
|
eval_steps=500, |
|
|
|
|
|
|
|
|
save_strategy="steps", |
|
|
save_steps=500, |
|
|
save_total_limit=3, |
|
|
|
|
|
|
|
|
push_to_hub=PUSH_TO_HUB, |
|
|
hub_model_id=hub_model_id if PUSH_TO_HUB else None, |
|
|
hub_strategy="checkpoint", |
|
|
hub_private_repo=HUB_PRIVATE, |
|
|
|
|
|
|
|
|
logging_steps=10, |
|
|
report_to="trackio", |
|
|
run_name=f"lora-r{LORA_R}-lr{LEARNING_RATE}", |
|
|
|
|
|
|
|
|
bf16=True, |
|
|
dataloader_num_workers=4, |
|
|
dataloader_pin_memory=True, |
|
|
|
|
|
|
|
|
seed=42, |
|
|
) |
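
    # With BATCH_SIZE=1 and GRADIENT_ACCUMULATION=16, each optimizer step sees
    # an effective batch of 16 sequences per device, so eval/save every 500
    # steps corresponds to 8,000 training examples.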
print("\nInitializing trainer...") |
|
|
|
|
|
|
|
|
data_collator = DataCollatorForLanguageModeling( |
|
|
tokenizer=tokenizer, |
|
|
mlm=False, |
|
|
) |
|
|
|
|
|
|
|
|
is_pretokenized = "input_ids" in train_dataset.column_names |
|
|
print(f"Dataset is pre-tokenized: {is_pretokenized}") |
|
|
print(f"Dataset columns: {train_dataset.column_names}") |
|
|
|
|
|
trainer = SFTTrainer( |
|
|
model=model, |
|
|
args=training_args, |
|
|
train_dataset=train_dataset, |
|
|
eval_dataset=eval_dataset, |
|
|
processing_class=tokenizer, |
|
|
data_collator=data_collator, |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("\n" + "=" * 60) |
|
|
print("Starting training...") |
|
|
print("=" * 60 + "\n") |
|
|
|
|
|
trainer.train() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("\nSaving final model...") |
|
|
trainer.save_model() |
|
|
|
|
|
if PUSH_TO_HUB: |
|
|
print(f"Pushing to Hub: {hub_model_id}") |
|
|
trainer.push_to_hub() |
|
|
print(f"\n✅ Model available at: https://huggingface.co/{hub_model_id}") |
|
|
else: |
|
|
print(f"Model saved locally to: ./{OUTPUT_MODEL}") |
|
|
|
|
|
print("\n" + "=" * 60) |
|
|
print("Training complete!") |
|
|
print("=" * 60) |


if __name__ == "__main__":
    main()