# Fine-tune a causal LM on coaching_data.txt using the Hugging Face Trainer.
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling,
)

# Hub ids for this model are versioned (e.g. -v0.1, -v0.2); pick one your hardware can handle.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"  # if you have the resources

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Tokenize the plain-text dataset into fixed-size blocks.
# Note: TextDataset is deprecated in recent transformers releases; the `datasets`
# library is the recommended replacement, but this still works for a quick run.
def load_dataset(file_path, tokenizer, block_size=512):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

train_dataset = load_dataset("coaching_data.txt", tokenizer)

# mlm=False gives a plain next-token-prediction (causal LM) objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir="./skilllink-coach",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=100,
    save_total_limit=1,
    logging_dir="./logs",
    fp16=True,  # if training on a GPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()
trainer.save_model("./skilllink-coach")
tokenizer.save_pretrained("./skilllink-coach")
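
# A minimal sketch (not part of the original script) showing how the saved
# checkpoint could be loaded and queried after training. The prompt text and
# generation settings here are illustrative assumptions.
from transformers import AutoTokenizer, AutoModelForCausalLM

coach_tokenizer = AutoTokenizer.from_pretrained("./skilllink-coach")
coach_model = AutoModelForCausalLM.from_pretrained("./skilllink-coach")

prompt = "How can I improve my time management at work?"  # hypothetical example prompt
inputs = coach_tokenizer(prompt, return_tensors="pt")
outputs = coach_model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
)
print(coach_tokenizer.decode(outputs[0], skip_special_tokens=True))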