# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such
out_dir = 'out-shakespeare-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too often
# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False
wandb_log = False # override via command line if you like
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'
dataset = 'shakespeare_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters
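# for reference: each iteration sees gradient_accumulation_steps * batch_size
# * block_size = 1 * 64 * 256 = 16,384 tokens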
# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2
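# rough size check: the transformer blocks alone hold about
# 12 * n_layer * n_embd^2 = 12 * 6 * 384^2 ≈ 10.6M parameters,
# plus the (small) token and position embeddings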
learning_rate = 1e-3 # with baby networks we can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # usually set equal to max_iters
min_lr = 1e-4 # usually learning_rate / 10
beta2 = 0.99 # a bit bigger than usual because the number of tokens per iter is small
warmup_iters = 100 # probably not strictly necessary at this scale
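# the resulting schedule (assuming the standard nanoGPT warmup + cosine decay
# in train.py): lr ramps linearly from ~0 to 1e-3 over the first 100 iters,
# then follows a cosine from 1e-3 down to min_lr = 1e-4 at iter 5000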
# on macbook also add
# device = 'cpu' # run on cpu only
# compile = False # do not torch compile the model
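# example invocation, assuming this file is saved as
# config/train_shakespeare_char.py in a nanoGPT checkout:
# $ python data/shakespeare_char/prepare.py
# $ python train.py config/train_shakespeare_char.py
# $ python sample.py --out_dir=out-shakespeare-char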