Experiments with the char-based training.
sopotc committed Mar 17, 2024
1 parent 71c1d53 commit c6d4861
Showing 3 changed files with 27 additions and 20 deletions.
42 changes: 24 additions & 18 deletions config/train_shakespeare_char.py
@@ -9,29 +9,35 @@
# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'

dataset = 'shakespeare_char'
-gradient_accumulation_steps = 1
-batch_size = 64
-block_size = 256 # context of up to 256 previous characters
+gradient_accumulation_steps = 2
+batch_size = 40
+block_size = 512 # context of up to 512 previous characters

# baby GPT model :)
-n_layer = 6
-n_head = 6
-n_embd = 384
-dropout = 0.2
+n_layer = 10
+n_head = 16
+n_embd = 512
+dropout = 0.15

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
-max_iters = 5000
-lr_decay_iters = 5000 # make equal to max_iters usually
-min_lr = 1e-4 # learning_rate / 10 usually
+max_iters = 1500
+lr_decay_iters = max_iters # make equal to max_iters usually
+min_lr = 1e-5 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

# on macbook also add
# device = 'cpu' # run on cpu only
# compile = False # do not torch compile the model
weight_decay = 1e-1



# next token baseline:
# step 1000: train loss 1.0870, val loss 1.4768 (lr 1e-3, dropout 0.3) *

# step 1500: train loss 2.1345, val loss 2.3614 (lr 1e-5, dropout 0.2)
# step 1500: train loss 1.7811, val loss 2.1740 (lr 1e-3, dropout 0.4)
# step 1500: train loss 1.9536, val loss 2.2579 (lr 1e-3, dropout 0.5)
# step 750:  train loss 1.7220, val loss 2.1908 (lr 1e-3, dropout 0.1, overfitted after 750)
# step 1500: train loss 1.6305, val loss 2.1660 (lr 1e-3, dropout 0.3) *
# step 1500: train loss 1.6350, val loss 2.1830 (lr 1e-3, dropout 0.3, block_size 512)
# step 750:  train loss 1.7841, val loss 2.1851 (lr 1e-3, dropout 0.15, block_size 512, overfitted after 750)
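Taken together, the new settings above roughly 2.5x the characters consumed per optimizer step while cutting the run from 5000 to 1500 iterations. A quick back-of-the-envelope check (not part of the commit) using only the values shown:

# rough arithmetic on the config values above, not part of the commit
old_tokens_per_iter = 1 * 64 * 256   # grad_accum * batch_size * block_size
new_tokens_per_iter = 2 * 40 * 512
print(old_tokens_per_iter, new_tokens_per_iter)                 # 16384 vs 40960 (~2.5x per step)
print(5000 * old_tokens_per_iter, 1500 * new_tokens_per_iter)   # ~81.9M vs ~61.4M chars over the full run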
Empty file added runs_char.MD
5 changes: 3 additions & 2 deletions train.py
@@ -118,9 +118,9 @@ def get_batch(split):
data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
else:
data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
-ix = torch.randint(len(data) - block_size, (batch_size,))
+ix = torch.randint(len(data) - block_size -1, (batch_size,))
x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
-y = torch.stack([torch.from_numpy((data[i+5:i+5+block_size]).astype(np.int64)) for i in ix])
+y = torch.stack([torch.from_numpy((data[i+2:i+2+block_size]).astype(np.int64)) for i in ix])
if device_type == 'cuda':
# pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
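The two modified lines above change the objective in get_batch: instead of the standard next-character targets (inputs shifted by 1), the targets are now shifted 2 characters ahead, and the sampling bound is reduced by 1 so the shifted window stays inside the data. A minimal sketch of this offset-k batching, with a hypothetical helper name that is not part of the repo:

import numpy as np
import torch

def get_batch_offset(data, block_size, batch_size, k=2):
    # sample start indices so the shifted target window [i+k, i+k+block_size) stays in bounds
    ix = torch.randint(len(data) - block_size - k + 1, (batch_size,))
    x = torch.stack([torch.from_numpy(data[i:i+block_size].astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy(data[i+k:i+k+block_size].astype(np.int64)) for i in ix])
    return x, y

# k=1 reproduces the usual next-token setup; k=2 matches the new lines above,
# since len(data) - block_size - 2 + 1 == len(data) - block_size - 1.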
@@ -139,6 +139,7 @@ def get_batch(split):
# determine the vocab size we'll use for from-scratch training

model_args['vocab_size'] = 50304
+#model_args['vocab_size'] = 65 # for tinyshakespeare
gptconf = GPTConfig(**model_args)
model = GPT(gptconf)

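Hardcoding vocab_size here bypasses the meta.pkl lookup that upstream nanoGPT's train.py performs; the commented-out 65 is the character-level Shakespeare vocabulary. A sketch (not part of the commit) of deriving it from the dataset instead, assuming prepare.py wrote a meta.pkl with a 'vocab_size' key as data/shakespeare_char/prepare.py does:

import os, pickle

# read vocab_size from the dataset's meta.pkl if present, otherwise fall back
# (data_dir is defined earlier in train.py as os.path.join('data', dataset))
meta_path = os.path.join(data_dir, 'meta.pkl')
if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    model_args['vocab_size'] = meta['vocab_size']  # 65 for shakespeare_char
else:
    model_args['vocab_size'] = 50304  # GPT-2 vocab size rounded up for efficiency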
