Experiments with the char-based training.
sopotc committed Mar 17, 2024
1 parent 71c1d53 commit c6d4861
Showing 3 changed files with 27 additions and 20 deletions.
42 changes: 24 additions & 18 deletions config/train_shakespeare_char.py
@@ -9,29 +9,35 @@
# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'

dataset = 'shakespeare_char'
-gradient_accumulation_steps = 1
-batch_size = 64
-block_size = 256 # context of up to 256 previous characters
+gradient_accumulation_steps = 2
+batch_size = 40
+block_size = 512 # context of up to 512 previous characters

# baby GPT model :)
-n_layer = 6
-n_head = 6
-n_embd = 384
-dropout = 0.2
+n_layer = 10
+n_head = 16
+n_embd = 512
+dropout = 0.15

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
-max_iters = 5000
-lr_decay_iters = 5000 # make equal to max_iters usually
-min_lr = 1e-4 # learning_rate / 10 usually
+max_iters = 1500
+lr_decay_iters = max_iters # make equal to max_iters usually
+min_lr = 1e-5 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

# on macbook also add
# device = 'cpu' # run on cpu only
# compile = False # do not torch compile the model
weight_decay = 1e-1



# next token baseline:
# step 1000: train loss 1.0870, val loss 1.4768 (lr 1e-3, dropout 0.3) *

# step 1500: train loss 2.1345, val loss 2.3614 (lr 1e-5, dropout 0.2)
# step 1500: train loss 1.7811, val loss 2.1740 (lr 1e-3, dropout 0.4)
# step 1500: train loss 1.9536, val loss 2.2579 (lr 1e-3, dropout 0.5)
# step 750:  train loss 1.7220, val loss 2.1908 (lr 1e-3, dropout 0.1, overfitted after 750)
# step 1500: train loss 1.6305, val loss 2.1660 (lr 1e-3, dropout 0.3) *
# step 1500: train loss 1.6350, val loss 2.1830 (lr 1e-3, dropout 0.3, block_size 512)
# step 750:  train loss 1.7841, val loss 2.1851 (lr 1e-3, dropout 0.15, block_size 512, overfitted after 750)
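Taken together, the new settings above roughly 2.5x the characters consumed per optimizer step while cutting the run from 5000 to 1500 iterations. A quick back-of-the-envelope check (not part of the commit) using only the values shown:

# rough arithmetic on the config values above, not part of the commit
old_tokens_per_iter = 1 * 64 * 256   # grad_accum * batch_size * block_size
new_tokens_per_iter = 2 * 40 * 512
print(old_tokens_per_iter, new_tokens_per_iter)                 # 16384 vs 40960 (~2.5x per step)
print(5000 * old_tokens_per_iter, 1500 * new_tokens_per_iter)   # ~81.9M vs ~61.4M chars over the full run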
Empty file added runs_char.MD
5 changes: 3 additions & 2 deletions train.py
@@ -118,9 +118,9 @@ def get_batch(split):
data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
else:
data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
-ix = torch.randint(len(data) - block_size, (batch_size,))
+ix = torch.randint(len(data) - block_size -1, (batch_size,))
x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
-y = torch.stack([torch.from_numpy((data[i+5:i+5+block_size]).astype(np.int64)) for i in ix])
+y = torch.stack([torch.from_numpy((data[i+2:i+2+block_size]).astype(np.int64)) for i in ix])
if device_type == 'cuda':
# pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
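The two modified lines above change the objective in get_batch: instead of the standard next-character targets (inputs shifted by 1), the targets are now shifted 2 characters ahead, and the sampling bound is reduced by 1 so the shifted window stays inside the data. A minimal sketch of this offset-k batching, with a hypothetical helper name that is not part of the repo:

import numpy as np
import torch

def get_batch_offset(data, block_size, batch_size, k=2):
    # sample start indices so the shifted target window [i+k, i+k+block_size) stays in bounds
    ix = torch.randint(len(data) - block_size - k + 1, (batch_size,))
    x = torch.stack([torch.from_numpy(data[i:i+block_size].astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy(data[i+k:i+k+block_size].astype(np.int64)) for i in ix])
    return x, y

# k=1 reproduces the usual next-token setup; k=2 matches the new lines above,
# since len(data) - block_size - 2 + 1 == len(data) - block_size - 1.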
@@ -139,6 +139,7 @@ def get_batch(split):
# determine the vocab size we'll use for from-scratch training

model_args['vocab_size'] = 50304
+#model_args['vocab_size'] = 65 # for tinyshakespeare
gptconf = GPTConfig(**model_args)
model = GPT(gptconf)

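Hardcoding vocab_size here bypasses the meta.pkl lookup that upstream nanoGPT's train.py performs; the commented-out 65 is the character-level Shakespeare vocabulary. A sketch (not part of the commit) of deriving it from the dataset instead, assuming prepare.py wrote a meta.pkl with a 'vocab_size' key as data/shakespeare_char/prepare.py does:

import os, pickle

# read vocab_size from the dataset's meta.pkl if present, otherwise fall back
# (data_dir is defined earlier in train.py as os.path.join('data', dataset))
meta_path = os.path.join(data_dir, 'meta.pkl')
if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    model_args['vocab_size'] = meta['vocab_size']  # 65 for shakespeare_char
else:
    model_args['vocab_size'] = 50304  # GPT-2 vocab size rounded up for efficiency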
