Dec branch commit #584

Status: Open (wants to merge 25 commits into base: master)

Commits (25)
- fff7336 First Commit for SAT Solving (leyanpan, Oct 14, 2023)
- 087a633 Add updates for partial training (leyanpan, Oct 19, 2023)
- 1c2647e Modify gitignore (leyanpan, Oct 20, 2023)
- e9caadb add large training set (leyanpan, Nov 5, 2023)
- 432ea20 add test for Large CDCL (leyanpan, Nov 6, 2023)
- fe26dc3 add new prediction file (Nov 7, 2023)
- 7260b24 add diff dataset (leyanpan, Nov 7, 2023)
- 209b732 Merge branch 'master' of github.com:leyanpan/nanoGPT_SAT (leyanpan, Nov 7, 2023)
- 3b40a81 add 20-layer model (Nov 10, 2023)
- 983eeef Fix prediction files (Nov 10, 2023)
- fa795f2 add LTL dataset (leyanpan, Nov 29, 2023)
- e304737 Update Code for binary classification (leyanpan, Jan 18, 2024)
- 022e649 Update .gitignore and remove large files (leyanpan, Jan 18, 2024)
- 43ecb20 make server change (leyanpan, Jan 18, 2024)
- 7e80dad Merge remote-tracking branch 'origin/main' (leyanpan, Jan 18, 2024)
- 32aeac3 Update code for classification evaluation (leyanpan, Jan 23, 2024)
- 02799e7 Merge branch 'main' of github.gatech.edu:LLM-Formal-Reasoning/nanoGPT… (leyanpan, Jan 23, 2024)
- 6596324 Windows Client Changes (leyanpan, Jan 23, 2024)
- c2db691 Merge pull request #2 from LLM-Formal-Reasoning/main (leyanpan, Jan 23, 2024)
- f62d523 Merge branch 'master' of github.gatech.edu:LLM-Formal-Reasoning/nanoG… (leyanpan, Jan 23, 2024)
- d6e824d Add debug option and debug logs (leyanpan, Jan 27, 2024)
- 01d2d3c Update model (leyanpan, Dec 20, 2024)
- ba3b143 Updates (leyanpan, Jan 5, 2025)
- da59cf2 added requirements defined here - Option 1 - Flag (cesposo, Jan 5, 2025)
- 6fd8ccc added the mechanisms for reqs from December 18 meeting (cesposo, Jan 5, 2025)
10 changes: 10 additions & 0 deletions .gitignore
@@ -6,5 +6,15 @@ __pycache__/
*.bin
*.pkl
*.pt
*.txt
*.pyc
*.npz
input.txt
train.npy
val.npy
meta.pkl

model_ckpts/
wandb/
predictions/
model-ckpts/
79 changes: 79 additions & 0 deletions README.md
@@ -1,3 +1,82 @@
# GPT for SAT Solving
I've updated the attention mechanism to use RoPE (rotary position embeddings) and the MLP layer to use the SwiGLU activation, mirroring the LLaMA architecture.
The updated model is in `model_ext.py`; compare it against `model.py` to see the differences.
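
For reference, below is a minimal, self-contained sketch of the two changes (rotary position embeddings applied to the query/key vectors, and a SwiGLU feed-forward block). This illustrates the general technique only, not the code in `model_ext.py`; the helper names are hypothetical.

```python
# Illustrative sketch of RoPE + SwiGLU (not the repo's model_ext.py).
import torch
import torch.nn as nn
import torch.nn.functional as F

def apply_rope(x, base=10000.0):
    """Rotate query/key channels by position-dependent angles (rotate-half RoPE)."""
    # x: (batch, n_head, seq_len, head_dim)
    b, h, t, d = x.shape
    half = d // 2
    freqs = base ** (-torch.arange(half, device=x.device, dtype=torch.float32) / half)
    angles = torch.arange(t, device=x.device, dtype=torch.float32)[:, None] * freqs[None, :]
    cos, sin = angles.cos(), angles.sin()   # (seq_len, half); broadcasts over batch and heads
    x1, x2 = x[..., :half], x[..., half:]
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)

class SwiGLU(nn.Module):
    """LLaMA-style MLP: w2(silu(w1(x)) * w3(x))."""
    def __init__(self, n_embd, hidden_dim):
        super().__init__()
        self.w1 = nn.Linear(n_embd, hidden_dim, bias=False)
        self.w3 = nn.Linear(n_embd, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, n_embd, bias=False)

    def forward(self, x):
        return self.w2(F.silu(self.w1(x)) * self.w3(x))
```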

To train, first prepare the dataset, then launch training:
```bash
python data/6_10_marginal/prepare.py
python train_sat.py config/6_10_marginal.py
```

# Training Command

## DPLL

```bash
python train.py config/train_sat_dpll.py --max_iters 200000
```

## CDCL

```bash
python train.py config/train_sat_cdcl.py --max_iters 200000
```

# Eval Command

## DPLL (assumes training is complete)

```bash
python eval_sat.py --out_dir=model-ckpts/out-sat-dpll --prompts_file=data/SAT_DPLL/SAT_Dataset_DPLL_Balanced_Test.txt --num_samples=1000 --output_file=predictions/out-sat-dpll.txt
```

## CDCL (assumes training is complete)

```bash
python eval_sat.py --out_dir=model-ckpts/out-sat-cdcl --prompts_file=/home/lpan68/nanoGPT/data/SAT_CDCL/SAT_Dataset_CDCL_Balanced_Test.txt --num_samples=1000 --output_file=predictions/out-sat-cdcl.txt
```

# Results

## CDCL trace, 600000 epochs, Satisfiability Prediction, single-token difference, 20-layer
F1 Score: 0.8952380952380952
Accuracy: 0.89
Precision: 0.8545454545454545
Recall: 0.94

## CDCL trace, 600000 epochs, Satisfiability Prediction, single-token difference, 12-layer
F1 Score: 0.7716535433070867
Accuracy: 0.71
Precision: 0.6363636363636364
Recall: 0.98

## CDCL trace, 600000 epochs, Satisfiability Prediction, Random Formula, 12-layer
F1 Score: 1.0
Accuracy: 1.0
Precision: 1.0
Recall: 1.0

## DPLL trace, 20000 epochs, Satisfiability Prediction

F1 Score: 0.4444444444444445
Accuracy: 0.55
Precision: 0.5806451612903226
Recall: 0.36

Predictions: `predictions/out-sat-dpll.txt`

## CDCL trace, 20000 epochs, Satisfiability Prediction
F1 Score: 0.6969696969696971
Accuracy: 0.6
Precision: 0.5609756097560976
Recall: 0.92
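
For reference, a minimal sketch of how the accuracy/precision/recall/F1 numbers above could be computed from a predictions file. The assumed file format (one `true predicted` label pair per line, labels SAT/UNSAT) is hypothetical and may not match what `eval_sat.py` actually writes.

```python
# Hypothetical sketch: binary classification metrics from a predictions file.
# Assumes each line is "true_label predicted_label"; adjust parsing to the real format.
def binary_metrics(path, positive="SAT"):
    tp = fp = tn = fn = 0
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) < 2:
                continue
            true, pred = parts[0], parts[1]
            if pred == positive:
                tp += (true == positive)
                fp += (true != positive)
            else:
                fn += (true == positive)
                tn += (true != positive)
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    accuracy = (tp + tn) / max(tp + tn + fp + fn, 1)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

print(binary_metrics("predictions/out-sat-cdcl.txt"))
```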

# TODO

Current accuracy is very low (~55%) even though the model outputs seemingly valid traces. This may be because training computes the loss over the entire string, including the original problem, the execution trace, the final SAT result, and the padding tokens. This dilutes the importance of the trace and the satisfiability prediction, since they occupy only a small portion of the tokens. Therefore, as next steps, we can:

- Compute the loss only on the tokens of the trace and the satisfiability prediction instead of the whole string (this may require giving up mini-batching during training, making training less efficient); a sketch of the masking idea follows below.
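
A minimal sketch of that masked-loss idea, assuming each training sequence is the concatenation of problem, trace, SAT/UNSAT answer, and padding, and that a 0/1 mask marking the trace and answer positions is available. The tensor names and masking convention are illustrative, not the repo's actual training loop.

```python
# Hypothetical sketch: cross-entropy computed only on trace + answer tokens.
import torch
import torch.nn.functional as F

def masked_lm_loss(logits, targets, loss_mask):
    # logits: (B, T, vocab_size); targets: (B, T) token ids;
    # loss_mask: (B, T) with 1.0 on trace/answer tokens, 0.0 on problem/padding tokens.
    B, T, V = logits.shape
    per_token = F.cross_entropy(
        logits.view(B * T, V),
        targets.view(B * T),
        reduction="none",
    ).view(B, T)
    per_token = per_token * loss_mask                       # drop problem/padding positions
    return per_token.sum() / loss_mask.sum().clamp(min=1)   # average over kept tokens only
```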


# nanoGPT

117 changes: 0 additions & 117 deletions bench.py

This file was deleted.

7 changes: 7 additions & 0 deletions config/6_10_marginal.py
@@ -0,0 +1,7 @@
epochs = 10
block_size = 800
batch_size = 16
out_dir = 'models/6_10_marginal'
dataset = "data/6_10_marginal"
rope = True
decay_lr = True
7 changes: 7 additions & 0 deletions config/6_10_marginal_up.py
@@ -0,0 +1,7 @@
epochs = 10
block_size = 800
batch_size = 16
out_dir = 'models/6_10_marginal_up'
dataset = "data/6_10_marginal_up"
rope = True
decay_lr = True
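
These config files follow the nanoGPT convention: a plain Python file whose top-level assignments override the training script's defaults, with further command-line overrides (as in the training commands in the README). Below is a minimal sketch of that mechanism, assuming an exec-based configurator similar to nanoGPT's `configurator.py`; the loader actually used by `train_sat.py` may differ.

```python
# Illustrative sketch of a nanoGPT-style config loader (assumption: this repo
# uses something similar; the real loader and override syntax may differ).
import sys
from ast import literal_eval

# defaults normally defined in the training script (illustrative values)
batch_size = 16
block_size = 800
rope = False

for arg in sys.argv[1:]:
    if not arg.startswith("--"):
        # positional argument: a config file such as config/6_10_marginal.py
        exec(open(arg).read())          # its assignments override the defaults
    else:
        # --key=value overrides applied after the config file
        key, val = arg[2:].split("=", 1)
        try:
            val = literal_eval(val)     # e.g. "200000" -> int, "True" -> bool
        except (ValueError, SyntaxError):
            pass                        # keep as a plain string (e.g. paths)
        globals()[key] = val
```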
36 changes: 36 additions & 0 deletions config/train_ltl.py
@@ -0,0 +1,36 @@
# train a miniature GPT on the LTL dataset
# config adapted from nanoGPT's character-level shakespeare example

out_dir = 'model-ckpts/out-ltl'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'ltl-gpt'
wandb_run_name = 'ltl-gpt'

dataset = 'LTL'
gradient_accumulation_steps = 1
batch_size = 12
block_size = 523 # Note: longest sequence is 524, -1 for prediction

# baby GPT model for first experiments
n_layer = 12
n_head = 12
n_embd = 384
dropout = 0.2

max_iters = 100000
lr_decay_iters = 100000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

# on macbook also add
# device = 'cpu' # run on cpu only
compile = False # do not torch compile the model
36 changes: 36 additions & 0 deletions config/train_ltl_large.py
@@ -0,0 +1,36 @@
# train a miniature GPT on the LTL dataset
# config adapted from nanoGPT's character-level shakespeare example

out_dir = 'model-ckpts/out-ltl-large'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'ltl-gpt'
wandb_run_name = 'ltl-gpt-large'

dataset = 'LTL'
gradient_accumulation_steps = 1
batch_size = 12
block_size = 523 # Note: longest sequence is 524, -1 for prediction

# baby GPT model for first experiments
n_layer = 20
n_head = 12
n_embd = 384
dropout = 0.2

max_iters = 300000
lr_decay_iters = 10000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

# on macbook also add
# device = 'cpu' # run on cpu only
compile = False # do not torch compile the model
36 changes: 36 additions & 0 deletions config/train_ltl_large_600k.py
@@ -0,0 +1,36 @@
# train a miniature GPT on the LTL dataset
# config adapted from nanoGPT's character-level shakespeare example

out_dir = 'model-ckpts/out-ltl-large-600k'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'ltl-gpt'
wandb_run_name = 'ltl-gpt-large-600k'

dataset = 'LTL'
gradient_accumulation_steps = 1
batch_size = 12
block_size = 523 # Note: longest sequence is 524, -1 for prediction

# baby GPT model for first experiments
n_layer = 20
n_head = 12
n_embd = 384
dropout = 0.2

max_iters = 600000
lr_decay_iters = 600000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

# on macbook also add
# device = 'cpu' # run on cpu only
compile = False # do not torch compile the model
36 changes: 36 additions & 0 deletions config/train_sat.py
@@ -0,0 +1,36 @@
# train a miniature GPT on the SAT dataset
# config adapted from nanoGPT's character-level shakespeare example

out_dir = 'out-sat'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'sat-solve-gpt'
wandb_run_name = 'sat-gpt'

dataset = 'SAT'
gradient_accumulation_steps = 1
batch_size = 12
block_size = 1289 # Note: longest sequence is 1290, -1 for prediction

# baby GPT model for first experiments
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

max_iters = 10000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

# on macbook also add
# device = 'cpu' # run on cpu only
# compile = False # do not torch compile the model