diff --git a/.gitignore b/.gitignore index f8311a7b06..4e8d4eb6d1 100644 --- a/.gitignore +++ b/.gitignore @@ -10,5 +10,4 @@ __pycache__/ input.txt env/ venv/ -coord_check/*/out/* -mutransfer_lr/*/out/* \ No newline at end of file +mup_examples/*/*/out/* \ No newline at end of file diff --git a/mup_examples/coord_check_shakespeare_char/mup/run.sh b/mup_examples/coord_check_shakespeare_char/mup/run.sh index 6abf7a5fc8..cb3d9d6071 100644 --- a/mup_examples/coord_check_shakespeare_char/mup/run.sh +++ b/mup_examples/coord_check_shakespeare_char/mup/run.sh @@ -6,7 +6,7 @@ do n_heads=$((width / head_size)) mup_base_width=256 mup_width_multiplier=$(echo "scale=8; $width/$mup_base_width" | bc -l) - out_dir="coord_check/mup/out/width${width}_depth2_seed${seed}" + out_dir="mup_examples/coord_check_shakespeare_char/mup/out/width${width}_depth2_seed${seed}" python train.py \ --out_dir=$out_dir \ --eval_interval=1 \ diff --git a/mup_examples/coord_check_shakespeare_char/sp/run.sh b/mup_examples/coord_check_shakespeare_char/sp/run.sh index 500a8adce3..48f528feba 100644 --- a/mup_examples/coord_check_shakespeare_char/sp/run.sh +++ b/mup_examples/coord_check_shakespeare_char/sp/run.sh @@ -4,7 +4,7 @@ do do head_size=64 n_heads=$((width / head_size)) - out_dir="coord_check/sp/out/width${width}_depth2_seed${seed}" + out_dir="mup_examples/coord_check_shakespeare_char/sp/out/width${width}_depth2_seed${seed}" python train.py \ --out_dir=$out_dir \ --eval_interval=1 \ diff --git a/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init/run.sh b/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init/run.sh index 80f0bb0b8d..c3538a1df5 100644 --- a/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init/run.sh +++ b/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init/run.sh @@ -6,7 +6,7 @@ do n_heads=$((width / head_size)) mup_base_width=256 mup_width_multiplier=$(echo "scale=8; $width/$mup_base_width" | bc -l) - 
out_dir="coord_check/sp_with_mup_hidden_init/out/width${width}_depth2_seed${seed}" + out_dir="mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init/out/width${width}_depth2_seed${seed}" python train.py \ --out_dir=$out_dir \ --eval_interval=1 \ diff --git a/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr/run.sh b/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr/run.sh index 1df04be46d..e297faf739 100644 --- a/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr/run.sh +++ b/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr/run.sh @@ -6,7 +6,7 @@ do n_heads=$((width / head_size)) mup_base_width=256 mup_width_multiplier=$(echo "scale=8; $width/$mup_base_width" | bc -l) - out_dir="coord_check/sp_with_mup_hidden_init_and_lr/out/width${width}_depth2_seed${seed}" + out_dir="mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr/out/width${width}_depth2_seed${seed}" python train.py \ --out_dir=$out_dir \ --eval_interval=1 \ diff --git a/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr_output_logits/run.sh b/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr_output_logits/run.sh index 875cc8211d..9e2cf41c5a 100644 --- a/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr_output_logits/run.sh +++ b/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr_output_logits/run.sh @@ -6,7 +6,7 @@ do n_heads=$((width / head_size)) mup_base_width=256 mup_width_multiplier=$(echo "scale=8; $width/$mup_base_width" | bc -l) - out_dir="coord_check/sp_with_mup_hidden_init_and_lr_output_logits/out/width${width}_depth2_seed${seed}" + out_dir="mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr_output_logits/out/width${width}_depth2_seed${seed}" python train.py \ --out_dir=$out_dir \ --eval_interval=1 \ diff --git 
a/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr_partial_output_logits/run.sh b/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr_partial_output_logits/run.sh index fe7e75b5b7..ef29b9bd24 100644 --- a/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr_partial_output_logits/run.sh +++ b/mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr_partial_output_logits/run.sh @@ -6,7 +6,7 @@ do n_heads=$((width / head_size)) mup_base_width=256 mup_width_multiplier=$(echo "scale=8; $width/$mup_base_width" | bc -l) - out_dir="coord_check/sp_with_mup_hidden_init_and_lr_partial_output_logits/out/width${width}_depth2_seed${seed}" + out_dir="mup_examples/coord_check_shakespeare_char/sp_with_mup_hidden_init_and_lr_partial_output_logits/out/width${width}_depth2_seed${seed}" mup_output_alpha=$(echo "scale=8; sqrt($mup_width_multiplier)" | bc -l) python train.py \ --out_dir=$out_dir \ diff --git a/mup_examples/mutransfer_lr_shakespeare_char/mup/run.sh b/mup_examples/mutransfer_lr_shakespeare_char/mup/run.sh index d7c5046932..af3c272be0 100644 --- a/mup_examples/mutransfer_lr_shakespeare_char/mup/run.sh +++ b/mup_examples/mutransfer_lr_shakespeare_char/mup/run.sh @@ -9,7 +9,7 @@ LAYERS=2 for width in 256 512 1024 2048 do - for lr in 0.0009765625 0.00048828125 0.000244140625 0.0001220703125 0.00006103515625 + for lr in 0.125 0.0625 0.03125 0.015625 0.0078125 0.00390625 0.001953125 0.0009765625 0.00048828125 0.000244140625 0.0001220703125 0.00006103515625 do for seed in 1 2 3 do diff --git a/mup_examples/mutransfer_lr_shakespeare_char/plot.ipynb b/mup_examples/mutransfer_lr_shakespeare_char/plot.ipynb index 6ae7c6d8bd..615502d09e 100644 --- a/mup_examples/mutransfer_lr_shakespeare_char/plot.ipynb +++ b/mup_examples/mutransfer_lr_shakespeare_char/plot.ipynb @@ -226,6 +226,9 @@ " 0.0001220703125,\n", " 0.00006103515625,\n", " 0.00003051757812,\n", + " 0.00001525878906,\n", + " 0.000007629394531,\n", 
+ " 0.000003814697266,\n", "]\n", "class MplColorHelper:\n", "\n", @@ -240,10 +243,10 @@ "\n", "\n", "color_helper = MplColorHelper('viridis', 0, len(widths)-1)\n", - "n_cols = 2\n", + "n_cols = len(parameterizations)\n", "n_rows = 1\n", - "fig, axes = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 3*n_rows))\n", - "layers=2\n", + "fig, axes = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 3.33*n_rows))\n", + "layers=2\n", "\n", "for parameterization_idx, (parameterization, parameterization_str) in enumerate(parameterizations):\n", " ax = axes[parameterization_idx]\n", diff --git a/mup_examples/mutransfer_lr_shakespeare_char/sp/run.sh b/mup_examples/mutransfer_lr_shakespeare_char/sp/run.sh index 6c1afd58cf..74b664c47a 100644 --- a/mup_examples/mutransfer_lr_shakespeare_char/sp/run.sh +++ b/mup_examples/mutransfer_lr_shakespeare_char/sp/run.sh @@ -7,9 +7,9 @@ LAUNCHER=python LAYERS=2 -for width in 1024 2048 +for width in 256 512 1024 2048 do - for lr in 0.00390625 0.001953125 0.0009765625 0.00048828125 0.000244140625 0.0001220703125 0.00006103515625 0.00003051757812 + for lr in 0.00390625 0.001953125 0.0009765625 0.00048828125 0.000244140625 0.0001220703125 0.00006103515625 0.00003051757812 0.00001525878906 0.000007629394531 0.000003814697266 do for seed in 1 2 3 do