From a5b7236b269da58276c819098015169d24c3e6d9 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 12 Sep 2024 10:53:52 +0000 Subject: [PATCH] add better instruction --- open_diloco/run_training.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open_diloco/run_training.sh b/open_diloco/run_training.sh index f6e30a0..b0871d3 100755 --- a/open_diloco/run_training.sh +++ b/open_diloco/run_training.sh @@ -8,7 +8,7 @@ # you can either pass a fixed initial peer or set it to auto and the script will start a dht server for you -## ./run_training.sh 4 1 auto --per-device-train-batch-size 8 --total-batch-size 128 --lr 1e-2 --path-model ../tests/models/llama-2m-fresh --project debug --no-torch-compile --hv.local-steps 100 --fake-data --hv.matchmaking_time 2 +## ./run_training.sh 4 1 auto --per-device-train-batch-size 8 --total-batch-size 128 --lr 1e-2 --path-model ../tests/models/llama-2m-fresh --project debug --no-torch-compile --hv.local-steps 100 --fake-data --hv.matchmaking_time 2 --hv.fail_rank_drop --hv.skip_load_from_peers # Function to get CUDA devices based on the number of GPUs and index function get_cuda_devices() {