adds basic smoketests for main_ds and data_process CLI args
During development it's convenient to be able to run full distributed
training, even on a smaller dataset, just to make sure that nothing
obviously fails. This also exercises Flash Attention support on the
machine the tests run on, as well as support for Granite models.

Signed-off-by: James Kunstle <[email protected]>
JamesKunstle committed Oct 17, 2024
1 parent e680bd8 commit b637955
Showing 2 changed files with 162 additions and 0 deletions.
17 changes: 17 additions & 0 deletions tests/README.md
## Overview

`smoketest.sh` cd's into the source directory and runs the script entrypoints for `main_ds.py` and `data_process.py`. The tests will break if file names or locations in the source tree change.

The existing tests are "smoke tests," meant only to demonstrate whether training runs to completion (returns 0). This is helpful for checking that all required dependencies are installed.

The tests enable features incrementally (see the flag sketch after this list):

1. No Flash Attention or Granite
2. No Granite but Flash Attention enabled
3. Granite and Flash Attention enabled
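
For reference, the variants map onto `main_ds.py` flags as follows. This is a sketch based on `smoketest.sh`; only the relevant flags are shown, and `...` stands for the common arguments:

```bash
# 1. no Flash Attention, no Granite
torchrun ... main_ds.py ... --disable_flash_attn

# 2. Flash Attention enabled, no Granite: neither flag is passed
torchrun ... main_ds.py ...

# 3. Granite and Flash Attention enabled
torchrun ... main_ds.py ... --is_granite
```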

## Usage

The testing script can be run without parameters as `./smoketest.sh`. By default, this runs all tests with `FSDP` as the distributed training backend. To use the other available backend, `DeepSpeed`, run `./smoketest.sh deepspeed`.

> NOTE: You'll need to install the training library to run the tests. Inside a virtual environment, from the repo root, run `pip3 install -e .` to install the package in editable mode.
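
For example (the virtual-environment setup below is illustrative; any venv tool works):

```bash
python3 -m venv venv && source venv/bin/activate  # illustrative venv setup
pip3 install -e .                                 # from the repo root

cd tests
./smoketest.sh            # all tests, FSDP backend (default)
./smoketest.sh deepspeed  # all tests, DeepSpeed backend
```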
145 changes: 145 additions & 0 deletions tests/smoketest.sh
#!/usr/bin/env bash
set -eux
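# (-e: exit on first error; -u: treat unset variables as errors; -x: trace each command)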

# ############### User-modifiable parameters ###############
# Change these as needed
NUM_GPUS=8
MAX_BATCH_LEN=60000
NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size.

# ############### Read-only parameters ###############
MODEL_NAME="instructlab/granite-7b-lab"
# gets directory of current file.
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
CORRECT_WORKING_DIR="$SCRIPT_DIR/../src/instructlab/training/"
SAMPLE_DATA_PATH="$SCRIPT_DIR/../sample-data/train_all_pruned_SDG.jsonl"
TMP_DIR=$(mktemp -d)
CHECKPOINTS_DIR="$TMP_DIR/checkpoints"
DATA_DIR="$TMP_DIR/data"
COMPUTED_DATA_PATH="$DATA_DIR/data.jsonl"
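# Distributed backend: "fsdp" (default) or "deepspeed", chosen by the first positional argument.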
DEFAULT_DISTRIB_FRAMEWORK='fsdp'
DISTRIB_FRAMEWORK="${1:-$DEFAULT_DISTRIB_FRAMEWORK}" # defaults to FSDP

# ############### Test Functions ###############

function setup_tmpdir () {
mkdir "$CHECKPOINTS_DIR"
mkdir "$DATA_DIR"
}

function prepare_data () {
# Preprocesses the .jsonl messages data so that it's valid
# input to the model (inputs tokenized, formatted with mask, etc.).
# The data is then trimmed to a fixed number of samples so that
# training runs faster.

python3 data_process.py \
--data_path="$SAMPLE_DATA_PATH" \
--data_output_path="$DATA_DIR" \
--max_seq_len=4096 \
--model_name_or_path="$MODEL_NAME"

# trim data so we only keep the first 'n' samples.
# should be enough data for training to be meaningful but not enough
# that training takes a large amount of time.
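# NOTE: the command substitution below is fully evaluated before the
# redirection truncates the file, so reading and rewriting the same file is safe.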
echo "$(head -"$NUM_SAMPLES_TRAINED_ON" "$COMPUTED_DATA_PATH")" > "$COMPUTED_DATA_PATH"

echo "TRAINING ON $(wc -l "$COMPUTED_DATA_PATH") SAMPLES"
}

function _cleanup_saved_checkpoints() {
echo "CLEARING CHECKPOINTS: $CHECKPOINTS_DIR"
rm -rf "$CHECKPOINTS_DIR"
mkdir "$CHECKPOINTS_DIR"
}

function test_standard_loop () {
# Tests most common training parameters.
# - FSDP (or DeepSpeed, per DISTRIB_FRAMEWORK)
# - Save full state
# - Save hf_format
# - padding-free
# - flash attention

torchrun \
--standalone \
--nproc_per_node="$NUM_GPUS" \
main_ds.py \
--model_name_or_path="$MODEL_NAME" \
--data_path="$COMPUTED_DATA_PATH" \
--output_dir="$CHECKPOINTS_DIR" \
--num_epochs=1 \
--effective_batch_size=128 \
--save_samples=0 \
--checkpoint_at_epoch \
--accelerate_full_state_at_epoch \
--distributed_training_framework="$DISTRIB_FRAMEWORK" \
--max_batch_len="$MAX_BATCH_LEN" \
--is_granite
}

function test_standard_loop_nongranite () {
# Tests most common training parameters without
# using Granite
# - FSDP (or DeepSpeed, per DISTRIB_FRAMEWORK)
# - Save full state
# - Save hf_format
# - flash attention

torchrun \
--standalone \
--nproc_per_node="$NUM_GPUS" \
main_ds.py \
--model_name_or_path="$MODEL_NAME" \
--data_path="$COMPUTED_DATA_PATH" \
--output_dir="$CHECKPOINTS_DIR" \
--num_epochs=1 \
--effective_batch_size=128 \
--save_samples=0 \
--checkpoint_at_epoch \
--accelerate_full_state_at_epoch \
--distributed_training_framework="$DISTRIB_FRAMEWORK" \
--max_batch_len="$MAX_BATCH_LEN"
# --is_granite
}

function test_standard_loop_noflashattention_nogranite () {
# Tests most common training parameters without
# using Granite or Flash Attention
# - FSDP (or DeepSpeed, per DISTRIB_FRAMEWORK)
# - Save full state
# - Save hf_format

torchrun \
--standalone \
--nproc_per_node="$NUM_GPUS" \
main_ds.py \
--model_name_or_path="$MODEL_NAME" \
--data_path="$COMPUTED_DATA_PATH" \
--output_dir="$CHECKPOINTS_DIR" \
--num_epochs=1 \
--effective_batch_size=128 \
--save_samples=0 \
--checkpoint_at_epoch \
--accelerate_full_state_at_epoch \
--distributed_training_framework="$DISTRIB_FRAMEWORK" \
--max_batch_len="$MAX_BATCH_LEN" \
--disable_flash_attn
# --is_granite
}

# ############### Setup and tests ###############
setup_tmpdir
trap 'rm -rf "$TMP_DIR"' EXIT

# NOTE (jkunstle): the script is run as though it's
# in the same source dir as main_ds.py and data_process.py.
cd "$CORRECT_WORKING_DIR"
echo "CURRENT WORKING DIRECTORY: $(pwd)"

prepare_data
test_standard_loop_noflashattention_nogranite
_cleanup_saved_checkpoints
test_standard_loop_nongranite
_cleanup_saved_checkpoints
test_standard_loop
