Skip to content

feat(checkpoint): support universal checkpoint #1246

feat(checkpoint): support universal checkpoint

feat(checkpoint): support universal checkpoint #1246

Workflow file for this run

# End-to-end training tests, triggered by pull requests that target `develop`.
name: e2e-tests
on:
  pull_request:
    branches:
      - 'develop'
    # Pull requests whose changes are limited to these paths do not trigger
    # the workflow.
    paths-ignore:
      - 'doc/**'
      - '**.md'
env:
  # Kept as a literal string: workflow-level `env` values are not passed
  # through a shell. The command substitution is only evaluated later, when a
  # `run` script splices this value in through the `env` expression context.
  WORKSPACE_PREFIX: "$(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)"
  # Partition handed to `srun -p` by the Slurm-based jobs.
  SLURM_PARTITION: llm_s
# Every job runs a pytest marker from ./tests/test_training/test_loss.py.
#
# Two launch styles are used:
#   * `t_cluster` jobs submit the tests through Slurm (`srun`) and then hand
#     srun's exit status to ci_scripts/common/check_slurm_cancled.sh.
#   * `910B` jobs build a `torchrun` command line and pass it to
#     ../910B_sco.sh, which sits one directory above the working directory
#     (not part of this checkout — presumably provisioned on the runner).
#
# Exit-status handling in the Slurm jobs: no `shell:` is set, so GitHub runs
# each script with `bash -e`. A bare `srun ...` followed by `exit_code=$?`
# would abort the script on failure before the status is captured, and the
# check script would never see a non-zero code. `srun ... || exit_code=$?`
# keeps the script alive; the step result is then the check script's status.
#
# `$path_prefix` and `${evo_env_torch21_flash2}` are not defined in this
# workflow — presumably exported by the runner environment; TODO confirm.
jobs:
  training_4GPU:
    runs-on: [t_cluster]
    timeout-minutes: 15
    steps:
      # Register path prefixes as masked values so they are hidden in logs.
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_4GPU
        run: |
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          exit_code=0
          srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname \
            -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 \
            pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py \
            || exit_code=$?
          sh ./ci_scripts/common/check_slurm_cancled.sh "$exit_code" "$jobname"

  training_8GPU_ISP:
    runs-on: [t_cluster]
    timeout-minutes: 10
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_8GPU_ISP
        run: |
          source activate ${evo_env_torch21_flash2}
          jobname=ISP-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          exit_code=0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname \
            -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 \
            pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py \
            || exit_code=$?
          sh ./ci_scripts/common/check_slurm_cancled.sh "$exit_code" "$jobname"

  training_8GPU_ISP_CKPT:
    runs-on: [t_cluster]
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      # Two Slurm submissions in sequence: the SAVE_CKPT marker first, then the
      # LOAD_CKPT marker. Because of `bash -e`, the second submission only
      # starts if the check script accepted the outcome of the first.
      - name: training_8GPU_ISP_CKPT
        run: |
          source activate ${evo_env_torch21_flash2}
          jobname=ISP_CKPT-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          exit_code=0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname \
            -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 \
            pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py \
            || exit_code=$?
          sh ./ci_scripts/common/check_slurm_cancled.sh "$exit_code" "$jobname"
          jobname=LOAD-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          exit_code=0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname \
            -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 \
            pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py \
            || exit_code=$?
          sh ./ci_scripts/common/check_slurm_cancled.sh "$exit_code" "$jobname"

  training_8GPU_4DP2TP:
    strategy:
      matrix:
        runner: ["910B"]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      # On the 910B runner the working tree is force-cleaned before checkout
      # (`-ff` also removes nested git directories, `-x` removes ignored
      # files). NOTE(review): `sudo` suggests earlier runs leave files the
      # runner user cannot delete — confirm.
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ "${{ matrix.runner }}" == "910B" ]]; then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2TP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh "$jobname" "$start_command"

  training_8GPU_4DP2TPSP:
    strategy:
      matrix:
        runner: ["910B"]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ "${{ matrix.runner }}" == "910B" ]]; then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2TPSP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh "$jobname" "$start_command"

  training_8GPU_4DP2PP:
    strategy:
      matrix:
        runner: ["910B"]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ "${{ matrix.runner }}" == "910B" ]]; then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2PP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh "$jobname" "$start_command"

  training_8GPU_4DP2PP_ZB:
    runs-on: [t_cluster]
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2PP_ZB
        run: |
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          exit_code=0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname \
            -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 \
            pytest -s -v --color=yes -m "training_8GPU_4DP2PP_ZB" ./tests/test_training/test_loss.py \
            || exit_code=$?
          sh ./ci_scripts/common/check_slurm_cancled.sh "$exit_code" "$jobname"

  # The three 16-device jobs below launch torchrun with --nnodes=2.
  # $MASTER_ADDR, $MASTER_PORT and $RANK sit inside single quotes on purpose:
  # they are not expanded in this step and reach 910B_sco.sh verbatim.
  # NOTE(review): the extra arguments `2 "AllReduce"` look like a node count
  # and a launch mode for 910B_sco.sh — confirm against that script.
  training_16GPU_4DP2TP2PP_MTP:
    strategy:
      matrix:
        runner: ["910B"]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ "${{ matrix.runner }}" == "910B" ]]; then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_16GPU_4DP2TP2PP_MTP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh "$jobname" "$start_command" 2 "AllReduce"

  training_16GPU_4DP2TP2PP_MSP:
    strategy:
      matrix:
        runner: ["910B"]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ "${{ matrix.runner }}" == "910B" ]]; then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_16GPU_4DP2TP2PP_MSP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh "$jobname" "$start_command" 2 "AllReduce"

  training_16GPU_4DP2TP2PP_FSP:
    strategy:
      matrix:
        runner: ["910B"]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ "${{ matrix.runner }}" == "910B" ]]; then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_16GPU_4DP2TP2PP_FSP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh "$jobname" "$start_command" 2 "AllReduce"

  training_llama2:
    strategy:
      matrix:
        runner: ["910B"]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ "${{ matrix.runner }}" == "910B" ]]; then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      # Guarded like the other 910B steps so that adding a second runner to
      # the matrix cannot start the 910B launcher on it.
      - name: training_llama2_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh "$jobname" "$start_command"

  training_internlm2:
    strategy:
      matrix:
        runner: ["910B"]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ "${{ matrix.runner }}" == "910B" ]]; then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      # Guarded like the other 910B steps so that adding a second runner to
      # the matrix cannot start the 910B launcher on it.
      - name: training_internlm2_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh "$jobname" "$start_command"