Skip to content

feat(checkpoint): support universal checkpoint #1242

feat(checkpoint): support universal checkpoint

feat(checkpoint): support universal checkpoint #1242

Workflow file for this run

name: demo-in-readme
on:
pull_request:
branches:
- "main"
- "develop"
paths-ignore:
- "docs/**"
- "**.md"
env:
WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
SLURM_PARTITION: llm_s
jobs:
dataset-preparation:
runs-on: [t_cluster]
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
- name: raw-chinese-data
run: |
source activate ${evo_env_torch21_flash2}
sh ./ci_scripts/data/tokenizer_chinese.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
- name: alpaca-data
run: |
source activate ${evo_env_torch21_flash2}
sh ./ci_scripts/data/tokenizer_alpaca.sh
train:
runs-on: [t_cluster]
timeout-minutes: 30
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
- name: slurm-train
id: basic_train
run: |
source activate ${evo_env_torch21_flash2}
sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
- name: load_preset_ckpt
if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
run: |
source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
- name: load_new_ckpt
run: |
source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts
- name: torchrun-train
run: |
source activate ${evo_env_torch21_flash2}
sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts
convert-model-then-load:
runs-on: [t_cluster]
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
- name: convert-model-then-load
run: |
source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/model/convert_to_hf.sh
cd ./hf_ckpt
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
exit_code=$?
cd ..
rm -rf $GITHUB_WORKSPACE/hf_ckpt
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname