From aeed2cd400d4f10f0cfd49f5c3a7589a38e31a6b Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Tue, 8 Oct 2024 21:04:20 +0000 Subject: [PATCH 01/30] rocm docker and scripts --- Dockerfile_amd | 85 ++++++++++++ run_examples.sh | 7 + train_llama.sh | 336 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 428 insertions(+) create mode 100644 Dockerfile_amd create mode 100755 run_examples.sh create mode 100644 train_llama.sh diff --git a/Dockerfile_amd b/Dockerfile_amd new file mode 100644 index 0000000000..a449c908c4 --- /dev/null +++ b/Dockerfile_amd @@ -0,0 +1,85 @@ +#ARG BASE_DOCKER=rocm/pytorch:latest +#ARG BASE_DOCKER=rocm/pytorch-private:exec_dashboard_nightly +ARG BASE_DOCKER=rocm/pytorch-private:20240920_exec_dashboard_nightly +FROM $BASE_DOCKER +ENV WORKSPACE_DIR=/workspace +ENV STAGE_DIR=/workspace/installs +RUN mkdir -p $WORKSPACE_DIR +RUN mkdir -p ${STAGE_DIR} +WORKDIR $WORKSPACE_DIR + + +RUN pip3 install \ +numpy==1.26.4 \ +scipy \ +einops \ +flask-restful \ +nltk \ +pytest \ +pytest-cov \ +pytest_mock \ +pytest-random-order \ +sentencepiece \ +wrapt \ +zarr \ +wandb \ +tensorstore==0.1.45 \ +pytest_mock \ +pybind11 \ +wrapt \ +setuptools==69.5.1 \ +datasets + +RUN pip3 install "huggingface_hub[cli]" +RUN python3 -m nltk.downloader punkt_tab + + +############################################################################## +# Apex +############################################################################## +#RUN git clone https://github.com/ROCm/apex.git ${STAGE_DIR}/apex +#WORKDIR ${STAGE_DIR}/apex +#RUN python3 setup.py install --cpp_ext --cuda_ext +#WORKDIR $WORKSPACE_DIR +#RUN rm -rf ${STAGE_DIR}/apex + + +# Install Causal-Conv1d and its dependencies +WORKDIR ${STAGE_DIR} +ENV CAUSAL_CONV1D_FORCE_BUILD=TRUE +ENV MAMBA_FORCE_BUILD=TRUE +ENV HIP_ARCHITECTURES="gfx942" +RUN git clone https://github.com/Dao-AILab/causal-conv1d causal-conv1d &&\ + cd causal-conv1d &&\ + git show --oneline -s &&\ + pip install . + +# Install mamba +WORKDIR ${STAGE_DIR} +RUN git clone https://github.com/state-spaces/mamba mamba &&\ + cd mamba &&\ + git show --oneline -s &&\ + pip install --no-build-isolation . + +# Clone TE repo and submodules +WORKDIR ${STAGE_DIR} +ENV NVTE_FRAMEWORK=pytorch +ENV PYTORCH_ROCM_ARCH=gfx942 +ENV NVTE_USE_HIPBLASLT=1 +RUN git clone --recursive https://github.com/ROCmSoftwarePlatform/TransformerEngine-private.git &&\ + cd TransformerEngine-private &&\ + pip install . + +WORKDIR $WORKSPACE_DIR +RUN git clone https://github.com/ROCm/Megatron-LM.git Megatron-LM &&\ + cd Megatron-LM &&\ + git checkout rocm_megatron_lm_upstream &&\ + pip install -e . 
+ + +WORKDIR $WORKSPACE_DIR/Megatron-LM + +# record configuration for posterity +RUN pip list + +RUN cd $WORKSPACE_DIR diff --git a/run_examples.sh b/run_examples.sh new file mode 100755 index 0000000000..078fb21905 --- /dev/null +++ b/run_examples.sh @@ -0,0 +1,7 @@ +bash ../scripts/megatron/debug_train_70b_custom.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_en_fused" + + +#bash ../scripts/megatron/debug_train_70b_custom.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=0 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_disabled" + + +#bash ../scripts/megatron/debug_train_70b_custom.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=1 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_en_fused_profile" diff --git a/train_llama.sh b/train_llama.sh new file mode 100644 index 0000000000..3c3bae1fa9 --- /dev/null +++ b/train_llama.sh @@ -0,0 +1,336 @@ +#!/bin/bash + +# set -x + +export GPU_MAX_HW_QUEUES=2 +export TORCH_NCCL_HIGH_PRIORITY=1 + + +# parsing input arguments +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + export "$KEY"="$VALUE" +done + +# Change for multinode config +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +TEE_OUTPUT="${TEE_OUTPUT:-1}" +NO_TORCH_COMPILE="${NO_TORCH_COMPILE:-0}" +USE_FLASH_ATTN="${USE_FLASH_ATTN:-1}" +NO_TRAINING="${NO_TRAINING:-0}" # NO_TRAINING=1: for computing metrics only +ENABLE_PROFILING="${ENABLE_PROFILING:-0}" +ENABLE_ROPE="${ENABLE_ROPE:-1}" +ENABLE_MOCK_DATA="${ENABLE_MOCK_DATA:-1}" +DUMMY_RUN="${DUMMY_RUN:-0}" +ADD_TASK="${ADD_TASK:-0}" +LABEL="${LABEL:-"test"}" +LOG_DIR="profile/${LABEL}" +echo "NO_TRAINING=$NO_TRAINING" + +CWD=`pwd` +GPUS_PER_NODE=`python -c "import torch; print(torch.cuda.device_count())"` + + +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=23731 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +MODEL_SIZE="${MODEL_SIZE:-70}" +TP="${TP:-8}" +PP="${PP:-1}" +MBS="${MBS:-2}" +BS="${BS:-8}" +SEQ_LENGTH="${SEQ_LENGTH:-4096}" +TOTAL_ITERS="${TOTAL_ITERS:-4}" +SEQ_PARALLEL="${SEQ_PARALLEL:-1}" +CONTI_PARAMS="${CONTI_PARAMS:-0}" +OPTIMIZER="${OPTIMIZER:-sgd}" +TE_FP16="${TE_FP16:-1}" + + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +EXPERIMENT_DIR="experiment" +mkdir -p $EXPERIMENT_DIR + +CHECKPOINT_PATH=$EXPERIMENT_DIR/ckpts +rm -rf $CHECKPOINT_PATH +mkdir -p $CHECKPOINT_PATH +DATA_DIR=$EXPERIMENT_DIR/data +mkdir -p $DATA_DIR + +TOKENIZER_MODEL=$EXPERIMENT_DIR/tokenizer.model + +# Download the tokenizer model +if ! [ -f "$TOKENIZER_MODEL" ]; then +wget -O $TOKENIZER_MODEL https://huggingface.co/NousResearch/Llama-2-7b-chat-hf/resolve/main/tokenizer.model +fi + +# Prepare the dataset +echo 'import argparse +from pathlib import Path +from datasets import load_dataset + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--out-dir", type=str, required=False, default="tmp/data", + help="Path to output JSON") + args = parser.parse_args() + out_dir = Path(args.out_dir) + out_dir.mkdir(exist_ok=True, parents=True) + + dataset = load_dataset("bookcorpus", split="train") + dataset.to_json(out_dir / "bookcorpus_megatron.json")' > prepare_bookcorpus_megatron_dataset.py + +DATA_PATH=${DATA_DIR}/bookcorpus_text_sentence + + if ! [ -f "${DATA_DIR}/bookcorpus_text_sentence.idx" ]; then + echo "Dataset file does not exist, creating..." 
+ python3 prepare_bookcorpus_megatron_dataset.py --out-dir ${DATA_DIR} + python3 tools/preprocess_data.py --input ${DATA_DIR}/bookcorpus_megatron.json --tokenizer-type GPTSentencePieceTokenizer --tokenizer-model ${EXPERIMENT_DIR}/tokenizer.model --output-prefix ${DATA_DIR}/bookcorpus --workers `nproc` --split-sentences + python3 tools/preprocess_data.py --input ${DATA_DIR}/bookcorpus_megatron.json --tokenizer-type GPTSentencePieceTokenizer --tokenizer-model ${EXPERIMENT_DIR}/tokenizer.model --output-prefix ${DATA_DIR}/bookcorpus --workers `nproc` --split-sentences + else + echo "Dataset file already exist." + fi + + +MAX_POSITION_EMBEDDINGS=32768 + +if [ "$TE_FP16" -eq 1 ]; then + TRAIN_LOG="${EXPERIMENT_DIR}/train_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_seq${SEQ_LENGTH}_optim_${OPTIMIZER}_nocompile${NO_TORCH_COMPILE}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_contiparam_${CONTI_PARAMS}_TE_FP16_${LABEL}.log" +else + TRAIN_LOG="${EXPERIMENT_DIR}/train_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_seq${SEQ_LENGTH}_optim_${OPTIMIZER}_nocompile${NO_TORCH_COMPILE}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_contiparam_${CONTI_PARAMS}_${LABEL}.log" +fi + +echo $TRAIN_LOG + +if [[ $MODEL_SIZE -eq 7 ]]; then + HIDDEN_SIZE=4096 # e.g. llama-13b: 5120 + FFN_HIDDEN_SIZE=11008 # e.g. llama-13b: 13824 + NUM_LAYERS=32 # e.g. llama-13b: 40 + NUM_HEADS=32 # e.g. llama-13b: 40 + SEQ_LENGTH=$SEQ_LENGTH + MAX_POSITION_EMBEDDINGS=$MAX_POSITION_EMBEDDINGS + NUM_KV_HEADS=32 # llama2 70B uses GQA +elif [[ $MODEL_SIZE -eq 13 ]]; then + HIDDEN_SIZE=5120 # e.g. llama-13b: 5120 + FFN_HIDDEN_SIZE=13824 # e.g. llama-13b: 13824 + NUM_LAYERS=40 # e.g. llama-13b: 40 + NUM_HEADS=40 # e.g. llama-13b: 40 + SEQ_LENGTH=$SEQ_LENGTH + MAX_POSITION_EMBEDDINGS=$MAX_POSITION_EMBEDDINGS + NUM_KV_HEADS=40 # llama2 70B uses GQA +elif [[ $MODEL_SIZE -eq 20 ]]; then + HIDDEN_SIZE=8192 # e.g. llama-13b: 5120 + FFN_HIDDEN_SIZE=28672 # e.g. llama-13b: 13824 + NUM_LAYERS=20 # e.g. llama-13b: 40 + NUM_HEADS=64 # e.g. llama-13b: 40 + NUM_KV_HEADS=8 # llama2 70B uses GQA + SEQ_LENGTH=$SEQ_LENGTH + MAX_POSITION_EMBEDDINGS=$MAX_POSITION_EMBEDDINGS +elif [[ $MODEL_SIZE -eq 70 ]]; then + HIDDEN_SIZE=8192 # e.g. llama-13b: 5120 + FFN_HIDDEN_SIZE=28672 # e.g. llama-13b: 13824 + NUM_LAYERS=80 # e.g. llama-13b: 40 + NUM_HEADS=64 # e.g. llama-13b: 40 + NUM_KV_HEADS=8 # llama2 70B uses GQA + SEQ_LENGTH=$SEQ_LENGTH + MAX_POSITION_EMBEDDINGS=$MAX_POSITION_EMBEDDINGS +else + echo "Model size not supported." 
+ exit 1 +fi + +GROUP_SIZE=$(( ${NUM_HEADS} / ${NUM_KV_HEADS} )) +NUM_GROUPS=$(( ${NUM_HEADS} / ${GROUP_SIZE} )) + + +PROFILING_DIR="${EXPERIMENT_DIR}/perf_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_seq${SEQ_LENGTH}_optim_${OPTIMIZER}_nocompile${NO_TORCH_COMPILE}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_contiparam_${CONTI_PARAMS}" + + +GPT_ARGS=" + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --num-layers $NUM_LAYERS \ + --hidden-size $HIDDEN_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --num-attention-heads $NUM_HEADS \ + --seq-length $SEQ_LENGTH \ + --max-position-embeddings $MAX_POSITION_EMBEDDINGS \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --swiglu \ + --init-method-std 0.02 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --normalization RMSNorm \ + --micro-batch-size $MBS \ + --global-batch-size $BS \ + --lr 3.0e-4 \ + --train-iters $TOTAL_ITERS \ + --lr-decay-style cosine \ + --min-lr 3.0e-5 \ + --weight-decay 1e-1 \ + --lr-warmup-fraction .01 \ + --optimizer $OPTIMIZER \ + --no-async-tensor-model-parallel-allreduce \ + --clip-grad 1.0 \ + --bf16 \ + --no-masked-softmax-fusion \ + --overlap-grad-reduce \ +" + # --no-masked-softmax-fusion \ + +DATA_ARGS=" + --tokenizer-type Llama2Tokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --split 949,50,1 \ +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 1000 \ + --log-throughput \ + --no-save-optim \ + --eval-iters -1 +" + + # --save-interval $TOTAL_ITERS \ + # --eval-interval $TOTAL_ITERS \ + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +EXTRA_ARGS=" + --group-query-attention \ + --num-query-groups $NUM_GROUPS \ + --no-gradient-accumulation-fusion \ + --distributed-backend nccl \ + --distributed-timeout-minutes 30 +" + +if [ "$ENABLE_PROFILING" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --profile --use-pytorch-profiler --tensorboard-dir $LOG_DIR" +fi + +if [ "$ADD_TASK" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --task gpt_chat" +fi + + +if [ "$ENABLE_MOCK_DATA" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --mock-data" +else +EXTRA_ARGS="$EXTRA_ARGS --data-path $DATA_PATH" +fi + +if [ "$ENABLE_ROPE" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --position-embedding-type rope" +fi + +if [ "$NO_TORCH_COMPILE" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --no-torch-compile" +fi + +if [ "$USE_FLASH_ATTN" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --use-flash-attn" +fi + +if [ "$SEQ_PARALLEL" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --sequence-parallel" +fi + +if [ "$CONTI_PARAMS" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --use-contiguous-parameters-in-local-ddp" +fi + +if [ "$TE_FP16" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --transformer-impl=transformer_engine \ + --fp8-margin=0 \ + --fp8-interval=1 \ + --fp8-amax-history-len=1024 \ + --fp8-amax-compute-algo=max +" +fi + +if [ "$DUMMY_RUN" -eq 0 ]; then +run_cmd=" + torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + $EXTRA_ARGS \ + --load $CHECKPOINT_PATH +" +else +run_cmd=" +echo 'torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + $EXTRA_ARGS \ + --load $CHECKPOINT_PATH' +" +fi + +if [ "$TEE_OUTPUT" -eq 0 ]; then + run_cmd="$run_cmd >& $TRAIN_LOG" +else + run_cmd="$run_cmd |& tee $TRAIN_LOG" +fi + +if [ "$NO_TRAINING" -eq 0 ]; then + eval $run_cmd +fi + +echo 'import argparse +import numpy as np + +if 
__name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Process Log") + parser.add_argument("filename") + args = parser.parse_args() + + with open(args.filename) as f: + lines = f.readlines() + lines = lines[1:-1] + lines = [float(a) for a in lines] + mean = np.mean(np.array(lines)) + print(mean)' > mean_log_value.py + + +# echo '============================================================================================================' +grep -Eo 'throughput per GPU [^|]*' $TRAIN_LOG | sed -E 's/.*throughput per GPU \(TFLOP\/s\/GPU\): ([0-9\.]+).*/\1/' > tmp.txt +echo "throughput per GPU: $(python mean_log_value.py tmp.txt)" |& tee -a $TRAIN_LOG +THROUGHPUT=$(python mean_log_value.py tmp.txt) +rm tmp.txt + +# echo '============================================================================================================' +grep -Eo 'elapsed time per iteration [^|]*' $TRAIN_LOG | sed -E 's/.*elapsed time per iteration \(ms\): ([0-9\.]+).*/\1/' > tmp.txt +echo "elapsed time per iteration: $(python mean_log_value.py tmp.txt)" |& tee -a $TRAIN_LOG + +TIME_PER_ITER=$(python mean_log_value.py tmp.txt 2>/dev/null | awk '{printf "%.6f", $0}') +PERFORMANCE=$(awk -v bs="$BS" -v sl="$SEQ_LENGTH" -v tpi="$TIME_PER_ITER" -v ws="$WORLD_SIZE" 'BEGIN {printf "%.6f", bs * sl * 1000/ (tpi * ws)}') +echo "tokens/GPU/s: $PERFORMANCE" |& tee -a $TRAIN_LOG +rm tmp.txt + +echo '============================================================================================================' +grep -Eo 'mem usages: [^|]*' $TRAIN_LOG | sed -E 's/.*mem usages: ([0-9\.]+).*/\1/' > tmp.txt +echo "mem usages: $(python mean_log_value.py tmp.txt)" |& tee -a $TRAIN_LOG +rm tmp.txt + +echo "Model, $MODEL_SIZE, BS , $BS, MBS $MBS, TP, $TP, PP, $PP, ROPE, $ENABLE_ROPE, mock, $ENABLE_MOCK_DATA, throughput(tflops), $THROUGHPUT , time_per_iter , $TIME_PER_ITER, token/sec, $PERFORMANCE, LABEL, $LABEL, ITERS, $TOTAL_ITERS " >> results.csv From 12952bf5858e7c5169134b5005a1c5df67f551a4 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Wed, 9 Oct 2024 16:53:51 +0000 Subject: [PATCH 02/30] update mock data --- train_llama.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/train_llama.sh b/train_llama.sh index 3c3bae1fa9..3ebba0dd37 100644 --- a/train_llama.sh +++ b/train_llama.sh @@ -74,6 +74,8 @@ if ! [ -f "$TOKENIZER_MODEL" ]; then wget -O $TOKENIZER_MODEL https://huggingface.co/NousResearch/Llama-2-7b-chat-hf/resolve/main/tokenizer.model fi +if ! [ "$ENABLE_MOCK_DATA" -eq 1 ]; then +exit # Prepare the dataset echo 'import argparse from pathlib import Path @@ -100,7 +102,7 @@ DATA_PATH=${DATA_DIR}/bookcorpus_text_sentence else echo "Dataset file already exist." 
fi - +fi MAX_POSITION_EMBEDDINGS=32768 From 9713598aa87d1b2b9d76ed892408974db50db1a3 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Wed, 9 Oct 2024 21:49:33 +0000 Subject: [PATCH 03/30] update --- Dockerfile_amd | 4 ++-- run_examples.sh | 7 +++---- train_llama.sh | 5 +++++ 3 files changed, 10 insertions(+), 6 deletions(-) mode change 100644 => 100755 train_llama.sh diff --git a/Dockerfile_amd b/Dockerfile_amd index a449c908c4..b53d6601d5 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -1,6 +1,6 @@ -#ARG BASE_DOCKER=rocm/pytorch:latest +ARG BASE_DOCKER=rocm/pytorch:latest #ARG BASE_DOCKER=rocm/pytorch-private:exec_dashboard_nightly -ARG BASE_DOCKER=rocm/pytorch-private:20240920_exec_dashboard_nightly +#ARG BASE_DOCKER=rocm/pytorch-private:20240920_exec_dashboard_nightly FROM $BASE_DOCKER ENV WORKSPACE_DIR=/workspace ENV STAGE_DIR=/workspace/installs diff --git a/run_examples.sh b/run_examples.sh index 078fb21905..f5e69256c6 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -1,7 +1,6 @@ -bash ../scripts/megatron/debug_train_70b_custom.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_en_fused" +bash train_llama.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 ENABLE_ROPE_TE=1 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_en_fused" +#bash train_llama.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=0 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_disabled" -#bash ../scripts/megatron/debug_train_70b_custom.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=0 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_disabled" - -#bash ../scripts/megatron/debug_train_70b_custom.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=1 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_en_fused_profile" +#bash train_llama.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=1 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_en_fused_profile" diff --git a/train_llama.sh b/train_llama.sh old mode 100644 new mode 100755 index 3ebba0dd37..fc6b859410 --- a/train_llama.sh +++ b/train_llama.sh @@ -25,6 +25,7 @@ USE_FLASH_ATTN="${USE_FLASH_ATTN:-1}" NO_TRAINING="${NO_TRAINING:-0}" # NO_TRAINING=1: for computing metrics only ENABLE_PROFILING="${ENABLE_PROFILING:-0}" ENABLE_ROPE="${ENABLE_ROPE:-1}" +ENABLE_ROPE_TE="${ENABLE_ROPE_TE:-1}" ENABLE_MOCK_DATA="${ENABLE_MOCK_DATA:-1}" DUMMY_RUN="${DUMMY_RUN:-0}" ADD_TASK="${ADD_TASK:-0}" @@ -243,6 +244,10 @@ if [ "$ENABLE_ROPE" -eq 1 ]; then EXTRA_ARGS="$EXTRA_ARGS --position-embedding-type rope" fi +if [ "$ENABLE_ROPE_TE" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --use-te-fused-rope" +fi + if [ "$NO_TORCH_COMPILE" -eq 1 ]; then EXTRA_ARGS="$EXTRA_ARGS --no-torch-compile" fi From 717e7cd5d80f614aa518d9827b0efbd1c49a21d5 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Thu, 10 Oct 2024 15:49:38 +0000 Subject: [PATCH 04/30] add jenkins pipeline --- Jenkinsfile | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 Jenkinsfile diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000000..7f73a1ce70 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,67 @@ +import org.apache.commons.io.FilenameUtils +import groovy.json.JsonOutput + + +def show_node_info() { + sh """ + echo "NODE_NAME = \$NODE_NAME" || true + 
lsb_release -sd || true + uname -r || true + cat /sys/module/amdgpu/version || true + ls /opt/ -la || true + """ +} + +def clean_up_docker() { + sh 'docker ps -a || true' // "|| true" suppresses errors + sh 'docker kill $(docker ps -q) || true' + sh 'docker rm $(docker ps -a -q) || true' + sh 'docker rmi $(docker images -q) || true' + sh 'docker system prune -af --volumes || true' +} + +def clean_up_docker_container() { + sh 'docker ps -a || true' // "|| true" suppresses errors + sh 'docker kill $(docker ps -q) || true' +} + +//makes sure multiple builds are not triggered for branch indexing +def resetbuild() { + if(currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) { + def milestonesList = [] + def build = currentBuild + + while(build != null) { + if(build.getBuildCauses().toString().contains('BranchIndexingCause')) { + milestonesList.add(0, build.number) + } + build = build.previousBuildInProgress + } + + for (buildNum in milestonesList) { + milestone(buildNum) + } + } +} + +pipeline { + agent any + + stages { + stage('Build') { + steps { + echo 'Building..' + } + } + stage('Test') { + steps { + echo 'Testing..' + } + } + stage('Deploy') { + steps { + show_node_info() + } + } + } +} From 0c4709e06df29d5a40a25246fe72ab80b421516d Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami <143527450+gurpreet-dhami@users.noreply.github.com> Date: Fri, 11 Oct 2024 09:48:36 -0400 Subject: [PATCH 05/30] fix runtime error for rocm (#3) --- megatron/legacy/fused_kernels/__init__.py | 32 ++++++++++++----------- 1 file changed, 17 insertions(+), 15 deletions(-) mode change 100644 => 100755 megatron/legacy/fused_kernels/__init__.py diff --git a/megatron/legacy/fused_kernels/__init__.py b/megatron/legacy/fused_kernels/__init__.py old mode 100644 new mode 100755 index 87cceac3e3..f01088fd5a --- a/megatron/legacy/fused_kernels/__init__.py +++ b/megatron/legacy/fused_kernels/__init__.py @@ -3,9 +3,10 @@ import os import pathlib import subprocess - +import torch from torch.utils import cpp_extension + # Setting this param to a list has a problem of generating different # compilation commands (with diferent order of architectures) and # leading to recompilation of fused kernels. Set it to empty string @@ -16,22 +17,23 @@ def load(args): - # Check if cuda 11 is installed for compute capability 8.0 - cc_flag = [] - _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version( - cpp_extension.CUDA_HOME - ) - if int(bare_metal_major) >= 11: - cc_flag.append('-gencode') - cc_flag.append('arch=compute_80,code=sm_80') - if int(bare_metal_minor) >= 8: + if torch.cuda.is_available() and torch.version.cuda: + # Check if cuda 11 is installed for compute capability 8.0 + cc_flag = [] + _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version( + cpp_extension.CUDA_HOME + ) + if int(bare_metal_major) >= 11: cc_flag.append('-gencode') - cc_flag.append('arch=compute_90,code=sm_90') + cc_flag.append('arch=compute_80,code=sm_80') + if int(bare_metal_minor) >= 8: + cc_flag.append('-gencode') + cc_flag.append('arch=compute_90,code=sm_90') - # Build path - srcpath = pathlib.Path(__file__).parent.absolute() - buildpath = srcpath / "build" - _create_build_dir(buildpath) + # Build path + srcpath = pathlib.Path(__file__).parent.absolute() + buildpath = srcpath / "build" + _create_build_dir(buildpath) # Helper function to build the kernels. 
def _cpp_extention_load_helper(name, sources, extra_cuda_flags): From 6dbde6a12d98a49f32b56d70db67f1486c8feccc Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Thu, 10 Oct 2024 20:12:45 +0000 Subject: [PATCH 06/30] update --- Dockerfile_amd | 1 - train_llama.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/Dockerfile_amd b/Dockerfile_amd index b53d6601d5..ca68aea00c 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -82,4 +82,3 @@ WORKDIR $WORKSPACE_DIR/Megatron-LM # record configuration for posterity RUN pip list -RUN cd $WORKSPACE_DIR diff --git a/train_llama.sh b/train_llama.sh index fc6b859410..71ddd7008a 100755 --- a/train_llama.sh +++ b/train_llama.sh @@ -76,7 +76,6 @@ wget -O $TOKENIZER_MODEL https://huggingface.co/NousResearch/Llama-2-7b-chat-hf/ fi if ! [ "$ENABLE_MOCK_DATA" -eq 1 ]; then -exit # Prepare the dataset echo 'import argparse from pathlib import Path From 2b855bafe238ab0949d49609131d2da87fb09000 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami <143527450+gurpreet-dhami@users.noreply.github.com> Date: Tue, 22 Oct 2024 09:27:09 -0400 Subject: [PATCH 07/30] rope enable through TE (#4) * rope enable * add conditional check for TE * add command line argument for rope enablement * move rope enablement to rope utils * enable fused rope by transformer engine --- .../models/common/embeddings/rope_utils.py | 194 ++++++++++++++---- .../common/embeddings/rotary_pos_embedding.py | 2 +- .../core/transformer/transformer_config.py | 3 + megatron/training/arguments.py | 4 + 4 files changed, 167 insertions(+), 36 deletions(-) mode change 100644 => 100755 megatron/core/models/common/embeddings/rope_utils.py mode change 100644 => 100755 megatron/core/models/common/embeddings/rotary_pos_embedding.py mode change 100644 => 100755 megatron/core/transformer/transformer_config.py mode change 100644 => 100755 megatron/training/arguments.py diff --git a/megatron/core/models/common/embeddings/rope_utils.py b/megatron/core/models/common/embeddings/rope_utils.py old mode 100644 new mode 100755 index accb251961..ece04492af --- a/megatron/core/models/common/embeddings/rope_utils.py +++ b/megatron/core/models/common/embeddings/rope_utils.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Optional, Tuple, Union if TYPE_CHECKING: from megatron.core.transformer.transformer_config import TransformerConfig @@ -26,6 +26,11 @@ except ImportError: HAVE_APPLY_ROPE_FUSION = False +try: + import transformer_engine.pytorch.cpp_extensions as tex + HAVE_TE = True +except ImportError: + HAVE_TE = False def get_pos_emb_on_this_cp_rank(pos_emb: Tensor, seq_dim: int) -> Tensor: """Get the position embedding on the current context parallel rank. 
@@ -149,43 +154,162 @@ def apply_rotary_pos_emb( Reroute to the appropriate apply_rotary_pos_emb function depending on fused/unfused kernels, or bshd (conventional) / thd (packed seq) format """ - if config.apply_rope_fusion and not HAVE_APPLY_ROPE_FUSION: - # setting apply_rope_fusion in config to False - # so that subsequent queries to this config also return False - config.apply_rope_fusion = False - if not getattr(apply_rotary_pos_emb, "printed_fused_warning", False): + if not config.disable_te_fused_rope and HAVE_TE and torch.cuda.is_available() and torch.version.hip: + return apply_rotary_pos_emb_fused_te(t = t, freqs = freqs, config = config, cu_seqlens = cu_seqlens) + else: + if config.apply_rope_fusion and not HAVE_APPLY_ROPE_FUSION: + # setting apply_rope_fusion in config to False + # so that subsequent queries to this config also return False + config.apply_rope_fusion = False + if not getattr(apply_rotary_pos_emb, "printed_fused_warning", False): + logger.warning( + "Setting apply_rope_fusion to false because its implementation" + " is not included in Apex. Try upgrading to the latest version" + ) + apply_rotary_pos_emb.printed_fused_warning = True + + if getattr(config, "multi_latent_attention", False) and config.rotary_interleaved: logger.warning( - "Setting apply_rope_fusion to false because its implementation" - " is not included in Apex. Try upgrading to the latest version" + "rotary_interleaved is not supported with multi_latent_attention, setting it to False" ) - apply_rotary_pos_emb.printed_fused_warning = True + config.rotary_interleaved = False + + if config.apply_rope_fusion: + if cu_seqlens is None: + return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) + else: + return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) + else: + if cu_seqlens is None: + return _apply_rotary_pos_emb_bshd( + t, + freqs, + rotary_interleaved=config.rotary_interleaved, + multi_latent_attention=config.multi_latent_attention, + mscale=mscale, + ) + else: + return _apply_rotary_pos_emb_thd( + t, + cu_seqlens, + freqs, + rotary_interleaved=config.rotary_interleaved, + multi_latent_attention=config.multi_latent_attention, + mscale=mscale, + ) + +class FusedRoPEFunc(torch.autograd.Function): + """ + Function for FusedRoPE - if getattr(config, "multi_latent_attention", False) and config.rotary_interleaved: - logger.warning( - "rotary_interleaved is not supported with multi_latent_attention, setting it to False" - ) - config.rotary_interleaved = False + This implementation assumes the input tensor to be in `sbhd`, `bshd` or `thd` format and + the RoPE tensor to be of shape (s, 1, 1, d). It accepts arbitrary memory layouts to avoid + the expensive `.contiguous()` calls, thus it may not achieve the best memory access pattern. 
+ """ - if config.apply_rope_fusion: - if cu_seqlens is None: - return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) + @staticmethod + def forward( + ctx, + t: torch.Tensor, + freqs: torch.Tensor, + tensor_format: str = "sbhd", + cu_seqlens: Union[torch.Tensor, None] = None, + ) -> torch.Tensor: + if freqs.dtype != torch.float32: + freqs = freqs.float() + if tensor_format == "sbhd": + output = tex.fused_rope_forward(t, freqs, False) + elif tensor_format == "bshd": + output = tex.fused_rope_forward( + t.transpose(0, 1), freqs, True + ).transpose(0, 1) + elif tensor_format == "thd": + output = tex.fused_rope_thd_forward(t, cu_seqlens, freqs) else: - return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) - else: - if cu_seqlens is None: - return _apply_rotary_pos_emb_bshd( - t, - freqs, - rotary_interleaved=config.rotary_interleaved, - multi_latent_attention=config.multi_latent_attention, - mscale=mscale, - ) + raise ValueError(f"Unsupported tensor_format: {tensor_format}.") + ctx.save_for_backward(freqs, cu_seqlens) + ctx.tensor_format = tensor_format + return output + + @staticmethod + def backward( + ctx, grad_output: torch.Tensor + ) -> Tuple[Union[torch.Tensor, None], ...]: + freqs, cu_seqlens = ctx.saved_tensors + if ctx.tensor_format == "sbhd": + grad_input = tex.fused_rope_backward(grad_output, freqs, False) + elif ctx.tensor_format == "bshd": + grad_input = tex.fused_rope_backward( + grad_output.transpose(0, 1), freqs, True + ).transpose(0, 1) + elif ctx.tensor_format == "thd": + grad_input = tex.fused_rope_thd_backward(grad_output, cu_seqlens, freqs) else: - return _apply_rotary_pos_emb_thd( - t, - cu_seqlens, - freqs, - rotary_interleaved=config.rotary_interleaved, - multi_latent_attention=config.multi_latent_attention, - mscale=mscale, - ) + raise ValueError(f"Unsupported tensor_format: {ctx.tensor_format}.") + + return grad_input, None, None, None, None + + +def apply_rotary_pos_emb_fused_te( + t: torch.Tensor, + freqs: torch.Tensor, + tensor_format: str = "sbhd", + config: TransformerConfig = None, + fused: bool = True, + cu_seqlens: Union[torch.Tensor, None] = None, +) -> torch.Tensor: + """ + Apply rotary positional embedding tensor to the input tensor. + + Parameters + ---------- + t: torch.Tensor + Input tensor of shape `[s, b, h, d]`, `[b, s, h, d]` or `[t, h, d]`, on which + rotary positional embedding will be applied. + freqs: torch.Tensor + Rotary positional embedding tensor of shape `[s2, 1, 1, d2]` and dtype 'float', + with `s2 >= s` and `d2 <= d`. + fused: bool, default = False + Whether to use a fused applying RoPE implementation. + tensor_format: {'sbhd', 'bshd', 'thd'}, default = 'sbhd' + is `bshd` if `t` is of shape `[bs, seq, ...]`, or `sbhd` if `t` is + of shape `[seq, bs, ...]`. 'thd' is only supported when `fused` is True. + cu_seqlens: torch.Tensor, default = None. + Cumulative sum of sequence lengths in a batch for `t`, with shape [b + 1] and + dtype torch.int32. Only valid when `tensor_format` is 'thd'. + """ + + if fused: + assert ( + tensor_format != "thd" or cu_seqlens is not None + ), "cu_seqlens must not be None when tensor_format is 'thd'." + return FusedRoPEFunc.apply(t, freqs, tensor_format, cu_seqlens) + + assert tensor_format in ("sbhd", "bshd"), ( + "Only formats `sbhd` or `bshd` are supported for input tensor `t` " + f"when fused is False, got {tensor_format}." 
+ ) + + max_seq_len = freqs.shape[0] + cur_seq_len = t.shape[1] if tensor_format == "bshd" else t.shape[0] + + # Only apply the rotary embeddings up to the sequence length of the running + # input. + assert cur_seq_len <= max_seq_len, ( + f"Rotary Embeddings only supported up to {max_seq_len} sequence length!" + ) + freqs = freqs[:cur_seq_len] + if tensor_format == "bshd": + freqs = freqs.transpose(0, 1) # [seq, 1, 1, dim] -> [1, seq, 1, dim] + # cos/sin first then dtype conversion for better precision + cos_ = torch.cos(freqs).to(t.dtype) + sin_ = torch.sin(freqs).to(t.dtype) + + rot_dim = freqs.shape[-1] + # ideally t_pass is empty so rotary pos embedding is applied to all tensor t + t, t_pass = t[..., :rot_dim], t[..., rot_dim:] + + # first part is cosine component + # second part is sine component, need to change signs with _rotate_half method + t = (t * cos_) + (_rotate_half(t) * sin_) + return torch.cat((t, t_pass), dim=-1) diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py old mode 100644 new mode 100755 index 5232faec60..d16ae79cdb --- a/megatron/core/models/common/embeddings/rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/rotary_pos_embedding.py @@ -183,4 +183,4 @@ def get_rotary_seq_len( rotary_seq_len *= transformer_config.context_parallel_size - return rotary_seq_len + return rotary_seq_len \ No newline at end of file diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py old mode 100644 new mode 100755 index a63171686a..b8968d6cf5 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -165,6 +165,9 @@ class TransformerConfig(ModelParallelConfig): apply_rope_fusion: bool = False """If True, use fused RoPE kernel.""" + disable_te_fused_rope: bool = False + """If True, disable fused RoPE kernel from transformer engine""" + #################### # activation recomputation #################### diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py old mode 100644 new mode 100755 index e3d876a5f2..9411223126 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -695,6 +695,8 @@ def core_transformer_config_from_args(args, config_class=None): else: kw_args['num_query_groups'] = None kw_args['config_logger_dir'] = args.config_logger_dir + if args.disable_te_fused_rope: + kw_args['disable_te_fused_rope'] = args.disable_te_fused_rope # Return config. return config_class(**kw_args) @@ -853,6 +855,8 @@ def _add_network_size_args(parser): action='store_false', help='Disable position embedding. Deprecated: use --position-embedding-type', dest='add_position_embedding') + group.add_argument('--disable-te-fused-rope', action='store_true', default = False, + help='Disable fused rope from transformer-engine: use --disable_te_fused_rope') group.add_argument('--make-vocab-size-divisible-by', type=int, default=128, help='Pad the vocab size to be divisible by this value.' 
'This is added for computational efficieny reasons.') From 2a3af131ec07be516914bd562dbdc87231495163 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Tue, 22 Oct 2024 14:45:23 +0000 Subject: [PATCH 08/30] address review comments --- Dockerfile_amd | 21 ++++++--------------- run_examples.sh | 2 +- train_llama.sh | 32 +++++++++++--------------------- 3 files changed, 18 insertions(+), 37 deletions(-) diff --git a/Dockerfile_amd b/Dockerfile_amd index ca68aea00c..2ad89f60df 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -1,6 +1,4 @@ ARG BASE_DOCKER=rocm/pytorch:latest -#ARG BASE_DOCKER=rocm/pytorch-private:exec_dashboard_nightly -#ARG BASE_DOCKER=rocm/pytorch-private:20240920_exec_dashboard_nightly FROM $BASE_DOCKER ENV WORKSPACE_DIR=/workspace ENV STAGE_DIR=/workspace/installs @@ -18,6 +16,7 @@ nltk \ pytest \ pytest-cov \ pytest_mock \ +pytest-csv \ pytest-random-order \ sentencepiece \ wrapt \ @@ -28,21 +27,13 @@ pytest_mock \ pybind11 \ wrapt \ setuptools==69.5.1 \ -datasets +datasets \ +tiktoken \ +pynvml RUN pip3 install "huggingface_hub[cli]" RUN python3 -m nltk.downloader punkt_tab - -############################################################################## -# Apex -############################################################################## -#RUN git clone https://github.com/ROCm/apex.git ${STAGE_DIR}/apex -#WORKDIR ${STAGE_DIR}/apex -#RUN python3 setup.py install --cpp_ext --cuda_ext -#WORKDIR $WORKSPACE_DIR -#RUN rm -rf ${STAGE_DIR}/apex - # Install Causal-Conv1d and its dependencies WORKDIR ${STAGE_DIR} @@ -58,6 +49,7 @@ RUN git clone https://github.com/Dao-AILab/causal-conv1d causal-conv1d &&\ WORKDIR ${STAGE_DIR} RUN git clone https://github.com/state-spaces/mamba mamba &&\ cd mamba &&\ + git checkout bc84fb1 &&\ git show --oneline -s &&\ pip install --no-build-isolation . @@ -73,10 +65,9 @@ RUN git clone --recursive https://github.com/ROCmSoftwarePlatform/TransformerEng WORKDIR $WORKSPACE_DIR RUN git clone https://github.com/ROCm/Megatron-LM.git Megatron-LM &&\ cd Megatron-LM &&\ - git checkout rocm_megatron_lm_upstream &&\ + git checkout rocm_dev &&\ pip install -e . 
- WORKDIR $WORKSPACE_DIR/Megatron-LM # record configuration for posterity diff --git a/run_examples.sh b/run_examples.sh index f5e69256c6..32695c1a33 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -1,4 +1,4 @@ -bash train_llama.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 ENABLE_ROPE_TE=1 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_en_fused" +bash train_llama.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 DISABLE_ROPE_TE=0 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="te_rope_en_fused" #bash train_llama.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=0 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_disabled" diff --git a/train_llama.sh b/train_llama.sh index 71ddd7008a..38ff79df6b 100755 --- a/train_llama.sh +++ b/train_llama.sh @@ -25,7 +25,7 @@ USE_FLASH_ATTN="${USE_FLASH_ATTN:-1}" NO_TRAINING="${NO_TRAINING:-0}" # NO_TRAINING=1: for computing metrics only ENABLE_PROFILING="${ENABLE_PROFILING:-0}" ENABLE_ROPE="${ENABLE_ROPE:-1}" -ENABLE_ROPE_TE="${ENABLE_ROPE_TE:-1}" +DISABLE_ROPE_TE="${DISABLE_ROPE_TE:-0}" ENABLE_MOCK_DATA="${ENABLE_MOCK_DATA:-1}" DUMMY_RUN="${DUMMY_RUN:-0}" ADD_TASK="${ADD_TASK:-0}" @@ -54,10 +54,7 @@ TOTAL_ITERS="${TOTAL_ITERS:-4}" SEQ_PARALLEL="${SEQ_PARALLEL:-1}" CONTI_PARAMS="${CONTI_PARAMS:-0}" OPTIMIZER="${OPTIMIZER:-sgd}" -TE_FP16="${TE_FP16:-1}" - - -export CUDA_DEVICE_MAX_CONNECTIONS=1 +TE_BF16="${TE_BF16:-1}" EXPERIMENT_DIR="experiment" mkdir -p $EXPERIMENT_DIR @@ -89,7 +86,7 @@ if __name__ == "__main__": out_dir = Path(args.out_dir) out_dir.mkdir(exist_ok=True, parents=True) - dataset = load_dataset("bookcorpus", split="train") + dataset = load_dataset("bookcorpus", split="train", trust_remote_code=True) dataset.to_json(out_dir / "bookcorpus_megatron.json")' > prepare_bookcorpus_megatron_dataset.py DATA_PATH=${DATA_DIR}/bookcorpus_text_sentence @@ -106,8 +103,8 @@ fi MAX_POSITION_EMBEDDINGS=32768 -if [ "$TE_FP16" -eq 1 ]; then - TRAIN_LOG="${EXPERIMENT_DIR}/train_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_seq${SEQ_LENGTH}_optim_${OPTIMIZER}_nocompile${NO_TORCH_COMPILE}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_contiparam_${CONTI_PARAMS}_TE_FP16_${LABEL}.log" +if [ "$TE_BF16" -eq 1 ]; then + TRAIN_LOG="${EXPERIMENT_DIR}/train_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_seq${SEQ_LENGTH}_optim_${OPTIMIZER}_nocompile${NO_TORCH_COMPILE}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_contiparam_${CONTI_PARAMS}_TE_BF16_${LABEL}.log" else TRAIN_LOG="${EXPERIMENT_DIR}/train_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_seq${SEQ_LENGTH}_optim_${OPTIMIZER}_nocompile${NO_TORCH_COMPILE}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_contiparam_${CONTI_PARAMS}_${LABEL}.log" fi @@ -243,8 +240,8 @@ if [ "$ENABLE_ROPE" -eq 1 ]; then EXTRA_ARGS="$EXTRA_ARGS --position-embedding-type rope" fi -if [ "$ENABLE_ROPE_TE" -eq 1 ]; then -EXTRA_ARGS="$EXTRA_ARGS --use-te-fused-rope" +if [ "$DISABLE_ROPE_TE" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --disable-te-fused-rope" fi if [ "$NO_TORCH_COMPILE" -eq 1 ]; then @@ -263,7 +260,7 @@ if [ "$CONTI_PARAMS" -eq 1 ]; then EXTRA_ARGS="$EXTRA_ARGS --use-contiguous-parameters-in-local-ddp" fi -if [ "$TE_FP16" -eq 1 ]; then +if [ "$TE_BF16" -eq 1 ]; then EXTRA_ARGS="$EXTRA_ARGS --transformer-impl=transformer_engine \ --fp8-margin=0 \ --fp8-interval=1 \ @@ -292,13 +289,13 @@ echo 'torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ " fi -if [ 
"$TEE_OUTPUT" -eq 0 ]; then +if [ "$TEE_OUTPUT" -eq 0 ]; then run_cmd="$run_cmd >& $TRAIN_LOG" else run_cmd="$run_cmd |& tee $TRAIN_LOG" fi -if [ "$NO_TRAINING" -eq 0 ]; then +if [ "$NO_TRAINING" -eq 0 ]; then eval $run_cmd fi @@ -332,11 +329,4 @@ echo "elapsed time per iteration: $(python mean_log_value.py tmp.txt)" |& tee -a TIME_PER_ITER=$(python mean_log_value.py tmp.txt 2>/dev/null | awk '{printf "%.6f", $0}') PERFORMANCE=$(awk -v bs="$BS" -v sl="$SEQ_LENGTH" -v tpi="$TIME_PER_ITER" -v ws="$WORLD_SIZE" 'BEGIN {printf "%.6f", bs * sl * 1000/ (tpi * ws)}') echo "tokens/GPU/s: $PERFORMANCE" |& tee -a $TRAIN_LOG -rm tmp.txt - -echo '============================================================================================================' -grep -Eo 'mem usages: [^|]*' $TRAIN_LOG | sed -E 's/.*mem usages: ([0-9\.]+).*/\1/' > tmp.txt -echo "mem usages: $(python mean_log_value.py tmp.txt)" |& tee -a $TRAIN_LOG -rm tmp.txt - -echo "Model, $MODEL_SIZE, BS , $BS, MBS $MBS, TP, $TP, PP, $PP, ROPE, $ENABLE_ROPE, mock, $ENABLE_MOCK_DATA, throughput(tflops), $THROUGHPUT , time_per_iter , $TIME_PER_ITER, token/sec, $PERFORMANCE, LABEL, $LABEL, ITERS, $TOTAL_ITERS " >> results.csv +rm tmp.txt \ No newline at end of file From 967673bae90144790cdb0266638858bbd72175ba Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Fri, 25 Oct 2024 16:47:05 +0000 Subject: [PATCH 09/30] update --- run_examples.sh | 6 +++--- train_llama.sh => train_llama2.sh | 25 ++++++------------------- 2 files changed, 9 insertions(+), 22 deletions(-) rename train_llama.sh => train_llama2.sh (91%) diff --git a/run_examples.sh b/run_examples.sh index 32695c1a33..bdfb09889d 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -1,6 +1,6 @@ -bash train_llama.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 DISABLE_ROPE_TE=0 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="te_rope_en_fused" +bash train_llama2.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 DISABLE_ROPE_TE=0 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="te_rope_en_fused" -#bash train_llama.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=0 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_disabled" +#bash train_llama2.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=0 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_disabled" -#bash train_llama.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=1 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_en_fused_profile" +#bash train_llama2.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=1 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_en_fused_profile" diff --git a/train_llama.sh b/train_llama2.sh similarity index 91% rename from train_llama.sh rename to train_llama2.sh index 38ff79df6b..557ebe8e5e 100755 --- a/train_llama.sh +++ b/train_llama2.sh @@ -5,7 +5,6 @@ export GPU_MAX_HW_QUEUES=2 export TORCH_NCCL_HIGH_PRIORITY=1 - # parsing input arguments for ARGUMENT in "$@" do @@ -20,7 +19,6 @@ done export CUDA_DEVICE_MAX_CONNECTIONS=1 TEE_OUTPUT="${TEE_OUTPUT:-1}" -NO_TORCH_COMPILE="${NO_TORCH_COMPILE:-0}" USE_FLASH_ATTN="${USE_FLASH_ATTN:-1}" NO_TRAINING="${NO_TRAINING:-0}" # NO_TRAINING=1: for computing metrics only ENABLE_PROFILING="${ENABLE_PROFILING:-0}" @@ -36,12 +34,11 @@ echo "NO_TRAINING=$NO_TRAINING" CWD=`pwd` GPUS_PER_NODE=`python -c "import torch; 
print(torch.cuda.device_count())"` - # Change for multinode config -MASTER_ADDR=localhost -MASTER_PORT=23731 -NNODES=1 -NODE_RANK=0 +MASTER_ADDR=${MASTER_ADDR:-localhost} +MASTER_PORT=${MASTER_PORT:-23731} +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) MODEL_SIZE="${MODEL_SIZE:-70}" @@ -103,11 +100,7 @@ fi MAX_POSITION_EMBEDDINGS=32768 -if [ "$TE_BF16" -eq 1 ]; then - TRAIN_LOG="${EXPERIMENT_DIR}/train_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_seq${SEQ_LENGTH}_optim_${OPTIMIZER}_nocompile${NO_TORCH_COMPILE}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_contiparam_${CONTI_PARAMS}_TE_BF16_${LABEL}.log" -else - TRAIN_LOG="${EXPERIMENT_DIR}/train_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_seq${SEQ_LENGTH}_optim_${OPTIMIZER}_nocompile${NO_TORCH_COMPILE}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_contiparam_${CONTI_PARAMS}_${LABEL}.log" -fi +TRAIN_LOG="${TEMP_DIR}/train_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_seq${SEQ_LENGTH}_optim_${OPTIMIZER}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_contiparam_${CONTI_PARAMS}_te_bg16_${TE_BF16}_${LABEL}.log" echo $TRAIN_LOG @@ -151,9 +144,7 @@ fi GROUP_SIZE=$(( ${NUM_HEADS} / ${NUM_KV_HEADS} )) NUM_GROUPS=$(( ${NUM_HEADS} / ${GROUP_SIZE} )) - -PROFILING_DIR="${EXPERIMENT_DIR}/perf_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_seq${SEQ_LENGTH}_optim_${OPTIMIZER}_nocompile${NO_TORCH_COMPILE}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_contiparam_${CONTI_PARAMS}" - +PROFILING_DIR="${EXPERIMENT_DIR}/perf_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_seq${SEQ_LENGTH}_optim_${OPTIMIZER}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_contiparam_${CONTI_PARAMS}" GPT_ARGS=" --tensor-model-parallel-size ${TP} \ @@ -244,10 +235,6 @@ if [ "$DISABLE_ROPE_TE" -eq 1 ]; then EXTRA_ARGS="$EXTRA_ARGS --disable-te-fused-rope" fi -if [ "$NO_TORCH_COMPILE" -eq 1 ]; then -EXTRA_ARGS="$EXTRA_ARGS --no-torch-compile" -fi - if [ "$USE_FLASH_ATTN" -eq 1 ]; then EXTRA_ARGS="$EXTRA_ARGS --use-flash-attn" fi From a2d0bdf040b6404ecae2905940a3a6967ea53dee Mon Sep 17 00:00:00 2001 From: Manjunath Siddaiah <163046790+msiddaiah@users.noreply.github.com> Date: Wed, 30 Oct 2024 11:02:57 -0500 Subject: [PATCH 10/30] Enable HuggingFaceTokenizer in preprocessing (#10) Co-authored-by: Manjunath Siddaiah --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index a81fe8ca7e..a9575707b9 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -203,7 +203,7 @@ def get_args(): choices=['BertWordPieceLowerCase','BertWordPieceCase', 'GPT2BPETokenizer', 'SentencePieceTokenizer', 'GPTSentencePieceTokenizer', 'Llama2Tokenizer', - 'Llama3Tokenizer', 'MistralTokenizer', 'NullTokenizer'], + 'Llama3Tokenizer', 'MistralTokenizer', 'HuggingFaceTokenizer', 'NullTokenizer'], help='What type of tokenizer to use.') group.add_argument('--tokenizer-model', type=str, default=None, help='YTTM tokenizer model.') From 0d5c01e121a55af6d65548ca6600adccc59fa7c9 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Mon, 4 Nov 2024 15:54:25 -0600 Subject: [PATCH 11/30] markers for failing tests on rocm --- pytest.ini | 7 ++++++- .../dist_checkpointing/models/test_moe_experts.py | 2 ++ tests/unit_tests/dist_checkpointing/test_fp8.py | 3 +++ tests/unit_tests/dist_checkpointing/test_nonpersistent.py | 2 ++ 
tests/unit_tests/dist_checkpointing/test_optimizer.py | 1 + tests/unit_tests/models/test_mamba_model.py | 2 ++ tests/unit_tests/test_utils.py | 2 +- tests/unit_tests/transformer/test_attention.py | 6 +++--- tests/unit_tests/transformer/test_retro_attention.py | 5 ++++- tests/unit_tests/transformer/test_spec_customization.py | 1 + 10 files changed, 25 insertions(+), 6 deletions(-) diff --git a/pytest.ini b/pytest.ini index c75f3b9fa4..5a4bd57c78 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,9 @@ # content of pytest.ini [pytest] markers = - internal: mark a test as a test to private/internal functions. \ No newline at end of file + internal: mark a test as a test to private/internal functions. + failing_on_rocm: Currently Failing Tests on Rocm + +addopts = + --ignore tests/unit_tests/test_utilities.py + -m "not failing_on_rocm and not flaky and not internal" \ No newline at end of file diff --git a/tests/unit_tests/dist_checkpointing/models/test_moe_experts.py b/tests/unit_tests/dist_checkpointing/models/test_moe_experts.py index 4a8f153ed4..695a257c0f 100644 --- a/tests/unit_tests/dist_checkpointing/models/test_moe_experts.py +++ b/tests/unit_tests/dist_checkpointing/models/test_moe_experts.py @@ -102,6 +102,7 @@ def teardown_method(self, method): (False, (1, 1, 4), (8, 1, 1), True), ], ) + @pytest.mark.failing_on_rocm @pytest.mark.parametrize("expert_type", expert_type) def test_parallel_reconfiguration_e2e( self, tmp_path_dist_ckpt, src_tp_pp_exp, dest_tp_pp_exp, use_glu, use_fpsl, expert_type @@ -182,6 +183,7 @@ def test_parallel_reconfiguration_e2e( ], ) @pytest.mark.parametrize("src_module,dest_module", src_dest_expert_type) + @pytest.mark.failing_on_rocm def test_sequential_grouped_mlp_interchangeable( self, tmp_path_dist_ckpt, src_tp_pp_exp, dest_tp_pp_exp, use_glu, src_module, dest_module ): diff --git a/tests/unit_tests/dist_checkpointing/test_fp8.py b/tests/unit_tests/dist_checkpointing/test_fp8.py index d2dcb367c7..33c19465eb 100644 --- a/tests/unit_tests/dist_checkpointing/test_fp8.py +++ b/tests/unit_tests/dist_checkpointing/test_fp8.py @@ -18,8 +18,10 @@ class TestFP8: + @pytest.mark.parametrize('dtype', ['bf16', 'fp16', 'fp8']) @pytest.mark.parametrize('src_rank', [0, 6]) + @pytest.mark.failing_on_rocm def test_simple_broadcast(self, dtype, src_rank): Utils.initialize_model_parallel() @@ -52,6 +54,7 @@ def get_ten(dtype: str = 'fp8'): ], ) @pytest.mark.flaky + @pytest.mark.failing_on_rocm def test_fp8_save_load( self, tmp_path_dist_ckpt, use_fpsl, src_tp_pp, dest_tp_pp, load_exchange_algo ): diff --git a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py index 346751e264..263168a63c 100644 --- a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py +++ b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py @@ -30,6 +30,7 @@ def teardown_method(self, method): @pytest.mark.parametrize(('tp,pp'), [(2, 4)]) @pytest.mark.flaky + @pytest.mark.failing_on_rocm def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp): Utils.initialize_model_parallel(tp, pp) num_floating_point_operations_so_far = 0 @@ -119,6 +120,7 @@ def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp): class TestLegacySaveAndLoad: @pytest.mark.parametrize(('tp,pp'), [(2, 4)]) @pytest.mark.flaky + @pytest.mark.failing_on_rocm def test_basic_save_load_scenario(self, tmp_path_dist_ckpt, tp, pp): Utils.initialize_model_parallel(tp, pp) num_floating_point_operations_so_far = 0 diff --git 
a/tests/unit_tests/dist_checkpointing/test_optimizer.py b/tests/unit_tests/dist_checkpointing/test_optimizer.py index 19d1ee9e85..2b47138d46 100644 --- a/tests/unit_tests/dist_checkpointing/test_optimizer.py +++ b/tests/unit_tests/dist_checkpointing/test_optimizer.py @@ -517,6 +517,7 @@ def test_optimizer_resharding( ((2, 1, 2), (1, 1, 8)), ], ) + @pytest.mark.failing_on_rocm def test_chained_optimizer_resharding( self, tmp_path_dist_ckpt, diff --git a/tests/unit_tests/models/test_mamba_model.py b/tests/unit_tests/models/test_mamba_model.py index 913adb538c..cad2b0367c 100644 --- a/tests/unit_tests/models/test_mamba_model.py +++ b/tests/unit_tests/models/test_mamba_model.py @@ -56,6 +56,7 @@ def test_set_input_tensor(self): assert self.model.decoder.input_tensor.shape[1] == micro_batch_size assert self.model.decoder.input_tensor.shape[2] == config.hidden_size + @pytest.mark.failing_on_rocm def test_forward(self): config: TransformerConfig = self.model.config sequence_length = self.model.max_sequence_length @@ -78,6 +79,7 @@ def test_forward(self): assert logits.shape[1] == sequence_length assert logits.shape[2] == self.model.vocab_size + @pytest.mark.failing_on_rocm def test_inference(self): config: TransformerConfig = self.model.config micro_batch_size = 2 diff --git a/tests/unit_tests/test_utils.py b/tests/unit_tests/test_utils.py index 229cead1c3..c8b0139a59 100644 --- a/tests/unit_tests/test_utils.py +++ b/tests/unit_tests/test_utils.py @@ -110,7 +110,7 @@ def test_cross_check_param_hashes_across_dp_replicas(): # Teardown. _deinit_distributed() - +@pytest.mark.failing_on_rocm def test_straggler_detector(): world = int(os.getenv('WORLD_SIZE', '1')) rank = int(os.getenv('RANK', '0')) diff --git a/tests/unit_tests/transformer/test_attention.py b/tests/unit_tests/transformer/test_attention.py index 8c13ff3f8c..ef1226d0b0 100644 --- a/tests/unit_tests/transformer/test_attention.py +++ b/tests/unit_tests/transformer/test_attention.py @@ -37,7 +37,7 @@ def test_constructor(self): def test_cpu_forward(self): # we can't currently do this because the global memory buffer is on GPU pass - + @pytest.mark.failing_on_rocm def test_gpu_forward(self): config = self.parallel_attention.config @@ -61,7 +61,7 @@ def test_gpu_forward(self): assert output.shape[1] == micro_batch_size assert output.shape[2] == config.hidden_size assert bias.shape[0] == config.hidden_size - + @pytest.mark.failing_on_rocm def test_fused_rope_gpu_forward(self): self.parallel_attention.config.apply_rope_fusion = True config = self.parallel_attention.config @@ -90,7 +90,7 @@ def test_fused_rope_gpu_forward(self): assert output.shape[2] == config.hidden_size assert bias.shape[0] == config.hidden_size self.parallel_attention.config.apply_rope_fusion = False - + @pytest.mark.failing_on_rocm def test_checkpointed_gpu_forward(self): transformer_config = self.transformer_config transformer_config.recompute_granularity = 'selective' diff --git a/tests/unit_tests/transformer/test_retro_attention.py b/tests/unit_tests/transformer/test_retro_attention.py index d7c5a5f155..751e1c74e3 100644 --- a/tests/unit_tests/transformer/test_retro_attention.py +++ b/tests/unit_tests/transformer/test_retro_attention.py @@ -3,6 +3,7 @@ import types import torch +import pytest from megatron.core.models.retro import RetroConfig, get_retro_decoder_block_spec from megatron.core.models.retro.decoder_attention import ( @@ -80,6 +81,7 @@ def setup_method(self, method): def teardown_method(self, method): Utils.destroy_model_parallel() + 
@pytest.mark.failing_on_rocm def test_constructor(self): config = self.get_config() @@ -101,6 +103,7 @@ def test_constructor(self): assert get_nparams(modules.encoder_bda) == 0 assert get_nparams(modules.encoder_norm) == 32 + @pytest.mark.failing_on_rocm def test_cpu_forward(self): # we can't currently do this because the global memory buffer is on GPU pass @@ -190,7 +193,7 @@ def run_gpu_forward(self, recompute_granularity, use_transformer_engine): config.retro_num_neighbors * micro_batch_size * n_chunks_per_sample, config.hidden_size, ) - + @pytest.mark.failing_on_rocm def test_gpu_forward(self): for recompute_granularity in (None, 'selective'): for use_transformer_engine in (True, False): diff --git a/tests/unit_tests/transformer/test_spec_customization.py b/tests/unit_tests/transformer/test_spec_customization.py index a9a245b861..1d458fe88f 100755 --- a/tests/unit_tests/transformer/test_spec_customization.py +++ b/tests/unit_tests/transformer/test_spec_customization.py @@ -132,6 +132,7 @@ def test_build_module(self): bda_op = build_module(self.bda_spec) assert id(bda_op) == id(get_bias_dropout_add) + @pytest.mark.failing_on_rocm def test_sliding_window_attention(self): if not is_te_min_version("1.2.0"): print("SWA not tested because TE version is not >= 1.2.0", file=sys.stderr) From 1efaebf6714dde7e001a29608801ad3b25812528 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Wed, 6 Nov 2024 03:10:53 +0530 Subject: [PATCH 12/30] mark failing tests on mi250 --- pytest.ini | 4 ++-- tests/unit_tests/inference/test_modelopt_gpt_model.py | 2 ++ tests/unit_tests/transformer/test_transformer_block.py | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pytest.ini b/pytest.ini index 5a4bd57c78..e6c58fccb7 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,7 +3,7 @@ markers = internal: mark a test as a test to private/internal functions. failing_on_rocm: Currently Failing Tests on Rocm + failing_on_rocm_mi250: Tests failing on MI250 addopts = - --ignore tests/unit_tests/test_utilities.py - -m "not failing_on_rocm and not flaky and not internal" \ No newline at end of file + --ignore tests/unit_tests/test_utilities.py \ No newline at end of file diff --git a/tests/unit_tests/inference/test_modelopt_gpt_model.py b/tests/unit_tests/inference/test_modelopt_gpt_model.py index 380ac7fa16..2cb86e546e 100644 --- a/tests/unit_tests/inference/test_modelopt_gpt_model.py +++ b/tests/unit_tests/inference/test_modelopt_gpt_model.py @@ -1,4 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+import pytest from megatron.core.inference.modelopt_support.gpt.model_specs import get_gpt_layer_modelopt_spec from megatron.core.inference.modelopt_support.gpt.state_dict_hooks import ( mcore_gpt_load_te_state_dict_pre_hook, @@ -32,6 +33,7 @@ def setup_method(self, method): max_sequence_length=4, ) + @pytest.mark.failing_on_rocm_mi250 def test_load_te_state_dict_pre_hook(self): handle = self.modelopt_gpt_model._register_load_state_dict_pre_hook( mcore_gpt_load_te_state_dict_pre_hook diff --git a/tests/unit_tests/transformer/test_transformer_block.py b/tests/unit_tests/transformer/test_transformer_block.py index 02702a9ff7..210a4bc37c 100644 --- a/tests/unit_tests/transformer/test_transformer_block.py +++ b/tests/unit_tests/transformer/test_transformer_block.py @@ -66,12 +66,14 @@ def test_gpu_forward(self): def test_gpu_forward_full_checkpoint(self): self._run_full_checkpoint_test(fp8=None) + @pytest.mark.failing_on_rocm_mi250 def test_gpu_forward_full_checkpoint_fp8(self): self._run_full_checkpoint_test(fp8="e4m3") def test_gpu_forward_selective_checkpoint(self): self._run_selective_checkpoint_test(fp8=None) + @pytest.mark.failing_on_rocm_mi250 def test_gpu_forward_selective_checkpoint_fp8(self): self._run_selective_checkpoint_test(fp8="e4m3") From 8be4fae05012f1a7477e9c2ddb4b6822c58ea536 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Wed, 6 Nov 2024 03:43:34 +0530 Subject: [PATCH 13/30] enable ci pipeline --- Dockerfile_rocm_ci | 70 ++++++++++++++++++++++++++++++++++++++++++++++ Jenkinsfile | 69 ++++++++++++++++++++------------------------- run_unit_tests.sh | 5 ++++ 3 files changed, 105 insertions(+), 39 deletions(-) create mode 100644 Dockerfile_rocm_ci mode change 100644 => 100755 Jenkinsfile create mode 100755 run_unit_tests.sh diff --git a/Dockerfile_rocm_ci b/Dockerfile_rocm_ci new file mode 100644 index 0000000000..56cc92892e --- /dev/null +++ b/Dockerfile_rocm_ci @@ -0,0 +1,70 @@ +ARG BASE_DOCKER=rocm/pytorch:latest +FROM $BASE_DOCKER + +ARG PYTORCH_ROCM_ARCH_OVERRIDE="gfx942" +ENV WORKSPACE_DIR=/workspace +ENV STAGE_DIR=/workspace/installs +RUN mkdir -p $WORKSPACE_DIR +RUN mkdir -p ${STAGE_DIR} +WORKDIR $WORKSPACE_DIR + +RUN pip3 install \ +numpy==1.26.4 \ +scipy \ +einops \ +flask-restful \ +nltk \ +pytest \ +pytest-cov \ +pytest_mock \ +pytest-csv \ +pytest-random-order \ +sentencepiece \ +wrapt \ +zarr \ +wandb \ +tensorstore==0.1.45 \ +pytest_mock \ +pybind11 \ +setuptools==69.5.1 \ +datasets \ +tiktoken \ +pynvml + +RUN pip3 install "huggingface_hub[cli]" +RUN python3 -m nltk.downloader punkt_tab + + +# Install Causal-Conv1d and its dependencies +WORKDIR ${STAGE_DIR} +ENV CAUSAL_CONV1D_FORCE_BUILD=TRUE +ENV MAMBA_FORCE_BUILD=TRUE +ENV HIP_ARCHITECTURES=${PYTORCH_ROCM_ARCH_OVERRIDE} +RUN git clone https://github.com/Dao-AILab/causal-conv1d causal-conv1d &&\ + cd causal-conv1d &&\ + git show --oneline -s &&\ + pip install . + +# Install mamba +WORKDIR ${STAGE_DIR} +RUN git clone https://github.com/state-spaces/mamba mamba &&\ + cd mamba &&\ + git show --oneline -s &&\ + pip install --no-build-isolation . + +# Clone TE repo and submodules +WORKDIR ${STAGE_DIR} +ENV NVTE_FRAMEWORK=pytorch +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH_OVERRIDE} +ENV NVTE_USE_HIPBLASLT=1 +RUN git clone --recursive https://github.com/ROCmSoftwarePlatform/TransformerEngine-private.git &&\ + cd TransformerEngine-private &&\ + pip install . + +WORKDIR $WORKSPACE_DIR +COPY . 
Megatron-LM +WORKDIR $WORKSPACE_DIR/Megatron-LM + +# record configuration for posterity +RUN pip list + diff --git a/Jenkinsfile b/Jenkinsfile old mode 100644 new mode 100755 index 7f73a1ce70..23d4dc5140 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -12,55 +12,46 @@ def show_node_info() { """ } -def clean_up_docker() { - sh 'docker ps -a || true' // "|| true" suppresses errors - sh 'docker kill $(docker ps -q) || true' - sh 'docker rm $(docker ps -a -q) || true' - sh 'docker rmi $(docker images -q) || true' - sh 'docker system prune -af --volumes || true' -} - -def clean_up_docker_container() { - sh 'docker ps -a || true' // "|| true" suppresses errors - sh 'docker kill $(docker ps -q) || true' -} +DOCKER_IMAGE = "megatron-lm" +CONTAINER_NAME = "megatron-lm-container" +DOCKER_BUILD_ARGS = "--build-arg PYTORCH_ROCM_ARCH_OVERRIDE=gfx90a" +DOCKER_RUN_ARGS = "-v \$(pwd):/workspace/Megatron-LM/output --workdir /workspace/Megatron-LM --entrypoint /workspace/Megatron-LM/run_unit_tests.sh" -//makes sure multiple builds are not triggered for branch indexing -def resetbuild() { - if(currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) { - def milestonesList = [] - def build = currentBuild - - while(build != null) { - if(build.getBuildCauses().toString().contains('BranchIndexingCause')) { - milestonesList.add(0, build.number) - } - build = build.previousBuildInProgress - } - - for (buildNum in milestonesList) { - milestone(buildNum) - } +DOCKER_RUN_CMD= "docker run --rm -t --network host -u root --group-add video --cap-add=SYS_PTRACE --cap-add SYS_ADMIN --device /dev/fuse --security-opt seccomp=unconfined --security-opt apparmor=unconfined --ipc=host --device=/dev/kfd --device=/dev/dri" +pipeline { + parameters { + string(name: 'TEST_NODE_LABEL', defaultValue: 'MI250', description: 'Node or Label to launch Jenkins Job') } -} -pipeline { - agent any + agent {node {label "${params.TEST_NODE_LABEL}"}} stages { - stage('Build') { + stage('Build Docker Image') { steps { - echo 'Building..' + show_node_info() + script { + sh "docker build -f Dockerfile_rocm_ci -t ${DOCKER_IMAGE} ${DOCKER_BUILD_ARGS} ." + } + } } - } - stage('Test') { + + stage('Run Docker Container') { steps { - echo 'Testing..' 
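
`DOCKER_IMAGE`, `DOCKER_RUN_ARGS` and `DOCKER_RUN_CMD` above already spell out everything the ROCm test container needs (KFD/DRI devices, host IPC, ptrace, and the run_unit_tests.sh entrypoint), so the pipeline's test step can be reproduced by hand; the sketch below assumes the image was built locally under the `megatron-lm` tag.

```bash
# manual equivalent of the pipeline's test step
docker run --rm -t --network host -u root --group-add video \
  --cap-add=SYS_PTRACE --cap-add SYS_ADMIN --device /dev/fuse \
  --security-opt seccomp=unconfined --security-opt apparmor=unconfined \
  --ipc=host --device=/dev/kfd --device=/dev/dri \
  -v $(pwd):/workspace/Megatron-LM/output --workdir /workspace/Megatron-LM \
  --entrypoint /workspace/Megatron-LM/run_unit_tests.sh \
  --name megatron-lm-container megatron-lm
```
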
+ script { + wrap([$class: 'AnsiColorBuildWrapper', 'colorMapName': 'xterm']) { + sh "${DOCKER_RUN_CMD} ${DOCKER_RUN_ARGS} --name ${CONTAINER_NAME} ${DOCKER_IMAGE} " + } + } } } - stage('Deploy') { - steps { - show_node_info() + } + + post { + always { + //Cleanup + archiveArtifacts artifacts: 'test_report.csv' + script { + sh "docker rmi ${DOCKER_IMAGE}" } } } diff --git a/run_unit_tests.sh b/run_unit_tests.sh new file mode 100755 index 0000000000..c7317ad4a4 --- /dev/null +++ b/run_unit_tests.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -x +export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +torchrun --nproc_per_node=8 -m pytest --color=yes -m "not flaky and not internal and not failing_on_rocm_mi250 and not failing_on_rocm" --csv output/test_report.csv tests/unit_tests/ \ No newline at end of file From 955bdfa90bf2e953ac1d98baed4a4e95bbc29c9e Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Thu, 7 Nov 2024 21:33:03 +0530 Subject: [PATCH 14/30] address review comments --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 23d4dc5140..751199278e 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -35,7 +35,7 @@ pipeline { } } - stage('Run Docker Container') { + stage('Run Unit Tests') { steps { script { wrap([$class: 'AnsiColorBuildWrapper', 'colorMapName': 'xterm']) { From 328fe811fdce6510ab25b492375dd341ba1974c6 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Fri, 8 Nov 2024 00:34:14 +0530 Subject: [PATCH 15/30] address review comments --- Dockerfile_rocm_ci => Dockerfile_rocm.ci | 4 ++-- tests/unit_tests/dist_checkpointing/test_fp8.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) rename Dockerfile_rocm_ci => Dockerfile_rocm.ci (91%) mode change 100644 => 100755 tests/unit_tests/dist_checkpointing/test_fp8.py diff --git a/Dockerfile_rocm_ci b/Dockerfile_rocm.ci similarity index 91% rename from Dockerfile_rocm_ci rename to Dockerfile_rocm.ci index 56cc92892e..376b7d6b6b 100644 --- a/Dockerfile_rocm_ci +++ b/Dockerfile_rocm.ci @@ -57,8 +57,8 @@ WORKDIR ${STAGE_DIR} ENV NVTE_FRAMEWORK=pytorch ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH_OVERRIDE} ENV NVTE_USE_HIPBLASLT=1 -RUN git clone --recursive https://github.com/ROCmSoftwarePlatform/TransformerEngine-private.git &&\ - cd TransformerEngine-private &&\ +RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git &&\ + cd TransformerEngine &&\ pip install . 
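
With the Transformer Engine source switched to the public ROCm fork, the same install can be reproduced outside the image; a rough manual equivalent using the environment this Dockerfile sets (gfx942 is the default architecture baked into the file and an assumption for other GPUs):

```bash
# build Transformer Engine for ROCm from source, mirroring the Dockerfile steps
export NVTE_FRAMEWORK=pytorch
export PYTORCH_ROCM_ARCH=gfx942
export NVTE_USE_HIPBLASLT=1
git clone --recursive https://github.com/ROCm/TransformerEngine.git
cd TransformerEngine && pip install .
```
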
WORKDIR $WORKSPACE_DIR diff --git a/tests/unit_tests/dist_checkpointing/test_fp8.py b/tests/unit_tests/dist_checkpointing/test_fp8.py old mode 100644 new mode 100755 index 33c19465eb..9f29626acb --- a/tests/unit_tests/dist_checkpointing/test_fp8.py +++ b/tests/unit_tests/dist_checkpointing/test_fp8.py @@ -18,7 +18,6 @@ class TestFP8: - @pytest.mark.parametrize('dtype', ['bf16', 'fp16', 'fp8']) @pytest.mark.parametrize('src_rank', [0, 6]) @pytest.mark.failing_on_rocm From b142a986f6be9af9d3df9fd8767a0d390e779731 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Fri, 8 Nov 2024 00:49:54 +0530 Subject: [PATCH 16/30] update dockerfile --- Dockerfile_amd => Dockerfile_rocm.dev | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) rename Dockerfile_amd => Dockerfile_rocm.dev (84%) diff --git a/Dockerfile_amd b/Dockerfile_rocm.dev similarity index 84% rename from Dockerfile_amd rename to Dockerfile_rocm.dev index 2ad89f60df..8915e5423a 100644 --- a/Dockerfile_amd +++ b/Dockerfile_rocm.dev @@ -1,12 +1,12 @@ ARG BASE_DOCKER=rocm/pytorch:latest FROM $BASE_DOCKER +ARG PYTORCH_ROCM_ARCH_OVERRIDE="gfx942" ENV WORKSPACE_DIR=/workspace ENV STAGE_DIR=/workspace/installs RUN mkdir -p $WORKSPACE_DIR RUN mkdir -p ${STAGE_DIR} WORKDIR $WORKSPACE_DIR - RUN pip3 install \ numpy==1.26.4 \ scipy \ @@ -25,13 +25,12 @@ wandb \ tensorstore==0.1.45 \ pytest_mock \ pybind11 \ -wrapt \ setuptools==69.5.1 \ datasets \ tiktoken \ pynvml -RUN pip3 install "huggingface_hub[cli]" +RUN pip3 install "huggingface_hub[cli]" RUN python3 -m nltk.downloader punkt_tab @@ -39,7 +38,7 @@ RUN python3 -m nltk.downloader punkt_tab WORKDIR ${STAGE_DIR} ENV CAUSAL_CONV1D_FORCE_BUILD=TRUE ENV MAMBA_FORCE_BUILD=TRUE -ENV HIP_ARCHITECTURES="gfx942" +ENV HIP_ARCHITECTURES=${PYTORCH_ROCM_ARCH_OVERRIDE} RUN git clone https://github.com/Dao-AILab/causal-conv1d causal-conv1d &&\ cd causal-conv1d &&\ git show --oneline -s &&\ @@ -49,17 +48,16 @@ RUN git clone https://github.com/Dao-AILab/causal-conv1d causal-conv1d &&\ WORKDIR ${STAGE_DIR} RUN git clone https://github.com/state-spaces/mamba mamba &&\ cd mamba &&\ - git checkout bc84fb1 &&\ git show --oneline -s &&\ pip install --no-build-isolation . # Clone TE repo and submodules WORKDIR ${STAGE_DIR} -ENV NVTE_FRAMEWORK=pytorch -ENV PYTORCH_ROCM_ARCH=gfx942 +ENV NVTE_FRAMEWORK=pytorch +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH_OVERRIDE} ENV NVTE_USE_HIPBLASLT=1 -RUN git clone --recursive https://github.com/ROCmSoftwarePlatform/TransformerEngine-private.git &&\ - cd TransformerEngine-private &&\ +RUN git clone --recursive https://github.com/ROCmSoftwarePlatform/TransformerEngine &&\ + cd TransformerEngine &&\ pip install . 
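
Dockerfile_rocm.dev now takes the GPU architecture as a build argument instead of hard-coding gfx942, so targeting MI250-class GPUs is only a different `--build-arg`; the image tag below is an assumption.

```bash
# dev image for MI250-class GPUs (gfx90a); omit the build-arg for the gfx942 default
docker build -f Dockerfile_rocm.dev --build-arg PYTORCH_ROCM_ARCH_OVERRIDE=gfx90a -t megatron-lm-dev .
```
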
WORKDIR $WORKSPACE_DIR From a533cceaf077ec49bba7898d501a25d53a1ecd80 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Fri, 8 Nov 2024 03:13:24 +0530 Subject: [PATCH 17/30] update script --- examples/llama2/parse_logs.py | 15 ++++ .../prepare_bookcorpus_megatron_dataset.py | 14 ++++ examples/llama2/prepare_dataset.sh | 18 +++++ .../llama2/train_llama2.sh | 81 ++++--------------- 4 files changed, 62 insertions(+), 66 deletions(-) create mode 100644 examples/llama2/parse_logs.py create mode 100644 examples/llama2/prepare_bookcorpus_megatron_dataset.py create mode 100644 examples/llama2/prepare_dataset.sh rename train_llama2.sh => examples/llama2/train_llama2.sh (70%) diff --git a/examples/llama2/parse_logs.py b/examples/llama2/parse_logs.py new file mode 100644 index 0000000000..1fddaeb8a4 --- /dev/null +++ b/examples/llama2/parse_logs.py @@ -0,0 +1,15 @@ +import argparse +import numpy as np + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Process Log") + parser.add_argument("filename") + args = parser.parse_args() + + with open(args.filename) as f: + lines = f.readlines() + lines = lines[1:-1] + lines = [float(a) for a in lines] + mean = np.mean(np.array(lines)) + print(mean)' \ No newline at end of file diff --git a/examples/llama2/prepare_bookcorpus_megatron_dataset.py b/examples/llama2/prepare_bookcorpus_megatron_dataset.py new file mode 100644 index 0000000000..449d41dfa7 --- /dev/null +++ b/examples/llama2/prepare_bookcorpus_megatron_dataset.py @@ -0,0 +1,14 @@ +import argparse +from pathlib import Path +from datasets import load_dataset + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--out-dir", type=str, required=False, default="tmp/data", + help="Path to output JSON") + args = parser.parse_args() + out_dir = Path(args.out_dir) + out_dir.mkdir(exist_ok=True, parents=True) + + dataset = load_dataset("bookcorpus", split="train", trust_remote_code=True) + dataset.to_json(out_dir / "bookcorpus_megatron.json") \ No newline at end of file diff --git a/examples/llama2/prepare_dataset.sh b/examples/llama2/prepare_dataset.sh new file mode 100644 index 0000000000..d605eaa100 --- /dev/null +++ b/examples/llama2/prepare_dataset.sh @@ -0,0 +1,18 @@ +TMP_DIR="tmp" +mkdir -p $TMP_DIR +mkdir -p ${TMP_DIR}/data + +DATA_PATH="${TMP_DIR}/data/bookcorpus_text_sentence" +TOKENIZER_MODEL=$TMP/tokenizer.model + +# Download the tokenizer model +if ! 
[ -f "$TOKENIZER_MODEL" ]; then +wget -O $TOKENIZER_MODEL https://huggingface.co/NousResearch/Llama-2-7b-chat-hf/resolve/main/tokenizer.model +fi + +python3 prepare_bookcorpus_megatron_dataset.py --out-dir ${DATA_PATH} +python3 tools/preprocess_data.py --input ${DATA_PATH}/bookcorpus_megatron.json --tokenizer-type GPTSentencePieceTokenizer \ +--tokenizer-model ${TOKENIZER_MODEL} --output-prefix ${DATA_PATH}/bookcorpus --workers `nproc` --split-sentences + +python3 tools/preprocess_data.py --input ${DATA_PATH}/bookcorpus_megatron.json --tokenizer-type GPTSentencePieceTokenizer \ +--tokenizer-model ${TOKENIZER_MODEL} --output-prefix ${DATA_PATH}/bookcorpus --workers `nproc` --split-sentences diff --git a/train_llama2.sh b/examples/llama2/train_llama2.sh similarity index 70% rename from train_llama2.sh rename to examples/llama2/train_llama2.sh index 557ebe8e5e..d2f21602bb 100755 --- a/train_llama2.sh +++ b/examples/llama2/train_llama2.sh @@ -18,7 +18,6 @@ done # Change for multinode config export CUDA_DEVICE_MAX_CONNECTIONS=1 -TEE_OUTPUT="${TEE_OUTPUT:-1}" USE_FLASH_ATTN="${USE_FLASH_ATTN:-1}" NO_TRAINING="${NO_TRAINING:-0}" # NO_TRAINING=1: for computing metrics only ENABLE_PROFILING="${ENABLE_PROFILING:-0}" @@ -53,54 +52,26 @@ CONTI_PARAMS="${CONTI_PARAMS:-0}" OPTIMIZER="${OPTIMIZER:-sgd}" TE_BF16="${TE_BF16:-1}" -EXPERIMENT_DIR="experiment" -mkdir -p $EXPERIMENT_DIR +TMP_DIR="tmp" -CHECKPOINT_PATH=$EXPERIMENT_DIR/ckpts -rm -rf $CHECKPOINT_PATH -mkdir -p $CHECKPOINT_PATH -DATA_DIR=$EXPERIMENT_DIR/data -mkdir -p $DATA_DIR +CHECKPOINT_PATH=${CHECKPOINT_PATH:-"$TMP/ckpts"} +mkdir -p ${CHECKPOINT_PATH} -TOKENIZER_MODEL=$EXPERIMENT_DIR/tokenizer.model +DATA_PATH=${DATA_PATH:-"$TMP_DIR/data/bookcorpus_text_sentence"} + +TOKENIZER_MODEL=$TMP/tokenizer.model # Download the tokenizer model if ! [ -f "$TOKENIZER_MODEL" ]; then wget -O $TOKENIZER_MODEL https://huggingface.co/NousResearch/Llama-2-7b-chat-hf/resolve/main/tokenizer.model fi -if ! [ "$ENABLE_MOCK_DATA" -eq 1 ]; then -# Prepare the dataset -echo 'import argparse -from pathlib import Path -from datasets import load_dataset - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--out-dir", type=str, required=False, default="tmp/data", - help="Path to output JSON") - args = parser.parse_args() - out_dir = Path(args.out_dir) - out_dir.mkdir(exist_ok=True, parents=True) - - dataset = load_dataset("bookcorpus", split="train", trust_remote_code=True) - dataset.to_json(out_dir / "bookcorpus_megatron.json")' > prepare_bookcorpus_megatron_dataset.py - -DATA_PATH=${DATA_DIR}/bookcorpus_text_sentence - - if ! [ -f "${DATA_DIR}/bookcorpus_text_sentence.idx" ]; then - echo "Dataset file does not exist, creating..." - python3 prepare_bookcorpus_megatron_dataset.py --out-dir ${DATA_DIR} - python3 tools/preprocess_data.py --input ${DATA_DIR}/bookcorpus_megatron.json --tokenizer-type GPTSentencePieceTokenizer --tokenizer-model ${EXPERIMENT_DIR}/tokenizer.model --output-prefix ${DATA_DIR}/bookcorpus --workers `nproc` --split-sentences - python3 tools/preprocess_data.py --input ${DATA_DIR}/bookcorpus_megatron.json --tokenizer-type GPTSentencePieceTokenizer --tokenizer-model ${EXPERIMENT_DIR}/tokenizer.model --output-prefix ${DATA_DIR}/bookcorpus --workers `nproc` --split-sentences - else - echo "Dataset file already exist." 
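
Dataset preparation now lives in examples/llama2/prepare_dataset.sh rather than inside the training script, so a run against real BookCorpus data becomes a two-step flow. The sketch below is run from the repository root (prepare_dataset.sh calls tools/preprocess_data.py with a relative path); the output prefix follows the script defaults, and passing ENABLE_MOCK_DATA=0 assumes the mock-data switch from the original script is still honored.

```bash
# one-time: fetch the Llama-2 tokenizer and build the BookCorpus .bin/.idx files under tmp/data
bash examples/llama2/prepare_dataset.sh

# then point the training script at the prepared prefix
DATA_PATH=tmp/data/bookcorpus_text_sentence ENABLE_MOCK_DATA=0 bash examples/llama2/train_llama2.sh
```
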
- fi -fi - MAX_POSITION_EMBEDDINGS=32768 - -TRAIN_LOG="${TEMP_DIR}/train_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_seq${SEQ_LENGTH}_optim_${OPTIMIZER}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_contiparam_${CONTI_PARAMS}_te_bg16_${TE_BF16}_${LABEL}.log" +LOG_NAME="${TMP_DIR}/train_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_\ + seq${SEQ_LENGTH}_optim_${OPTIMIZER}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_\ + contiparam_${CONTI_PARAMS}_te_bg16_${TE_BF16}_${LABEL}.log" +TRAIN_LOG="${LOG_NAME}.log" +PROFILING_DIR="profile_${LOG_NAME}" echo $TRAIN_LOG @@ -144,8 +115,6 @@ fi GROUP_SIZE=$(( ${NUM_HEADS} / ${NUM_KV_HEADS} )) NUM_GROUPS=$(( ${NUM_HEADS} / ${GROUP_SIZE} )) -PROFILING_DIR="${EXPERIMENT_DIR}/perf_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_seq${SEQ_LENGTH}_optim_${OPTIMIZER}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_contiparam_${CONTI_PARAMS}" - GPT_ARGS=" --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ @@ -276,44 +245,24 @@ echo 'torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ " fi -if [ "$TEE_OUTPUT" -eq 0 ]; then - run_cmd="$run_cmd >& $TRAIN_LOG" -else - run_cmd="$run_cmd |& tee $TRAIN_LOG" -fi +run_cmd="$run_cmd |& tee $TRAIN_LOG" if [ "$NO_TRAINING" -eq 0 ]; then eval $run_cmd fi -echo 'import argparse -import numpy as np - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="Process Log") - parser.add_argument("filename") - args = parser.parse_args() - - with open(args.filename) as f: - lines = f.readlines() - lines = lines[1:-1] - lines = [float(a) for a in lines] - mean = np.mean(np.array(lines)) - print(mean)' > mean_log_value.py - # echo '============================================================================================================' grep -Eo 'throughput per GPU [^|]*' $TRAIN_LOG | sed -E 's/.*throughput per GPU \(TFLOP\/s\/GPU\): ([0-9\.]+).*/\1/' > tmp.txt echo "throughput per GPU: $(python mean_log_value.py tmp.txt)" |& tee -a $TRAIN_LOG -THROUGHPUT=$(python mean_log_value.py tmp.txt) +THROUGHPUT=$(python parse_logs.py tmp.txt) rm tmp.txt # echo '============================================================================================================' grep -Eo 'elapsed time per iteration [^|]*' $TRAIN_LOG | sed -E 's/.*elapsed time per iteration \(ms\): ([0-9\.]+).*/\1/' > tmp.txt -echo "elapsed time per iteration: $(python mean_log_value.py tmp.txt)" |& tee -a $TRAIN_LOG +echo "elapsed time per iteration: $(python parse_logs.py tmp.txt)" |& tee -a $TRAIN_LOG -TIME_PER_ITER=$(python mean_log_value.py tmp.txt 2>/dev/null | awk '{printf "%.6f", $0}') +TIME_PER_ITER=$(python parse_logs.py tmp.txt 2>/dev/null | awk '{printf "%.6f", $0}') PERFORMANCE=$(awk -v bs="$BS" -v sl="$SEQ_LENGTH" -v tpi="$TIME_PER_ITER" -v ws="$WORLD_SIZE" 'BEGIN {printf "%.6f", bs * sl * 1000/ (tpi * ws)}') echo "tokens/GPU/s: $PERFORMANCE" |& tee -a $TRAIN_LOG rm tmp.txt \ No newline at end of file From 5ce68ce09593f8158f0454fd1e1c041d25d274a7 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Fri, 8 Nov 2024 04:19:46 +0530 Subject: [PATCH 18/30] fix docker filename --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 751199278e..3d56d9ef5c 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -30,7 +30,7 @@ pipeline { steps { show_node_info() script { - sh "docker build -f Dockerfile_rocm_ci -t ${DOCKER_IMAGE} ${DOCKER_BUILD_ARGS} ." 
+ sh "docker build -f Dockerfile_rocm.ci -t ${DOCKER_IMAGE} ${DOCKER_BUILD_ARGS} ." } } } From 77113cc9d9e0634241036164e6feeafecf5e99ec Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Fri, 8 Nov 2024 03:58:22 +0530 Subject: [PATCH 19/30] update --- Dockerfile_rocm.dev | 2 +- examples/llama2/mean_log_value.py | 13 +++++++++++ examples/llama2/parse_logs.py | 15 ------------ .../prepare_bookcorpus_megatron_dataset.py | 0 examples/llama2/prepare_dataset.sh | 4 ++-- examples/llama2/train_llama2.sh | 23 ++++++++++--------- run_examples.sh | 6 ----- 7 files changed, 28 insertions(+), 35 deletions(-) create mode 100755 examples/llama2/mean_log_value.py delete mode 100644 examples/llama2/parse_logs.py mode change 100644 => 100755 examples/llama2/prepare_bookcorpus_megatron_dataset.py mode change 100644 => 100755 examples/llama2/prepare_dataset.sh delete mode 100755 run_examples.sh diff --git a/Dockerfile_rocm.dev b/Dockerfile_rocm.dev index 8915e5423a..9869d93ffe 100644 --- a/Dockerfile_rocm.dev +++ b/Dockerfile_rocm.dev @@ -56,7 +56,7 @@ WORKDIR ${STAGE_DIR} ENV NVTE_FRAMEWORK=pytorch ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH_OVERRIDE} ENV NVTE_USE_HIPBLASLT=1 -RUN git clone --recursive https://github.com/ROCmSoftwarePlatform/TransformerEngine &&\ +RUN git clone --recursive https://github.com/ROCm/TransformerEngine &&\ cd TransformerEngine &&\ pip install . diff --git a/examples/llama2/mean_log_value.py b/examples/llama2/mean_log_value.py new file mode 100755 index 0000000000..46cd6732ec --- /dev/null +++ b/examples/llama2/mean_log_value.py @@ -0,0 +1,13 @@ +import argparse +import numpy as np + +parser = argparse.ArgumentParser(prog="Process Log") +parser.add_argument("filename") +args = parser.parse_args() + +with open(args.filename) as f: + lines = f.readlines() +lines = lines[1:-1] +lines = [float(a) for a in lines] +mean = np.mean(np.array(lines)) +print(mean) \ No newline at end of file diff --git a/examples/llama2/parse_logs.py b/examples/llama2/parse_logs.py deleted file mode 100644 index 1fddaeb8a4..0000000000 --- a/examples/llama2/parse_logs.py +++ /dev/null @@ -1,15 +0,0 @@ -import argparse -import numpy as np - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="Process Log") - parser.add_argument("filename") - args = parser.parse_args() - - with open(args.filename) as f: - lines = f.readlines() - lines = lines[1:-1] - lines = [float(a) for a in lines] - mean = np.mean(np.array(lines)) - print(mean)' \ No newline at end of file diff --git a/examples/llama2/prepare_bookcorpus_megatron_dataset.py b/examples/llama2/prepare_bookcorpus_megatron_dataset.py old mode 100644 new mode 100755 diff --git a/examples/llama2/prepare_dataset.sh b/examples/llama2/prepare_dataset.sh old mode 100644 new mode 100755 index d605eaa100..8f16ea0fda --- a/examples/llama2/prepare_dataset.sh +++ b/examples/llama2/prepare_dataset.sh @@ -2,8 +2,8 @@ TMP_DIR="tmp" mkdir -p $TMP_DIR mkdir -p ${TMP_DIR}/data -DATA_PATH="${TMP_DIR}/data/bookcorpus_text_sentence" -TOKENIZER_MODEL=$TMP/tokenizer.model +DATA_PATH="${TMP_DIR}/data" +TOKENIZER_MODEL=${TMP_DIR}/tokenizer.model # Download the tokenizer model if ! 
[ -f "$TOKENIZER_MODEL" ]; then diff --git a/examples/llama2/train_llama2.sh b/examples/llama2/train_llama2.sh index d2f21602bb..e5e8e4bfce 100755 --- a/examples/llama2/train_llama2.sh +++ b/examples/llama2/train_llama2.sh @@ -46,7 +46,7 @@ PP="${PP:-1}" MBS="${MBS:-2}" BS="${BS:-8}" SEQ_LENGTH="${SEQ_LENGTH:-4096}" -TOTAL_ITERS="${TOTAL_ITERS:-4}" +TOTAL_ITERS="${TOTAL_ITERS:-20}" SEQ_PARALLEL="${SEQ_PARALLEL:-1}" CONTI_PARAMS="${CONTI_PARAMS:-0}" OPTIMIZER="${OPTIMIZER:-sgd}" @@ -251,18 +251,19 @@ if [ "$NO_TRAINING" -eq 0 ]; then eval $run_cmd fi - +MEAN_LOG_SCRIPT=examples/llama2/mean_log_value.py +TMP_FILE=${TMP_DIR}/tmp.txt # echo '============================================================================================================' -grep -Eo 'throughput per GPU [^|]*' $TRAIN_LOG | sed -E 's/.*throughput per GPU \(TFLOP\/s\/GPU\): ([0-9\.]+).*/\1/' > tmp.txt -echo "throughput per GPU: $(python mean_log_value.py tmp.txt)" |& tee -a $TRAIN_LOG -THROUGHPUT=$(python parse_logs.py tmp.txt) -rm tmp.txt +grep -Eo 'throughput per GPU [^|]*' $TRAIN_LOG | sed -E 's/.*throughput per GPU \(TFLOP\/s\/GPU\): ([0-9\.]+).*/\1/' > $TMP_FILE +THROUGHPUT=$(python ${MEAN_LOG_SCRIPT} ${TMP_FILE}) +echo "throughput per GPU (TFLOPs/GPU): ${THROUGHPUT}" +rm $TMP_FILE # echo '============================================================================================================' -grep -Eo 'elapsed time per iteration [^|]*' $TRAIN_LOG | sed -E 's/.*elapsed time per iteration \(ms\): ([0-9\.]+).*/\1/' > tmp.txt -echo "elapsed time per iteration: $(python parse_logs.py tmp.txt)" |& tee -a $TRAIN_LOG +grep -Eo 'elapsed time per iteration [^|]*' $TRAIN_LOG | sed -E 's/.*elapsed time per iteration \(ms\): ([0-9\.]+).*/\1/' > $TMP_FILE +TIME_PER_ITER=$(python ${MEAN_LOG_SCRIPT} ${TMP_FILE} 2>/dev/null | awk '{printf "%.6f", $0}') +echo "elapsed time per iteration: ${TIME_PER_ITER}" +rm $TMP_FILE -TIME_PER_ITER=$(python parse_logs.py tmp.txt 2>/dev/null | awk '{printf "%.6f", $0}') PERFORMANCE=$(awk -v bs="$BS" -v sl="$SEQ_LENGTH" -v tpi="$TIME_PER_ITER" -v ws="$WORLD_SIZE" 'BEGIN {printf "%.6f", bs * sl * 1000/ (tpi * ws)}') -echo "tokens/GPU/s: $PERFORMANCE" |& tee -a $TRAIN_LOG -rm tmp.txt \ No newline at end of file +echo "tokens/GPU/s: $PERFORMANCE" diff --git a/run_examples.sh b/run_examples.sh deleted file mode 100755 index bdfb09889d..0000000000 --- a/run_examples.sh +++ /dev/null @@ -1,6 +0,0 @@ -bash train_llama2.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 DISABLE_ROPE_TE=0 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="te_rope_en_fused" - -#bash train_llama2.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=0 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=0 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_disabled" - - -#bash train_llama2.sh ENABLE_MOCK_DATA=0 ENABLE_ROPE=1 DUMMY_RUN=0 ADD_TASK=0 MODEL_SIZE=70 ENABLE_PROFILING=1 TP=8 PP=1 BS=8 MBS=2 TOTAL_ITERS=20 LABEL="rope_en_fused_profile" From 1e64046b302936dccfb422f8e864c55730cac0d7 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami Date: Fri, 8 Nov 2024 13:10:36 -0600 Subject: [PATCH 20/30] remove commented lines --- examples/llama2/train_llama2.sh | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/examples/llama2/train_llama2.sh b/examples/llama2/train_llama2.sh index e5e8e4bfce..cfa3ab5e33 100755 --- a/examples/llama2/train_llama2.sh +++ b/examples/llama2/train_llama2.sh @@ -15,9 +15,6 @@ do export "$KEY"="$VALUE" done -# Change for multinode config -export 
CUDA_DEVICE_MAX_CONNECTIONS=1 - USE_FLASH_ATTN="${USE_FLASH_ATTN:-1}" NO_TRAINING="${NO_TRAINING:-0}" # NO_TRAINING=1: for computing metrics only ENABLE_PROFILING="${ENABLE_PROFILING:-0}" @@ -34,6 +31,7 @@ CWD=`pwd` GPUS_PER_NODE=`python -c "import torch; print(torch.cuda.device_count())"` # Change for multinode config +export CUDA_DEVICE_MAX_CONNECTIONS=1 MASTER_ADDR=${MASTER_ADDR:-localhost} MASTER_PORT=${MASTER_PORT:-23731} NNODES=${NNODES:-1} @@ -146,7 +144,6 @@ GPT_ARGS=" --no-masked-softmax-fusion \ --overlap-grad-reduce \ " - # --no-masked-softmax-fusion \ DATA_ARGS=" --tokenizer-type Llama2Tokenizer \ @@ -162,9 +159,6 @@ OUTPUT_ARGS=" --eval-iters -1 " - # --save-interval $TOTAL_ITERS \ - # --eval-interval $TOTAL_ITERS \ - DISTRIBUTED_ARGS=" --nproc_per_node $GPUS_PER_NODE \ --nnodes $NNODES \ @@ -253,13 +247,13 @@ fi MEAN_LOG_SCRIPT=examples/llama2/mean_log_value.py TMP_FILE=${TMP_DIR}/tmp.txt -# echo '============================================================================================================' + grep -Eo 'throughput per GPU [^|]*' $TRAIN_LOG | sed -E 's/.*throughput per GPU \(TFLOP\/s\/GPU\): ([0-9\.]+).*/\1/' > $TMP_FILE THROUGHPUT=$(python ${MEAN_LOG_SCRIPT} ${TMP_FILE}) echo "throughput per GPU (TFLOPs/GPU): ${THROUGHPUT}" rm $TMP_FILE -# echo '============================================================================================================' + grep -Eo 'elapsed time per iteration [^|]*' $TRAIN_LOG | sed -E 's/.*elapsed time per iteration \(ms\): ([0-9\.]+).*/\1/' > $TMP_FILE TIME_PER_ITER=$(python ${MEAN_LOG_SCRIPT} ${TMP_FILE} 2>/dev/null | awk '{printf "%.6f", $0}') echo "elapsed time per iteration: ${TIME_PER_ITER}" From 106519d075be410f81e975efd7ef9fda29497a84 Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami <143527450+gurpreet-dhami@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:40:47 -0500 Subject: [PATCH 21/30] Use mi300 for ci pipeline (#21) * Use mi300 for ci pipeline Use mi300 and implemented the build only node and test node functionality. * correct the artifact * make improvements * add unique docker image id * update --- Dockerfile_rocm.ci | 2 +- Dockerfile_rocm.dev | 1 - Jenkinsfile | 88 +++++++++++++++++++++++++++++++-------------- run_unit_tests.sh | 9 ++++- 4 files changed, 70 insertions(+), 30 deletions(-) diff --git a/Dockerfile_rocm.ci b/Dockerfile_rocm.ci index 376b7d6b6b..253b446a29 100644 --- a/Dockerfile_rocm.ci +++ b/Dockerfile_rocm.ci @@ -9,7 +9,6 @@ RUN mkdir -p ${STAGE_DIR} WORKDIR $WORKSPACE_DIR RUN pip3 install \ -numpy==1.26.4 \ scipy \ einops \ flask-restful \ @@ -64,6 +63,7 @@ RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git &&\ WORKDIR $WORKSPACE_DIR COPY . Megatron-LM WORKDIR $WORKSPACE_DIR/Megatron-LM +RUN pip install -e . 
# record configuration for posterity RUN pip list diff --git a/Dockerfile_rocm.dev b/Dockerfile_rocm.dev index 9869d93ffe..d253193b67 100644 --- a/Dockerfile_rocm.dev +++ b/Dockerfile_rocm.dev @@ -8,7 +8,6 @@ RUN mkdir -p ${STAGE_DIR} WORKDIR $WORKSPACE_DIR RUN pip3 install \ -numpy==1.26.4 \ scipy \ einops \ flask-restful \ diff --git a/Jenkinsfile b/Jenkinsfile index 3d56d9ef5c..27e60f4db1 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -2,56 +2,90 @@ import org.apache.commons.io.FilenameUtils import groovy.json.JsonOutput -def show_node_info() { - sh """ - echo "NODE_NAME = \$NODE_NAME" || true - lsb_release -sd || true - uname -r || true - cat /sys/module/amdgpu/version || true - ls /opt/ -la || true - """ +def clean_up_docker_images() { + // Check if the images exist before attempting to remove them + def imageExists = sh(script: "docker images -q ${env.DOCKER_IMAGE}", returnStdout: true).trim() + if (imageExists) { + sh "docker rmi ${env.DOCKER_IMAGE}" + } } -DOCKER_IMAGE = "megatron-lm" -CONTAINER_NAME = "megatron-lm-container" -DOCKER_BUILD_ARGS = "--build-arg PYTORCH_ROCM_ARCH_OVERRIDE=gfx90a" -DOCKER_RUN_ARGS = "-v \$(pwd):/workspace/Megatron-LM/output --workdir /workspace/Megatron-LM --entrypoint /workspace/Megatron-LM/run_unit_tests.sh" +def clean_docker_build_cache() { + sh 'docker system prune -f --volumes || true' +} -DOCKER_RUN_CMD= "docker run --rm -t --network host -u root --group-add video --cap-add=SYS_PTRACE --cap-add SYS_ADMIN --device /dev/fuse --security-opt seccomp=unconfined --security-opt apparmor=unconfined --ipc=host --device=/dev/kfd --device=/dev/dri" pipeline { + agent { + label 'build-only' + } + parameters { - string(name: 'TEST_NODE_LABEL', defaultValue: 'MI250', description: 'Node or Label to launch Jenkins Job') + string(name: 'TEST_NODE_LABEL', defaultValue: 'MI300X_BANFF', description: 'Node or Label to launch Jenkins Job') + string(name: 'GPU_ARCH', defaultValue: 'gfx942', description: 'GPU Architecture') } - agent {node {label "${params.TEST_NODE_LABEL}"}} + environment { + REPO_NAME = 'rocm/megatron-lm' + CONTAINER_NAME = "megatron-lm-container" + DOCKER_RUN_ARGS = "-v \$(pwd):/workspace/Megatron-LM/output --workdir /workspace/Megatron-LM \ + --entrypoint /workspace/Megatron-LM/run_unit_tests.sh" + DOCKER_RUN_CMD = "docker run --rm -t --network host -u root --group-add video --cap-add=SYS_PTRACE \ + --cap-add SYS_ADMIN --device /dev/fuse --security-opt seccomp=unconfined --security-opt apparmor=unconfined \ + --ipc=host --device=/dev/kfd --device=/dev/dri" + } stages { stage('Build Docker Image') { steps { - show_node_info() + clean_docker_build_cache() script { - sh "docker build -f Dockerfile_rocm.ci -t ${DOCKER_IMAGE} ${DOCKER_BUILD_ARGS} ." + + // Generate a unique UUID for the Docker image name + def uuid = sh(script: 'uuidgen', returnStdout: true).trim() + env.DOCKER_IMAGE = "${REPO_NAME}:${uuid}" + + // Build Docker image + sh "docker build --no-cache -f Dockerfile_rocm.ci --build-arg PYTORCH_ROCM_ARCH_OVERRIDE=${params.GPU_ARCH} -t ${env.DOCKER_IMAGE} ." 
+ + withCredentials([usernamePassword(credentialsId: 'docker-hub-credentials', usernameVariable: 'DOCKER_USERNAME', passwordVariable: 'DOCKER_PASSWORD')]) { + sh "docker push ${env.DOCKER_IMAGE}" } } } + post { + always { + clean_up_docker_images() + } + } + } stage('Run Unit Tests') { + agent { + node { + label "${params.TEST_NODE_LABEL}" + } + } + steps { script { + // Pull the Docker image from the repository on the test node + withCredentials([usernamePassword(credentialsId: 'docker-hub-credentials', usernameVariable: 'DOCKER_USERNAME', passwordVariable: 'DOCKER_PASSWORD')]) { + sh "docker pull ${env.DOCKER_IMAGE}" + } + wrap([$class: 'AnsiColorBuildWrapper', 'colorMapName': 'xterm']) { - sh "${DOCKER_RUN_CMD} ${DOCKER_RUN_ARGS} --name ${CONTAINER_NAME} ${DOCKER_IMAGE} " + sh "${DOCKER_RUN_CMD} ${DOCKER_RUN_ARGS} --name ${env.CONTAINER_NAME} ${env.DOCKER_IMAGE}" } } } - } - } - - post { - always { - //Cleanup - archiveArtifacts artifacts: 'test_report.csv' - script { - sh "docker rmi ${DOCKER_IMAGE}" + post { + always { + // Archive test results + script { + archiveArtifacts artifacts: 'test_report.csv', allowEmptyArchive: true + clean_up_docker_images() + } + } } } } diff --git a/run_unit_tests.sh b/run_unit_tests.sh index c7317ad4a4..00540a6149 100755 --- a/run_unit_tests.sh +++ b/run_unit_tests.sh @@ -2,4 +2,11 @@ set -x export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -torchrun --nproc_per_node=8 -m pytest --color=yes -m "not flaky and not internal and not failing_on_rocm_mi250 and not failing_on_rocm" --csv output/test_report.csv tests/unit_tests/ \ No newline at end of file + +PYTEST_MARKERS="not flaky and not internal and not failing_on_rocm" + +if [[ "$HIP_ARCHITECTURES" == "gfx90a" ]]; then + PYTEST_MARKERS="$PYTEST_MARKERS and not failing_on_rocm_mi250" +fi + +torchrun --nproc_per_node=8 -m pytest --color=yes -m "$PYTEST_MARKERS" --csv output/test_report.csv tests/unit_tests/ \ No newline at end of file From d24a3643237e7160f172efeb064b4df50bfe100e Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami <143527450+gurpreet-dhami@users.noreply.github.com> Date: Mon, 18 Nov 2024 23:14:23 -0500 Subject: [PATCH 22/30] fix string error (#25) --- examples/llama2/train_llama2.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/llama2/train_llama2.sh b/examples/llama2/train_llama2.sh index cfa3ab5e33..7ce38702f4 100755 --- a/examples/llama2/train_llama2.sh +++ b/examples/llama2/train_llama2.sh @@ -65,9 +65,9 @@ wget -O $TOKENIZER_MODEL https://huggingface.co/NousResearch/Llama-2-7b-chat-hf/ fi MAX_POSITION_EMBEDDINGS=32768 -LOG_NAME="${TMP_DIR}/train_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_\ - seq${SEQ_LENGTH}_optim_${OPTIMIZER}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_\ - contiparam_${CONTI_PARAMS}_te_bg16_${TE_BF16}_${LABEL}.log" +LOG_NAME="${TMP_DIR}/train_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_"\ +"seq${SEQ_LENGTH}_optim_${OPTIMIZER}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_"\ +"contiparam_${CONTI_PARAMS}_te_bg16_${TE_BF16}_${LABEL}" TRAIN_LOG="${LOG_NAME}.log" PROFILING_DIR="profile_${LOG_NAME}" From b0d08df8b52d3f038993f61199f68f96b23b3821 Mon Sep 17 00:00:00 2001 From: wenchenvincent <32376000+wenchenvincent@users.noreply.github.com> Date: Thu, 21 Nov 2024 12:28:29 -0600 Subject: [PATCH 23/30] Enabled TEGroupedMLP test. (#22) * Enabled TEGroupedMLP test. * Revert "Enabled TEGroupedMLP test." This reverts commit 93fb2043b03e5de06997a2e0f3231dd267a2f673. 
* Enabled TEGroupedMLP test to run on ROCm. --- pytest.ini | 9 +++++---- run_unit_tests.sh | 4 ++-- tests/unit_tests/transformer/moe/test_grouped_mlp.py | 3 +++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pytest.ini b/pytest.ini index e6c58fccb7..cb6bfac7d4 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,9 +1,10 @@ # content of pytest.ini [pytest] markers = - internal: mark a test as a test to private/internal functions. - failing_on_rocm: Currently Failing Tests on Rocm - failing_on_rocm_mi250: Tests failing on MI250 + internal: Mark a test as a test to private/internal functions. + failing_on_rocm: Currently Failing Tests on ROCm. + failing_on_rocm_mi250: Tests failing on MI250. + test_on_rocm: Mark a test that we run on ROCm specifically. addopts = - --ignore tests/unit_tests/test_utilities.py \ No newline at end of file + --ignore tests/unit_tests/test_utilities.py diff --git a/run_unit_tests.sh b/run_unit_tests.sh index 00540a6149..cfa8e0ad3b 100755 --- a/run_unit_tests.sh +++ b/run_unit_tests.sh @@ -3,10 +3,10 @@ set -x export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -PYTEST_MARKERS="not flaky and not internal and not failing_on_rocm" +PYTEST_MARKERS="(not flaky and not internal and not failing_on_rocm or test_on_rocm)" if [[ "$HIP_ARCHITECTURES" == "gfx90a" ]]; then PYTEST_MARKERS="$PYTEST_MARKERS and not failing_on_rocm_mi250" fi -torchrun --nproc_per_node=8 -m pytest --color=yes -m "$PYTEST_MARKERS" --csv output/test_report.csv tests/unit_tests/ \ No newline at end of file +torchrun --nproc_per_node=8 -m pytest --color=yes -m "$PYTEST_MARKERS" --csv output/test_report.csv tests/unit_tests/ diff --git a/tests/unit_tests/transformer/moe/test_grouped_mlp.py b/tests/unit_tests/transformer/moe/test_grouped_mlp.py index 043bdc8c58..8638cf364b 100644 --- a/tests/unit_tests/transformer/moe/test_grouped_mlp.py +++ b/tests/unit_tests/transformer/moe/test_grouped_mlp.py @@ -278,6 +278,7 @@ def setup_method(self, method, use_cpu_initialization=False, swiglu=True): def teardown_method(self, method): Utils.destroy_model_parallel() + @pytest.mark.test_on_rocm @pytest.mark.internal def test_constructor(self): assert isinstance(self.sequential_mlp, MoELayer) @@ -313,6 +314,7 @@ def test_constructor(self): ) @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.test_on_rocm @pytest.mark.internal def test_gpu_forward_backward(self): self.sequential_mlp.cuda() @@ -356,6 +358,7 @@ def test_gpu_forward_backward(self): torch.testing.assert_close(smm_result, gmm_result) @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.test_on_rocm @pytest.mark.internal def test_gpu_forward_backward_with_no_tokens_allocated(self): """Test the case when no token is allocated for groupedGEMM kernels.""" From e8c908017648d8a8e356fe58dbd9b19688d45245 Mon Sep 17 00:00:00 2001 From: lizamd <161388580+lizamd@users.noreply.github.com> Date: Thu, 21 Nov 2024 19:13:26 -0800 Subject: [PATCH 24/30] Submit example/llama with llama2/3 scripts and readme (#19) * Updated training script for llama2 and llama3. Added README. 
* modification based on review feedback * removed experiement/ folder * modified scripts further * updated readme * last update * modifed output format * add copyright --------- Co-authored-by: Gurpreet Singh Dhami --- .../prepare_bookcorpus_megatron_dataset.py | 0 examples/{llama2 => llama}/prepare_dataset.sh | 0 examples/llama/readme.md | 131 ++++++++ examples/llama/train_llama2.sh | 307 +++++++++++++++++ examples/llama/train_llama3.sh | 308 ++++++++++++++++++ examples/llama2/mean_log_value.py | 13 - examples/llama2/train_llama2.sh | 263 --------------- 7 files changed, 746 insertions(+), 276 deletions(-) rename examples/{llama2 => llama}/prepare_bookcorpus_megatron_dataset.py (100%) rename examples/{llama2 => llama}/prepare_dataset.sh (100%) create mode 100644 examples/llama/readme.md create mode 100644 examples/llama/train_llama2.sh create mode 100644 examples/llama/train_llama3.sh delete mode 100755 examples/llama2/mean_log_value.py delete mode 100755 examples/llama2/train_llama2.sh diff --git a/examples/llama2/prepare_bookcorpus_megatron_dataset.py b/examples/llama/prepare_bookcorpus_megatron_dataset.py similarity index 100% rename from examples/llama2/prepare_bookcorpus_megatron_dataset.py rename to examples/llama/prepare_bookcorpus_megatron_dataset.py diff --git a/examples/llama2/prepare_dataset.sh b/examples/llama/prepare_dataset.sh similarity index 100% rename from examples/llama2/prepare_dataset.sh rename to examples/llama/prepare_dataset.sh diff --git a/examples/llama/readme.md b/examples/llama/readme.md new file mode 100644 index 0000000000..8d0c8704b7 --- /dev/null +++ b/examples/llama/readme.md @@ -0,0 +1,131 @@ +# Llama2/Llama3 Model Pretraining Instructions + +This guide provides the steps for setting up the environment and configuring the script to train Llama2 or Llama3 models. + +--- + +## 1. Environment Setup + +1. **Download Docker Image** + Download the Docker image required for training: + `docker pull ` + +2. **Launch Docker Container** + Start the Docker container: + `docker run -it ` + +--- + +## 2. How to Run + +### 2.1 Single Node Training +To run the training on a single node, go to Megatron-LM folder, use the following command: +```bash +TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh +``` + + +### 2.2 Multi-node Training +To run training on multiple nodes, launch the Docker container on each node. Follow these steps: + +- **On the Master Node:** + ```bash + TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh + ``` + +- **On the Slave Node(s):** + ```bash + TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh + ``` + +## 3. Configurations in Script (`Megatron/examples/llama`) + +### 3.1 Network Interface +Update the network interface in the script to match your system’s network interface. +To find your network interface, run (out of container): +```bash +ip a +``` +Then, update the following variables in the script: +```bash +export NCCL_SOCKET_IFNAME=ens50f0np0 +export GLOO_SOCKET_IFNAME=ens50f0np0 +``` + +### 3.2 Dataset +You can use either mock data or real data for training. 
+ +- **Mock Data:** + Replace the data path: + ```bash + --data-path $DATA_PATH \ with + --mock-data + ``` + +- **Real Data:** + Update the `DATA_PATH` to the location where your dataset is stored: + ```bash + DATA_DIR="/root/.cache/data" # Change to where your dataset is stored + DATA_PATH=${DATA_DIR}/bookcorpus_text_sentence + ``` + +### 3.3 Tokenizer + +- **For Llama2 Training:** + Use the `Llama2Tokenizer`. + +- **For Llama3 Training:** + Use the `HuggingFaceTokenizer`. Set the HuggingFace model link in the `TOKENIZER_MODEL` variable: + ```bash + TOKENIZER_MODEL=meta-llama/Llama-3.1-8B # For Llama3 + ``` + +### 3.4 Multi-node Training +If you're running multi-node training, update the following environment variables: + +- **Master Address:** + Change `localhost` to the master node's hostname: + ```bash + MASTER_ADDR="${MASTER_ADDR:-localhost}" + ``` + +- **Number of Nodes:** + Set the number of nodes you want to train on (e.g., 2, 4, 8): + ```bash + NNODES="${NNODES:-1}" + ``` + +- **Node Rank:** + Set the rank of each node (0 for master, 1 for the first slave node, etc.): + ```bash + NODE_RANK="${NODE_RANK:-0}" + ``` + +--- + +## 4. Key Variables to Pay Attention To + +- **TE_FP8:** + `0` for BP16 (default), `1` for FP8. + +- **GEMM_TUNING:** + `1` to enable GEMM tuning, which boosts performance by using the best GEMM kernels. + +- **USE_FLASH_ATTN:** + `1` to enable Flash Attention. + +- **ENABLE_PROFILING:** + `1` to enable PyTorch profiling for performance analysis. + +- **transformer-impl:** + `transformer_engine` to use the Transformer Engine (TE). Set to `local` if you want to disable TE. + +- **MODEL_SIZE:** + Set to `7B` or `70B` for Llama2, or `8B` or `70B` for Llama3/3.1. + +- **TOTAL_ITERS:** + Set the total number of iterations (default: 10). + +--- + +That's it! You've now set up the environment and configured the necessary settings for training Llama2 or Llama3 models. \ No newline at end of file diff --git a/examples/llama/train_llama2.sh b/examples/llama/train_llama2.sh new file mode 100644 index 0000000000..b9ce6ccbb8 --- /dev/null +++ b/examples/llama/train_llama2.sh @@ -0,0 +1,307 @@ +#!/bin/bash +############################################################################### +# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. 
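
Sections 2.2 and 3.4 of the README together imply that every node runs the same command and only NODE_RANK differs; a concrete two-node sketch (the hostname `node0` and the batch settings are assumptions) could look like this:

```bash
# master node (rank 0)
MASTER_ADDR=node0 NNODES=2 NODE_RANK=0 TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 SEQ_LENGTH=4096 \
  bash examples/llama/train_llama2.sh

# worker node (rank 1): same command, only the rank changes
MASTER_ADDR=node0 NNODES=2 NODE_RANK=1 TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 SEQ_LENGTH=4096 \
  bash examples/llama/train_llama2.sh
```
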
+################################################################################# +# set -x + +# set envs +export GPU_MAX_HW_QUEUES=2 +export TORCH_NCCL_HIGH_PRIORITY=1 +export NCCL_CHECKS_DISABLE=1 +export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 +export NCCL_IB_GID_INDEX=3 +export NCCL_CROSS_NIC=0 +export NCCL_SOCKET_IFNAME=ens50f0np0 # network interface +export GLOO_SOCKET_IFNAME=ens50f0np0 # network interface +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_PROTO=Simple +export RCCL_MSCCL_ENABLE=0 +export TOKENIZERS_PARALLELISM=false +export HSA_NO_SCRATCH_RECLAIM=1 + + +# parsing input arguments +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" +done + + +TIME_STAMP=$(date +"%Y-%m-%d_%H-%M-%S") +EXP_NAME="${EXP_NAME:-perf}" + +TEE_OUTPUT="${TEE_OUTPUT:-1}" +USE_FLASH_ATTN="${USE_FLASH_ATTN:-1}" +NO_TRAINING="${NO_TRAINING:-0}" # NO_TRAINING=1: for computing metrics only +ENABLE_PROFILING="${ENABLE_PROFILING:-0}" #enable pytorch profiling +ENABLE_ROPE="${ENABLE_ROPE:-1}" +DISABLE_ROPE_TE="${DISABLE_ROPE_TE:-0}" +echo "NO_TRAINING=$NO_TRAINING" + +CWD=`pwd` +GPUS_PER_NODE=`python3 -c "import torch; print(torch.cuda.device_count())"` + +# single node config, Change for multinode config +MASTER_ADDR="${MASTER_ADDR:-localhost}" +#MASTER_ADDR="${MASTER_ADDR:-tw015}" +MASTER_PORT="${MASTER_PORT:-6020}" +NNODES="${NNODES:-1}" +#NNODES="${NNODES:-2}" +NODE_RANK="${NODE_RANK:-0}" +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +MODEL_SIZE="${MODEL_SIZE:-70}" +TP="${TP:-8}" +PP="${PP:-1}" +CP="${CP:-1}" +MBS="${MBS:-2}" +BS="${BS:-8}" +SEQ_LENGTH="${SEQ_LENGTH:-4096}" +TOTAL_ITERS="${TOTAL_ITERS:-5}" +SEQ_PARALLEL="${SEQ_PARALLEL:-1}" +CONTI_PARAMS="${CONTI_PARAMS:-0}" +TE_FP8="${TE_FP8:-0}" # 0: disable FP8, 1: enable FP8 +GEMM_TUNING="${GEMM_TUNING:-1}" +MCORE="${MCORE:-1}" + +EXPERIMENT_DIR="experiment" +mkdir -p $EXPERIMENT_DIR +CHECKPOINT_PATH=${CHECKPOINT_PATH:-"$EXPERIMENT_DIR/ckpts"} + + +DATA_DIR="/root/.cache/data" # change to where the dataset is stored + +DATA_PATH=${DATA_PATH:-"$DATA_DIR/bookcorpus_text_sentence"} + +TOKENIZER_MODEL=$EXPERIMENT_DIR/tokenizer.model +# Download the tokenizer model +if ! [ -f "$TOKENIZER_MODEL" ]; then +wget -O $TOKENIZER_MODEL https://huggingface.co/NousResearch/Llama-2-7b-chat-hf/resolve/main/tokenizer.model +fi + +MAX_POSITION_EMBEDDINGS=128000 + +DEFAULT_LOG_DIR="${EXPERIMENT_DIR}/${NNODES}nodes_rank${NODE_RANK}_train_${MODEL_SIZE}B_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_cp${CP}_iter${TOTAL_ITERS}/TE_FP8_${TE_FP8}/${TIME_STAMP}" +LOG_DIR="${LOG_DIR:-${DEFAULT_LOG_DIR}}" +TRAIN_LOG="${LOG_DIR}/output_${EXP_NAME}.log" +mkdir -p $LOG_DIR +echo $TRAIN_LOG + +# gemm tuning +if [ "$GEMM_TUNING" -eq 1 ]; then + export TE_HIPBLASLT_TUNING_RUN_COUNT=10 + export TE_HIPBLASLT_TUNING_ALGO_COUNT=50 +fi + +if [ "$SEQ_LENGTH" -le 8192 ]; then + ds_works=8 +else + ds_works=24 +fi + +if [[ $MODEL_SIZE -eq 7 ]]; then #llama2-7B + HIDDEN_SIZE=4096 # e.g. llama-13b: 5120 + FFN_HIDDEN_SIZE=14336 # e.g. llama-13b: 13824 + NUM_LAYERS=32 # e.g. llama-13b: 40 + NUM_HEADS=32 # e.g. llama-13b: 40 + SEQ_LENGTH=$SEQ_LENGTH + NUM_KV_HEADS=8 # llama2 70B uses GQA +elif [[ $MODEL_SIZE -eq 70 ]]; then + HIDDEN_SIZE=8192 # e.g. llama-13b: 5120 + FFN_HIDDEN_SIZE=28672 # e.g. llama-13b: 13824 + NUM_LAYERS=80 # e.g. llama-13b: 40 + NUM_HEADS=64 # e.g. 
llama-13b: 40 + NUM_KV_HEADS=8 # llama3 70B uses GQA + SEQ_LENGTH=$SEQ_LENGTH + MAX_POSITION_EMBEDDINGS=$MAX_POSITION_EMBEDDINGS +else + echo "Model size not supported." + exit 1 +fi + +GROUP_SIZE=$(( ${NUM_HEADS} / ${NUM_KV_HEADS} )) +NUM_GROUPS=$(( ${NUM_HEADS} / ${GROUP_SIZE} )) + +PROFILING_DIR="${LOG_DIR}/trace_${EXP_NAME}" + +GPT_ARGS=" + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --context-parallel-size ${CP} \ + --num-layers $NUM_LAYERS \ + --hidden-size $HIDDEN_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --num-attention-heads $NUM_HEADS \ + --seq-length $SEQ_LENGTH \ + --max-position-embeddings $MAX_POSITION_EMBEDDINGS \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --no-position-embedding \ + --disable-bias-linear \ + --swiglu \ + --init-method-std 0.02 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --normalization RMSNorm \ + --micro-batch-size $MBS \ + --global-batch-size $BS \ + --train-iters $TOTAL_ITERS \ + --no-async-tensor-model-parallel-allreduce \ + --bf16 \ + --no-masked-softmax-fusion \ + --disable-bias-linear \ +" + +TRAIN_ARGS="--lr 1e-4 \ + --min-lr 1e-5 \ + --lr-decay-iters 320000 \ + --lr-decay-style cosine \ + --weight-decay 1.0e-1 \ + --clip-grad 1.0 \ + --optimizer adam \ +" +DATA_ARGS=" + --tokenizer-type Llama2Tokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --dataloader-type cyclic \ + --save-interval 200000 \ + --tensorboard-dir $LOG_DIR \ + --log-interval 1 \ + --eval-interval 320000 \ + --eval-iters 10 \ + --num-workers $ds_works \ + --mock-data +" +# --data-path $DATA_PATH \ +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 5000 \ + --log-throughput \ + --no-save-optim \ + --eval-iters -1 +" +# --save $CHECKPOINT_PATH \ + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT \ +" + +CKPT_LOAD_ARGS="--exit-on-missing-checkpoint \ + --no-load-optim \ + --use-checkpoint-args \ + --no-load-rng" + + +EXTRA_ARGS=" + --group-query-attention \ + --num-query-groups $NUM_GROUPS \ + --no-gradient-accumulation-fusion \ + --distributed-backend nccl \ + --distributed-timeout-minutes 120 \ + --use-distributed-optimizer \ + --overlap-param-gather \ + --overlap-grad-reduce \ +" + +if [ "$ENABLE_PROFILING" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --profile --use-pytorch-profiler --tensorboard-dir $LOG_DIR" +fi + +if [ "$USE_FLASH_ATTN" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --use-flash-attn" +fi + +if [ "$SEQ_PARALLEL" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --sequence-parallel" +fi + +if [ "$CONTI_PARAMS" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --use-contiguous-parameters-in-local-ddp" +fi + +if [ "$MCORE" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --use-mcore-models" +fi + +if [ "$ENABLE_ROPE" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --position-embedding-type rope" +fi + +if [ "$DISABLE_ROPE_TE" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --disable-te-fused-rope" +fi + +if [ "$TE_FP8" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --transformer-impl=transformer_engine \ + --fp8-margin=0 \ + --fp8-format=hybrid \ + --fp8-interval=1 \ + --fp8-amax-history-len=1024 \ + --fp8-amax-compute-algo=max \ + --attention-softmax-in-fp32 \ +" +fi + +run_cmd=" + torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + $EXTRA_ARGS \ + $TRAIN_ARGS \ +" + +if [ "$TEE_OUTPUT" -eq 0 ]; then + run_cmd="$run_cmd >& $TRAIN_LOG" +else + run_cmd="$run_cmd |& tee $TRAIN_LOG" +fi + 
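
TE_FP8 and GEMM_TUNING are read from the environment just like the other knobs in this script, so switching the same run to FP8 with hipBLASLt GEMM tuning is only a different invocation; the batch sizes below are carried over from the README example and are assumptions rather than tuned values.

```bash
# FP8 (hybrid format) run with hipBLASLt GEMM tuning enabled
TE_FP8=1 GEMM_TUNING=1 MBS=5 BS=120 TP=8 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
```
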
+if [ "$NO_TRAINING" -eq 0 ]; then + eval $run_cmd +fi + + +echo 'import argparse +import numpy as np + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Process Log") + parser.add_argument("filename") + args = parser.parse_args() + + with open(args.filename) as f: + lines = f.readlines() + lines = lines[2:-1] + lines = [float(a) for a in lines] + mean = np.mean(np.array(lines)) + print(mean)' > mean_log_value.py + + +# echo '============================================================================================================' +grep -Eo 'throughput per GPU [^|]*' $TRAIN_LOG | sed -E 's/.*throughput per GPU \(TFLOP\/s\/GPU\): ([0-9\.]+).*/\1/' > tmp.txt +PERFORMANCE=$(python3 mean_log_value.py tmp.txt) +echo "throughput per GPU: $PERFORMANCE" |& tee -a $TRAIN_LOG +rm tmp.txt + +# echo '============================================================================================================' +grep -Eo 'elapsed time per iteration [^|]*' $TRAIN_LOG | sed -E 's/.*elapsed time per iteration \(ms\): ([0-9\.]+).*/\1/' > tmp.txt +ETPI=$(python3 mean_log_value.py tmp.txt) +echo "elapsed time per iteration: $ETPI" |& tee -a $TRAIN_LOG + +TIME_PER_ITER=$(python3 mean_log_value.py tmp.txt 2>/dev/null | awk '{printf "%.6f", $0}') +TGS=$(awk -v bs="$BS" -v sl="$SEQ_LENGTH" -v tpi="$TIME_PER_ITER" -v ws="$WORLD_SIZE" 'BEGIN {printf "%.6f", bs * sl * 1000/ (tpi * ws)}') +echo "tokens/GPU/s: $TGS" |& tee -a $TRAIN_LOG +rm tmp.txt diff --git a/examples/llama/train_llama3.sh b/examples/llama/train_llama3.sh new file mode 100644 index 0000000000..08d070fe3c --- /dev/null +++ b/examples/llama/train_llama3.sh @@ -0,0 +1,308 @@ +#!/bin/bash +############################################################################### +# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. 
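
The Llama-3 script below relies on HuggingFaceTokenizer with TOKENIZER_MODEL=meta-llama/Llama-3.1-8B, a gated Hugging Face repository, so the container normally has to be authenticated before the first run; the token handling shown here is an assumption and not part of this patch.

```bash
# authenticate once per container so the gated Llama-3.1 tokenizer can be downloaded
huggingface-cli login --token "$HF_TOKEN"

MODEL_SIZE=8 SEQ_LENGTH=4096 bash examples/llama/train_llama3.sh
```
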
+################################################################################# +#set -x + +# set envs +export GPU_MAX_HW_QUEUES=2 +export TORCH_NCCL_HIGH_PRIORITY=1 +export NCCL_CHECKS_DISABLE=1 +export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 +export NCCL_IB_GID_INDEX=3 +export NCCL_CROSS_NIC=0 +export NCCL_SOCKET_IFNAME=ens50f0np0 # network interface +export GLOO_SOCKET_IFNAME=ens50f0np0 # network interface +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_PROTO=Simple +export RCCL_MSCCL_ENABLE=0 +export TOKENIZERS_PARALLELISM=false +export HSA_NO_SCRATCH_RECLAIM=1 + + +# parsing input arguments +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" +done + + +TIME_STAMP=$(date +"%Y-%m-%d_%H-%M-%S") +EXP_NAME="${EXP_NAME:-perf}" + +TEE_OUTPUT="${TEE_OUTPUT:-1}" +USE_FLASH_ATTN="${USE_FLASH_ATTN:-1}" +NO_TRAINING="${NO_TRAINING:-0}" # NO_TRAINING=1: for computing metrics only +ENABLE_PROFILING="${ENABLE_PROFILING:-0}" #enable pytorch profiling +ENABLE_ROPE="${ENABLE_ROPE:-1}" +DISABLE_ROPE_TE="${DISABLE_ROPE_TE:-0}" +echo "NO_TRAINING=$NO_TRAINING" + +CWD=`pwd` +GPUS_PER_NODE=`python3 -c "import torch; print(torch.cuda.device_count())"` + +# single node config, Change for multinode config +MASTER_ADDR="${MASTER_ADDR:-localhost}" +MASTER_PORT="${MASTER_PORT:-6000}" +NNODES="${NNODES:-1}" +NODE_RANK="${NODE_RANK:-0}" +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +MODEL_SIZE="${MODEL_SIZE:-70}" +TP="${TP:-8}" +PP="${PP:-1}" +CP="${CP:-1}" +MBS="${MBS:-2}" +BS="${BS:-8}" +SEQ_LENGTH="${SEQ_LENGTH:-2048}" +TOTAL_ITERS="${TOTAL_ITERS:-10}" +SEQ_PARALLEL="${SEQ_PARALLEL:-1}" +CONTI_PARAMS="${CONTI_PARAMS:-0}" +TE_FP8="${TE_FP8:-0}" # 0: disable FP8, 1: enable FP8 +GEMM_TUNING="${GEMM_TUNING:-1}" +MCORE="${MCORE:-1}" + +EXPERIMENT_DIR="experiment" +mkdir -p $EXPERIMENT_DIR +CHECKPOINT_PATH=${CHECKPOINT_PATH:-"$EXPERIMENT_DIR/ckpts"} + + +DATA_DIR="/root/.cache/data" # change to where the dataset is stored + +TOKENIZER_MODEL=meta-llama/Llama-3.1-8B +# Download the tokenizer model +# if ! [ -f "$TOKENIZER_MODEL" ]; then +# wget -O $TOKENIZER_MODEL https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/original/tokenizer.model +# fi + +DATA_PATH=${DATA_PATH:-"$DATA_DIR/bookcorpus_text_sentence"} + +MAX_POSITION_EMBEDDINGS=128000 + +DEFAULT_LOG_DIR="${EXPERIMENT_DIR}/${NNODES}nodes_rank${NODE_RANK}_train_${MODEL_SIZE}B_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_cp${CP}_iter${TOTAL_ITERS}/TE_FP8_${TE_FP8}/${TIME_STAMP}" +LOG_DIR="${LOG_DIR:-${DEFAULT_LOG_DIR}}" +TRAIN_LOG="${LOG_DIR}/output_${EXP_NAME}.log" +mkdir -p $LOG_DIR +echo $TRAIN_LOG + +# gemm tuning +if [ "$GEMM_TUNING" -eq 1 ]; then + export TE_HIPBLASLT_TUNING_RUN_COUNT=10 + export TE_HIPBLASLT_TUNING_ALGO_COUNT=50 +fi + +if [ "$SEQ_LENGTH" -le 8192 ]; then + ds_works=8 +else + ds_works=24 +fi + +if [[ $MODEL_SIZE -eq 8 ]]; then #llama2-7B + HIDDEN_SIZE=4096 # e.g. llama-13b: 5120 + FFN_HIDDEN_SIZE=14336 # e.g. llama-13b: 13824 + NUM_LAYERS=32 # e.g. llama-13b: 40 + NUM_HEADS=32 # e.g. llama-13b: 40 + SEQ_LENGTH=$SEQ_LENGTH + NUM_KV_HEADS=8 # llama2 70B uses GQA +elif [[ $MODEL_SIZE -eq 70 ]]; then + HIDDEN_SIZE=8192 # e.g. llama-13b: 5120 + FFN_HIDDEN_SIZE=28672 # e.g. llama-13b: 13824 + NUM_LAYERS=80 # e.g. llama-13b: 40 + NUM_HEADS=64 # e.g. llama-13b: 40 + NUM_KV_HEADS=8 # llama3 70B uses GQA + SEQ_LENGTH=$SEQ_LENGTH + MAX_POSITION_EMBEDDINGS=$MAX_POSITION_EMBEDDINGS +else + echo "Model size not supported." 
+ exit 1 +fi + +GROUP_SIZE=$(( ${NUM_HEADS} / ${NUM_KV_HEADS} )) +NUM_GROUPS=$(( ${NUM_HEADS} / ${GROUP_SIZE} )) + +PROFILING_DIR="${LOG_DIR}/trace_${EXP_NAME}" + +GPT_ARGS=" + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --context-parallel-size ${CP} \ + --num-layers $NUM_LAYERS \ + --hidden-size $HIDDEN_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --num-attention-heads $NUM_HEADS \ + --seq-length $SEQ_LENGTH \ + --max-position-embeddings $MAX_POSITION_EMBEDDINGS \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --no-position-embedding \ + --disable-bias-linear \ + --swiglu \ + --init-method-std 0.02 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --normalization RMSNorm \ + --micro-batch-size $MBS \ + --global-batch-size $BS \ + --train-iters $TOTAL_ITERS \ + --no-async-tensor-model-parallel-allreduce \ + --bf16 \ + --no-masked-softmax-fusion \ + --disable-bias-linear \ +" + +TRAIN_ARGS="--lr 1e-4 \ + --min-lr 1e-5 \ + --lr-decay-iters 320000 \ + --lr-decay-style cosine \ + --weight-decay 1.0e-1 \ + --clip-grad 1.0 \ + --optimizer adam \ +" + +DATA_ARGS=" + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --dataloader-type cyclic \ + --save-interval 200000 \ + --tensorboard-dir $LOG_DIR \ + --log-interval 1 \ + --eval-interval 320000 \ + --eval-iters 10 \ + --num-workers $ds_works \ + --mock-data +" +#--data-path $DATA_PATH \ +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 5000 \ + --log-throughput \ + --no-save-optim \ + --eval-iters -1 +" +# --save $CHECKPOINT_PATH \ + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT \ +" + +CKPT_LOAD_ARGS="--exit-on-missing-checkpoint \ + --no-load-optim \ + --use-checkpoint-args \ + --no-load-rng" + + +EXTRA_ARGS=" + --group-query-attention \ + --num-query-groups $NUM_GROUPS \ + --no-gradient-accumulation-fusion \ + --distributed-backend nccl \ + --distributed-timeout-minutes 120 \ + --use-distributed-optimizer \ + --overlap-param-gather \ + --overlap-grad-reduce \ +" + +if [ "$ENABLE_PROFILING" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --profile --use-pytorch-profiler --tensorboard-dir $LOG_DIR" +fi + +if [ "$USE_FLASH_ATTN" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --use-flash-attn" +fi + +if [ "$SEQ_PARALLEL" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --sequence-parallel" +fi + +if [ "$CONTI_PARAMS" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --use-contiguous-parameters-in-local-ddp" +fi + +if [ "$MCORE" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --use-mcore-models" +fi + +if [ "$ENABLE_ROPE" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --position-embedding-type rope" +fi + +if [ "$DISABLE_ROPE_TE" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --disable-te-fused-rope" +fi + +if [ "$TE_FP8" -eq 1 ]; then +EXTRA_ARGS="$EXTRA_ARGS --transformer-impl=transformer_engine \ + --fp8-margin=0 \ + --fp8-format=hybrid \ + --fp8-interval=1 \ + --fp8-amax-history-len=1024 \ + --fp8-amax-compute-algo=max \ + --attention-softmax-in-fp32 \ +" +fi + +run_cmd=" + torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + $EXTRA_ARGS \ + $TRAIN_ARGS \ +" + +if [ "$TEE_OUTPUT" -eq 0 ]; then + run_cmd="$run_cmd >& $TRAIN_LOG" +else + run_cmd="$run_cmd |& tee $TRAIN_LOG" +fi + +if [ "$NO_TRAINING" -eq 0 ]; then + eval $run_cmd +fi + + +echo 'import argparse +import numpy as np + +if __name__ == "__main__": + parser = 
argparse.ArgumentParser( + prog="Process Log") + parser.add_argument("filename") + args = parser.parse_args() + + with open(args.filename) as f: + lines = f.readlines() + lines = lines[2:-1] + lines = [float(a) for a in lines] + mean = np.mean(np.array(lines)) + print(mean)' > mean_log_value.py + + +# echo '============================================================================================================' +grep -Eo 'throughput per GPU [^|]*' $TRAIN_LOG | sed -E 's/.*throughput per GPU \(TFLOP\/s\/GPU\): ([0-9\.]+).*/\1/' > tmp.txt +PERFORMANCE=$(python3 mean_log_value.py tmp.txt) +echo "throughput per GPU: $PERFORMANCE" |& tee -a $TRAIN_LOG +rm tmp.txt + +# echo '============================================================================================================' +grep -Eo 'elapsed time per iteration [^|]*' $TRAIN_LOG | sed -E 's/.*elapsed time per iteration \(ms\): ([0-9\.]+).*/\1/' > tmp.txt +ETPI=$(python3 mean_log_value.py tmp.txt) +echo "elapsed time per iteration: $ETPI" |& tee -a $TRAIN_LOG + +TIME_PER_ITER=$(python3 mean_log_value.py tmp.txt 2>/dev/null | awk '{printf "%.6f", $0}') +TGS=$(awk -v bs="$BS" -v sl="$SEQ_LENGTH" -v tpi="$TIME_PER_ITER" -v ws="$WORLD_SIZE" 'BEGIN {printf "%.6f", bs * sl * 1000/ (tpi * ws)}') +echo "tokens/GPU/s: $TGS" |& tee -a $TRAIN_LOG +rm tmp.txt + + diff --git a/examples/llama2/mean_log_value.py b/examples/llama2/mean_log_value.py deleted file mode 100755 index 46cd6732ec..0000000000 --- a/examples/llama2/mean_log_value.py +++ /dev/null @@ -1,13 +0,0 @@ -import argparse -import numpy as np - -parser = argparse.ArgumentParser(prog="Process Log") -parser.add_argument("filename") -args = parser.parse_args() - -with open(args.filename) as f: - lines = f.readlines() -lines = lines[1:-1] -lines = [float(a) for a in lines] -mean = np.mean(np.array(lines)) -print(mean) \ No newline at end of file diff --git a/examples/llama2/train_llama2.sh b/examples/llama2/train_llama2.sh deleted file mode 100755 index 7ce38702f4..0000000000 --- a/examples/llama2/train_llama2.sh +++ /dev/null @@ -1,263 +0,0 @@ -#!/bin/bash - -# set -x - -export GPU_MAX_HW_QUEUES=2 -export TORCH_NCCL_HIGH_PRIORITY=1 - -# parsing input arguments -for ARGUMENT in "$@" -do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - export "$KEY"="$VALUE" -done - -USE_FLASH_ATTN="${USE_FLASH_ATTN:-1}" -NO_TRAINING="${NO_TRAINING:-0}" # NO_TRAINING=1: for computing metrics only -ENABLE_PROFILING="${ENABLE_PROFILING:-0}" -ENABLE_ROPE="${ENABLE_ROPE:-1}" -DISABLE_ROPE_TE="${DISABLE_ROPE_TE:-0}" -ENABLE_MOCK_DATA="${ENABLE_MOCK_DATA:-1}" -DUMMY_RUN="${DUMMY_RUN:-0}" -ADD_TASK="${ADD_TASK:-0}" -LABEL="${LABEL:-"test"}" -LOG_DIR="profile/${LABEL}" -echo "NO_TRAINING=$NO_TRAINING" - -CWD=`pwd` -GPUS_PER_NODE=`python -c "import torch; print(torch.cuda.device_count())"` - -# Change for multinode config -export CUDA_DEVICE_MAX_CONNECTIONS=1 -MASTER_ADDR=${MASTER_ADDR:-localhost} -MASTER_PORT=${MASTER_PORT:-23731} -NNODES=${NNODES:-1} -NODE_RANK=${NODE_RANK:-0} -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) - -MODEL_SIZE="${MODEL_SIZE:-70}" -TP="${TP:-8}" -PP="${PP:-1}" -MBS="${MBS:-2}" -BS="${BS:-8}" -SEQ_LENGTH="${SEQ_LENGTH:-4096}" -TOTAL_ITERS="${TOTAL_ITERS:-20}" -SEQ_PARALLEL="${SEQ_PARALLEL:-1}" -CONTI_PARAMS="${CONTI_PARAMS:-0}" -OPTIMIZER="${OPTIMIZER:-sgd}" -TE_BF16="${TE_BF16:-1}" - -TMP_DIR="tmp" - -CHECKPOINT_PATH=${CHECKPOINT_PATH:-"$TMP/ckpts"} -mkdir -p ${CHECKPOINT_PATH} - -DATA_PATH=${DATA_PATH:-"$TMP_DIR/data/bookcorpus_text_sentence"} 
- -TOKENIZER_MODEL=$TMP/tokenizer.model - -# Download the tokenizer model -if ! [ -f "$TOKENIZER_MODEL" ]; then -wget -O $TOKENIZER_MODEL https://huggingface.co/NousResearch/Llama-2-7b-chat-hf/resolve/main/tokenizer.model -fi - -MAX_POSITION_EMBEDDINGS=32768 -LOG_NAME="${TMP_DIR}/train_${MODEL_SIZE}B_iter${TOTAL_ITERS}_mbs${MBS}_bs${BS}_tp${TP}_pp${PP}_"\ -"seq${SEQ_LENGTH}_optim_${OPTIMIZER}_fa_${USE_FLASH_ATTN}_seqpara_${SEQ_PARALLEL}_"\ -"contiparam_${CONTI_PARAMS}_te_bg16_${TE_BF16}_${LABEL}" -TRAIN_LOG="${LOG_NAME}.log" -PROFILING_DIR="profile_${LOG_NAME}" - -echo $TRAIN_LOG - -if [[ $MODEL_SIZE -eq 7 ]]; then - HIDDEN_SIZE=4096 # e.g. llama-13b: 5120 - FFN_HIDDEN_SIZE=11008 # e.g. llama-13b: 13824 - NUM_LAYERS=32 # e.g. llama-13b: 40 - NUM_HEADS=32 # e.g. llama-13b: 40 - SEQ_LENGTH=$SEQ_LENGTH - MAX_POSITION_EMBEDDINGS=$MAX_POSITION_EMBEDDINGS - NUM_KV_HEADS=32 # llama2 70B uses GQA -elif [[ $MODEL_SIZE -eq 13 ]]; then - HIDDEN_SIZE=5120 # e.g. llama-13b: 5120 - FFN_HIDDEN_SIZE=13824 # e.g. llama-13b: 13824 - NUM_LAYERS=40 # e.g. llama-13b: 40 - NUM_HEADS=40 # e.g. llama-13b: 40 - SEQ_LENGTH=$SEQ_LENGTH - MAX_POSITION_EMBEDDINGS=$MAX_POSITION_EMBEDDINGS - NUM_KV_HEADS=40 # llama2 70B uses GQA -elif [[ $MODEL_SIZE -eq 20 ]]; then - HIDDEN_SIZE=8192 # e.g. llama-13b: 5120 - FFN_HIDDEN_SIZE=28672 # e.g. llama-13b: 13824 - NUM_LAYERS=20 # e.g. llama-13b: 40 - NUM_HEADS=64 # e.g. llama-13b: 40 - NUM_KV_HEADS=8 # llama2 70B uses GQA - SEQ_LENGTH=$SEQ_LENGTH - MAX_POSITION_EMBEDDINGS=$MAX_POSITION_EMBEDDINGS -elif [[ $MODEL_SIZE -eq 70 ]]; then - HIDDEN_SIZE=8192 # e.g. llama-13b: 5120 - FFN_HIDDEN_SIZE=28672 # e.g. llama-13b: 13824 - NUM_LAYERS=80 # e.g. llama-13b: 40 - NUM_HEADS=64 # e.g. llama-13b: 40 - NUM_KV_HEADS=8 # llama2 70B uses GQA - SEQ_LENGTH=$SEQ_LENGTH - MAX_POSITION_EMBEDDINGS=$MAX_POSITION_EMBEDDINGS -else - echo "Model size not supported." 
- exit 1 -fi - -GROUP_SIZE=$(( ${NUM_HEADS} / ${NUM_KV_HEADS} )) -NUM_GROUPS=$(( ${NUM_HEADS} / ${GROUP_SIZE} )) - -GPT_ARGS=" - --tensor-model-parallel-size ${TP} \ - --pipeline-model-parallel-size ${PP} \ - --num-layers $NUM_LAYERS \ - --hidden-size $HIDDEN_SIZE \ - --ffn-hidden-size $FFN_HIDDEN_SIZE \ - --num-attention-heads $NUM_HEADS \ - --seq-length $SEQ_LENGTH \ - --max-position-embeddings $MAX_POSITION_EMBEDDINGS \ - --untie-embeddings-and-output-weights \ - --disable-bias-linear \ - --swiglu \ - --init-method-std 0.02 \ - --attention-dropout 0.0 \ - --hidden-dropout 0.0 \ - --normalization RMSNorm \ - --micro-batch-size $MBS \ - --global-batch-size $BS \ - --lr 3.0e-4 \ - --train-iters $TOTAL_ITERS \ - --lr-decay-style cosine \ - --min-lr 3.0e-5 \ - --weight-decay 1e-1 \ - --lr-warmup-fraction .01 \ - --optimizer $OPTIMIZER \ - --no-async-tensor-model-parallel-allreduce \ - --clip-grad 1.0 \ - --bf16 \ - --no-masked-softmax-fusion \ - --overlap-grad-reduce \ -" - -DATA_ARGS=" - --tokenizer-type Llama2Tokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --split 949,50,1 \ -" - -OUTPUT_ARGS=" - --log-interval 1 \ - --save-interval 1000 \ - --log-throughput \ - --no-save-optim \ - --eval-iters -1 -" - -DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ - --nnodes $NNODES \ - --node_rank $NODE_RANK \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT -" - -EXTRA_ARGS=" - --group-query-attention \ - --num-query-groups $NUM_GROUPS \ - --no-gradient-accumulation-fusion \ - --distributed-backend nccl \ - --distributed-timeout-minutes 30 -" - -if [ "$ENABLE_PROFILING" -eq 1 ]; then -EXTRA_ARGS="$EXTRA_ARGS --profile --use-pytorch-profiler --tensorboard-dir $LOG_DIR" -fi - -if [ "$ADD_TASK" -eq 1 ]; then -EXTRA_ARGS="$EXTRA_ARGS --task gpt_chat" -fi - - -if [ "$ENABLE_MOCK_DATA" -eq 1 ]; then -EXTRA_ARGS="$EXTRA_ARGS --mock-data" -else -EXTRA_ARGS="$EXTRA_ARGS --data-path $DATA_PATH" -fi - -if [ "$ENABLE_ROPE" -eq 1 ]; then -EXTRA_ARGS="$EXTRA_ARGS --position-embedding-type rope" -fi - -if [ "$DISABLE_ROPE_TE" -eq 1 ]; then -EXTRA_ARGS="$EXTRA_ARGS --disable-te-fused-rope" -fi - -if [ "$USE_FLASH_ATTN" -eq 1 ]; then -EXTRA_ARGS="$EXTRA_ARGS --use-flash-attn" -fi - -if [ "$SEQ_PARALLEL" -eq 1 ]; then -EXTRA_ARGS="$EXTRA_ARGS --sequence-parallel" -fi - -if [ "$CONTI_PARAMS" -eq 1 ]; then -EXTRA_ARGS="$EXTRA_ARGS --use-contiguous-parameters-in-local-ddp" -fi - -if [ "$TE_BF16" -eq 1 ]; then -EXTRA_ARGS="$EXTRA_ARGS --transformer-impl=transformer_engine \ - --fp8-margin=0 \ - --fp8-interval=1 \ - --fp8-amax-history-len=1024 \ - --fp8-amax-compute-algo=max -" -fi - -if [ "$DUMMY_RUN" -eq 0 ]; then -run_cmd=" - torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ - $GPT_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - $EXTRA_ARGS \ - --load $CHECKPOINT_PATH -" -else -run_cmd=" -echo 'torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ - $GPT_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - $EXTRA_ARGS \ - --load $CHECKPOINT_PATH' -" -fi - -run_cmd="$run_cmd |& tee $TRAIN_LOG" - -if [ "$NO_TRAINING" -eq 0 ]; then - eval $run_cmd -fi - -MEAN_LOG_SCRIPT=examples/llama2/mean_log_value.py -TMP_FILE=${TMP_DIR}/tmp.txt - -grep -Eo 'throughput per GPU [^|]*' $TRAIN_LOG | sed -E 's/.*throughput per GPU \(TFLOP\/s\/GPU\): ([0-9\.]+).*/\1/' > $TMP_FILE -THROUGHPUT=$(python ${MEAN_LOG_SCRIPT} ${TMP_FILE}) -echo "throughput per GPU (TFLOPs/GPU): ${THROUGHPUT}" -rm $TMP_FILE - - -grep -Eo 'elapsed time per iteration [^|]*' $TRAIN_LOG | sed -E 's/.*elapsed time per iteration \(ms\): ([0-9\.]+).*/\1/' > $TMP_FILE 
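# NOTE: the tokens/GPU/s figure computed below (the same formula the new train script
# uses for TGS above) is global_batch_size * seq_length / (iteration_time_in_seconds * world_size);
# the factor of 1000 converts the logged per-iteration time from ms to s.
# Worked example with illustrative numbers (BS=8, SEQ_LENGTH=4096, 8 GPUs assumed, and a
# hypothetical 10000 ms iteration): 8 * 4096 * 1000 / (10000 * 8) = 409.6 tokens/GPU/s.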
-TIME_PER_ITER=$(python ${MEAN_LOG_SCRIPT} ${TMP_FILE} 2>/dev/null | awk '{printf "%.6f", $0}') -echo "elapsed time per iteration: ${TIME_PER_ITER}" -rm $TMP_FILE - -PERFORMANCE=$(awk -v bs="$BS" -v sl="$SEQ_LENGTH" -v tpi="$TIME_PER_ITER" -v ws="$WORLD_SIZE" 'BEGIN {printf "%.6f", bs * sl * 1000/ (tpi * ws)}') -echo "tokens/GPU/s: $PERFORMANCE" From 190213a1540992dcaa78df5724b4953f2f444f8d Mon Sep 17 00:00:00 2001 From: Ye Wang Date: Thu, 21 Nov 2024 21:17:25 -0600 Subject: [PATCH 25/30] [ROCm] remove extra_state in state dict for TE DPA (#14) --- megatron/core/extensions/transformer_engine.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index bf5159c759..8957c1fd30 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -706,6 +706,15 @@ def forward( else: return core_attn_out + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + state_dict = self.state_dict(prefix='', keep_vars=True) + # TE with version>=1.9 introduces an extra state in DotProductAttention Module + if is_te_min_version("1.9.0.dev0") and ('_extra_state' in state_dict): + state_dict.pop('_extra_state') + return make_sharded_tensors_for_checkpoint( + state_dict, prefix, {}, sharded_offsets + ) + if is_te_min_version("1.9.0.dev0"): From 3b50a40aeb0531892fdb43fa391db016ac9f64e5 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Thu, 5 Dec 2024 13:37:30 -0600 Subject: [PATCH 26/30] changing repo from rocm/megatron-lm to rocm/megatron-lm-private (#28) --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 27e60f4db1..49c2417d61 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -25,7 +25,7 @@ pipeline { } environment { - REPO_NAME = 'rocm/megatron-lm' + REPO_NAME = 'rocm/megatron-lm-private' CONTAINER_NAME = "megatron-lm-container" DOCKER_RUN_ARGS = "-v \$(pwd):/workspace/Megatron-LM/output --workdir /workspace/Megatron-LM \ --entrypoint /workspace/Megatron-LM/run_unit_tests.sh" From 8b5551ea36df6eb6759015e2b69ae4cb49ca5860 Mon Sep 17 00:00:00 2001 From: lizamd <161388580+lizamd@users.noreply.github.com> Date: Mon, 9 Dec 2024 19:36:44 -0800 Subject: [PATCH 27/30] Update train_llama3.sh (#30) * Update train_llama3.sh * Update train_llama3.sh --- examples/llama/train_llama3.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/llama/train_llama3.sh b/examples/llama/train_llama3.sh index 08d070fe3c..4e0fbf96c5 100644 --- a/examples/llama/train_llama3.sh +++ b/examples/llama/train_llama3.sh @@ -13,8 +13,6 @@ export NCCL_CHECKS_DISABLE=1 export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 export NCCL_IB_GID_INDEX=3 export NCCL_CROSS_NIC=0 -export NCCL_SOCKET_IFNAME=ens50f0np0 # network interface -export GLOO_SOCKET_IFNAME=ens50f0np0 # network interface export CUDA_DEVICE_MAX_CONNECTIONS=1 export NCCL_PROTO=Simple export RCCL_MSCCL_ENABLE=0 @@ -37,6 +35,8 @@ done TIME_STAMP=$(date +"%Y-%m-%d_%H-%M-%S") EXP_NAME="${EXP_NAME:-perf}" +NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-ens50f0np0}" +GLOO_SOCKET_IFNAME="${GLOO_SOCKET_IFNAME:-ens50f0np0}" TEE_OUTPUT="${TEE_OUTPUT:-1}" USE_FLASH_ATTN="${USE_FLASH_ATTN:-1}" NO_TRAINING="${NO_TRAINING:-0}" # NO_TRAINING=1: for computing metrics only @@ -73,9 +73,7 @@ EXPERIMENT_DIR="experiment" mkdir -p $EXPERIMENT_DIR 
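# NOTE: the change below makes DATA_DIR an overridable default ("${DATA_DIR:-/root/.cache/data}"),
# so the dataset location no longer has to be edited in-place. Because the script exports its
# KEY=VALUE arguments and reads environment defaults, an illustrative invocation (the path here
# is an example only) would be:
#   DATA_DIR=/mnt/datasets TEE_OUTPUT=1 MBS=2 BS=64 TP=8 bash examples/llama/train_llama3.sh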
CHECKPOINT_PATH=${CHECKPOINT_PATH:-"$EXPERIMENT_DIR/ckpts"} - -DATA_DIR="/root/.cache/data" # change to where the dataset is stored - +DATA_DIR="${DATA_DIR:-/root/.cache/data}" TOKENIZER_MODEL=meta-llama/Llama-3.1-8B # Download the tokenizer model # if ! [ -f "$TOKENIZER_MODEL" ]; then From 23b9ff1c2dde24a25d903a63b54864ff07b89941 Mon Sep 17 00:00:00 2001 From: lizamd <161388580+lizamd@users.noreply.github.com> Date: Mon, 9 Dec 2024 19:37:25 -0800 Subject: [PATCH 28/30] Update train_llama2.sh (#31) * Update train_llama2.sh * Update train_llama2.sh --- examples/llama/train_llama2.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/llama/train_llama2.sh b/examples/llama/train_llama2.sh index b9ce6ccbb8..b7a7b5c22e 100644 --- a/examples/llama/train_llama2.sh +++ b/examples/llama/train_llama2.sh @@ -13,8 +13,6 @@ export NCCL_CHECKS_DISABLE=1 export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 export NCCL_IB_GID_INDEX=3 export NCCL_CROSS_NIC=0 -export NCCL_SOCKET_IFNAME=ens50f0np0 # network interface -export GLOO_SOCKET_IFNAME=ens50f0np0 # network interface export CUDA_DEVICE_MAX_CONNECTIONS=1 export NCCL_PROTO=Simple export RCCL_MSCCL_ENABLE=0 @@ -37,6 +35,8 @@ done TIME_STAMP=$(date +"%Y-%m-%d_%H-%M-%S") EXP_NAME="${EXP_NAME:-perf}" +NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-ens50f0np0}" +GLOO_SOCKET_IFNAME="${GLOO_SOCKET_IFNAME:-ens50f0np0}" TEE_OUTPUT="${TEE_OUTPUT:-1}" USE_FLASH_ATTN="${USE_FLASH_ATTN:-1}" NO_TRAINING="${NO_TRAINING:-0}" # NO_TRAINING=1: for computing metrics only @@ -76,8 +76,7 @@ mkdir -p $EXPERIMENT_DIR CHECKPOINT_PATH=${CHECKPOINT_PATH:-"$EXPERIMENT_DIR/ckpts"} -DATA_DIR="/root/.cache/data" # change to where the dataset is stored - +DATA_DIR="${DATA_DIR:-/root/.cache/data}" DATA_PATH=${DATA_PATH:-"$DATA_DIR/bookcorpus_text_sentence"} TOKENIZER_MODEL=$EXPERIMENT_DIR/tokenizer.model From 0b9998a8649e2d7f4d54c6195f76caad35983721 Mon Sep 17 00:00:00 2001 From: lizamd <161388580+lizamd@users.noreply.github.com> Date: Mon, 9 Dec 2024 19:37:56 -0800 Subject: [PATCH 29/30] Update readme.md (#29) --- examples/llama/readme.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llama/readme.md b/examples/llama/readme.md index 8d0c8704b7..8ceb357d09 100644 --- a/examples/llama/readme.md +++ b/examples/llama/readme.md @@ -21,7 +21,7 @@ This guide provides the steps for setting up the environment and configuring the ### 2.1 Single Node Training To run the training on a single node, go to Megatron-LM folder, use the following command: ```bash -TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh +TEE_OUTPUT=1 MBS=2 BS=64 TP=8 TE_FP8=0 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh ``` @@ -30,12 +30,12 @@ To run training on multiple nodes, launch the Docker container on each node. Fol - **On the Master Node:** ```bash - TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh + TEE_OUTPUT=1 MBS=2 BS=64 TP=8 TE_FP8=0 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh ``` - **On the Slave Node(s):** ```bash - TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh + TEE_OUTPUT=1 MBS=2 BS=64 TP=8 TE_FP8=0 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh ``` ## 3. Configurations in Script (`Megatron/examples/llama`) @@ -128,4 +128,4 @@ If you're running multi-node training, update the following environment variable --- -That's it! 
You've now set up the environment and configured the necessary settings for training Llama2 or Llama3 models. \ No newline at end of file +That's it! You've now set up the environment and configured the necessary settings for training Llama2 or Llama3 models. From 20292f595b6c62cba74b37142b78599bf013c455 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 12 Dec 2024 00:45:28 +0000 Subject: [PATCH 30/30] update network interface --- examples/llama/train_llama2.sh | 10 ++++++++-- examples/llama/train_llama3.sh | 10 ++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/examples/llama/train_llama2.sh b/examples/llama/train_llama2.sh index b7a7b5c22e..6a8d6b71d5 100644 --- a/examples/llama/train_llama2.sh +++ b/examples/llama/train_llama2.sh @@ -35,8 +35,6 @@ done TIME_STAMP=$(date +"%Y-%m-%d_%H-%M-%S") EXP_NAME="${EXP_NAME:-perf}" -NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-ens50f0np0}" -GLOO_SOCKET_IFNAME="${GLOO_SOCKET_IFNAME:-ens50f0np0}" TEE_OUTPUT="${TEE_OUTPUT:-1}" USE_FLASH_ATTN="${USE_FLASH_ATTN:-1}" NO_TRAINING="${NO_TRAINING:-0}" # NO_TRAINING=1: for computing metrics only @@ -57,6 +55,14 @@ NNODES="${NNODES:-1}" NODE_RANK="${NODE_RANK:-0}" WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +if [ "${NNODES:-1}" -gt 1 ]; then + export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-ens5}" + export GLOO_SOCKET_IFNAME="${GLOO_SOCKET_IFNAME:-ens50f0}" + echo "NCCL and GLOO socket interfaces set." +else + echo "Single node setup, skipping NCCL and GLOO socket interface settings." +fi + MODEL_SIZE="${MODEL_SIZE:-70}" TP="${TP:-8}" PP="${PP:-1}" diff --git a/examples/llama/train_llama3.sh b/examples/llama/train_llama3.sh index 4e0fbf96c5..4392268545 100644 --- a/examples/llama/train_llama3.sh +++ b/examples/llama/train_llama3.sh @@ -35,8 +35,6 @@ done TIME_STAMP=$(date +"%Y-%m-%d_%H-%M-%S") EXP_NAME="${EXP_NAME:-perf}" -NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-ens50f0np0}" -GLOO_SOCKET_IFNAME="${GLOO_SOCKET_IFNAME:-ens50f0np0}" TEE_OUTPUT="${TEE_OUTPUT:-1}" USE_FLASH_ATTN="${USE_FLASH_ATTN:-1}" NO_TRAINING="${NO_TRAINING:-0}" # NO_TRAINING=1: for computing metrics only @@ -55,6 +53,14 @@ NNODES="${NNODES:-1}" NODE_RANK="${NODE_RANK:-0}" WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +if [ "${NNODES:-1}" -gt 1 ]; then + export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-ens5}" + export GLOO_SOCKET_IFNAME="${GLOO_SOCKET_IFNAME:-ens50f0}" + echo "NCCL and GLOO socket interfaces set." +else + echo "Single node setup, skipping NCCL and GLOO socket interface settings." +fi + MODEL_SIZE="${MODEL_SIZE:-70}" TP="${TP:-8}" PP="${PP:-1}"