update network interface env #1319

Open: wants to merge 36 commits into base: main

Commits (36):
aeed2cd  rocm docker and scripts (gurpreet-dhami, Oct 8, 2024)
12952bf  update mock data (gurpreet-dhami, Oct 9, 2024)
9713598  update (gurpreet-dhami, Oct 9, 2024)
717e7cd  add jenkins pipeline (gurpreet-dhami, Oct 10, 2024)
0c4709e  fix runtime error for rocm (#3) (gurpreet-dhami, Oct 11, 2024)
6dbde6a  update (gurpreet-dhami, Oct 10, 2024)
e8c077c  Merge remote-tracking branch 'upstream/main' into upstream_sync (gurpreet-dhami, Oct 11, 2024)
42b34ba  Merge pull request #6 from ROCm/upstream_sync (gurpreet-dhami, Oct 18, 2024)
2b855ba  rope enable through TE (#4) (gurpreet-dhami, Oct 22, 2024)
2a3af13  address review comments (gurpreet-dhami, Oct 22, 2024)
967673b  update (gurpreet-dhami, Oct 25, 2024)
a2d0bdf  Enable HuggingFaceTokenizer in preprocessing (#10) (msiddaiah, Oct 30, 2024)
0d5c01e  markers for failing tests on rocm (gurpreet-dhami, Nov 4, 2024)
1efaebf  mark failing tests on mi250 (gurpreet-dhami, Nov 5, 2024)
8be4fae  enable ci pipeline (gurpreet-dhami, Nov 5, 2024)
955bdfa  address review comments (gurpreet-dhami, Nov 7, 2024)
328fe81  address review comments (gurpreet-dhami, Nov 7, 2024)
b142a98  update dockerfile (gurpreet-dhami, Nov 7, 2024)
a533cce  update script (gurpreet-dhami, Nov 7, 2024)
399c8d9  Merge pull request #12 from ROCm/unit_tests_rocm (gurpreet-dhami, Nov 7, 2024)
5ce68ce  fix docker filename (gurpreet-dhami, Nov 7, 2024)
3d9f229  Merge pull request #16 from ROCm/ci_pipeline_fix (gurpreet-dhami, Nov 7, 2024)
90dbbfd  Merge branch 'rocm_dev' into rocm_megatron_lm_upstream_rocm_docker (gurpreet-dhami, Nov 8, 2024)
77113cc  update (gurpreet-dhami, Nov 7, 2024)
1e64046  remove commented lines (gurpreet-dhami, Nov 8, 2024)
d9a6c85  Merge pull request #5 from ROCm/rocm_megatron_lm_upstream_rocm_docker (gurpreet-dhami, Nov 8, 2024)
106519d  Use mi300 for ci pipeline (#21) (gurpreet-dhami, Nov 18, 2024)
d24a364  fix string error (#25) (gurpreet-dhami, Nov 19, 2024)
b0d08df  Enabled TEGroupedMLP test. (#22) (wenchenvincent, Nov 21, 2024)
e8c9080  Submit example/llama with llama2/3 scripts and readme (#19) (lizamd, Nov 22, 2024)
190213a  [ROCm] remove extra_state in state dict for TE DPA (#14) (wangye805, Nov 22, 2024)
3b50a40  changing repo from rocm/megatron-lm to rocm/megatron-lm-private (#28) (okakarpa, Dec 5, 2024)
8b5551e  Update train_llama3.sh (#30) (lizamd, Dec 10, 2024)
23b9ff1  Update train_llama2.sh (#31) (lizamd, Dec 10, 2024)
0b9998a  Update readme.md (#29) (lizamd, Dec 10, 2024)
20292f5  update network interface (Dec 12, 2024)
Dockerfile_rocm.ci (new file, 70 additions)
ARG BASE_DOCKER=rocm/pytorch:latest
FROM $BASE_DOCKER

ARG PYTORCH_ROCM_ARCH_OVERRIDE="gfx942"
ENV WORKSPACE_DIR=/workspace
ENV STAGE_DIR=/workspace/installs
RUN mkdir -p $WORKSPACE_DIR
RUN mkdir -p ${STAGE_DIR}
WORKDIR $WORKSPACE_DIR

RUN pip3 install \
    scipy \
    einops \
    flask-restful \
    nltk \
    pytest \
    pytest-cov \
    pytest_mock \
    pytest-csv \
    pytest-random-order \
    sentencepiece \
    wrapt \
    zarr \
    wandb \
    tensorstore==0.1.45 \
    pybind11 \
    setuptools==69.5.1 \
    datasets \
    tiktoken \
    pynvml

RUN pip3 install "huggingface_hub[cli]"
RUN python3 -m nltk.downloader punkt_tab


# Install Causal-Conv1d and its dependencies
WORKDIR ${STAGE_DIR}
ENV CAUSAL_CONV1D_FORCE_BUILD=TRUE
ENV MAMBA_FORCE_BUILD=TRUE
ENV HIP_ARCHITECTURES=${PYTORCH_ROCM_ARCH_OVERRIDE}
RUN git clone https://github.com/Dao-AILab/causal-conv1d causal-conv1d &&\
cd causal-conv1d &&\
git show --oneline -s &&\
pip install .

# Install mamba
WORKDIR ${STAGE_DIR}
RUN git clone https://github.com/state-spaces/mamba mamba &&\
cd mamba &&\
git show --oneline -s &&\
pip install --no-build-isolation .

# Clone TE repo and submodules
WORKDIR ${STAGE_DIR}
ENV NVTE_FRAMEWORK=pytorch
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH_OVERRIDE}
ENV NVTE_USE_HIPBLASLT=1
RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git &&\
cd TransformerEngine &&\
pip install .

WORKDIR $WORKSPACE_DIR
COPY . Megatron-LM
WORKDIR $WORKSPACE_DIR/Megatron-LM
RUN pip install -e .

# record configuration for posterity
RUN pip list
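For local experiments the same image can be built outside Jenkins. A minimal sketch, assuming `IMAGE_TAG` is a hypothetical local tag of your choosing; the build is only attempted when Docker and the Dockerfile are actually present:

```shell
# Hedged sketch: build the CI image by hand, mirroring the Jenkinsfile's
# docker build step. IMAGE_TAG is a hypothetical local tag; GPU_ARCH
# defaults to the gfx942 used in the Dockerfile's ARG.
GPU_ARCH="${GPU_ARCH:-gfx942}"
IMAGE_TAG="megatron-lm-rocm:ci-local"
BUILD_CMD="docker build -f Dockerfile_rocm.ci --build-arg PYTORCH_ROCM_ARCH_OVERRIDE=${GPU_ARCH} -t ${IMAGE_TAG} ."

if command -v docker >/dev/null 2>&1 && [ -f Dockerfile_rocm.ci ]; then
    # Docker and the Dockerfile are available: actually run the build.
    eval "$BUILD_CMD"
else
    # Otherwise just show what would run.
    echo "would run: $BUILD_CMD"
fi
```

Overriding `GPU_ARCH` (e.g. `GPU_ARCH=gfx90a`) is how the same file targets MI250-class hardware, since the Dockerfile forwards the build arg into `PYTORCH_ROCM_ARCH` and `HIP_ARCHITECTURES`.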

Dockerfile_rocm.dev (new file, 72 additions)
ARG BASE_DOCKER=rocm/pytorch:latest
FROM $BASE_DOCKER
ARG PYTORCH_ROCM_ARCH_OVERRIDE="gfx942"
ENV WORKSPACE_DIR=/workspace
ENV STAGE_DIR=/workspace/installs
RUN mkdir -p $WORKSPACE_DIR
RUN mkdir -p ${STAGE_DIR}
WORKDIR $WORKSPACE_DIR

RUN pip3 install \
    scipy \
    einops \
    flask-restful \
    nltk \
    pytest \
    pytest-cov \
    pytest_mock \
    pytest-csv \
    pytest-random-order \
    sentencepiece \
    wrapt \
    zarr \
    wandb \
    tensorstore==0.1.45 \
    pybind11 \
    setuptools==69.5.1 \
    datasets \
    tiktoken \
    pynvml

RUN pip3 install "huggingface_hub[cli]"
RUN python3 -m nltk.downloader punkt_tab


# Install Causal-Conv1d and its dependencies
WORKDIR ${STAGE_DIR}
ENV CAUSAL_CONV1D_FORCE_BUILD=TRUE
ENV MAMBA_FORCE_BUILD=TRUE
ENV HIP_ARCHITECTURES=${PYTORCH_ROCM_ARCH_OVERRIDE}
RUN git clone https://github.com/Dao-AILab/causal-conv1d causal-conv1d &&\
cd causal-conv1d &&\
git show --oneline -s &&\
pip install .

# Install mamba
WORKDIR ${STAGE_DIR}
RUN git clone https://github.com/state-spaces/mamba mamba &&\
cd mamba &&\
git show --oneline -s &&\
pip install --no-build-isolation .

# Clone TE repo and submodules
WORKDIR ${STAGE_DIR}
ENV NVTE_FRAMEWORK=pytorch
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH_OVERRIDE}
ENV NVTE_USE_HIPBLASLT=1
RUN git clone --recursive https://github.com/ROCm/TransformerEngine &&\
cd TransformerEngine &&\
pip install .

WORKDIR $WORKSPACE_DIR
RUN git clone https://github.com/ROCm/Megatron-LM.git Megatron-LM &&\
cd Megatron-LM &&\
git checkout rocm_dev &&\
pip install -e .

WORKDIR $WORKSPACE_DIR/Megatron-LM

# record configuration for posterity
RUN pip list

Jenkinsfile (new file, 92 additions)
import org.apache.commons.io.FilenameUtils
import groovy.json.JsonOutput


def clean_up_docker_images() {
    // Check if the images exist before attempting to remove them
    def imageExists = sh(script: "docker images -q ${env.DOCKER_IMAGE}", returnStdout: true).trim()
    if (imageExists) {
        sh "docker rmi ${env.DOCKER_IMAGE}"
    }
}

def clean_docker_build_cache() {
    sh 'docker system prune -f --volumes || true'
}

pipeline {
    agent {
        label 'build-only'
    }

    parameters {
        string(name: 'TEST_NODE_LABEL', defaultValue: 'MI300X_BANFF', description: 'Node or Label to launch Jenkins Job')
        string(name: 'GPU_ARCH', defaultValue: 'gfx942', description: 'GPU Architecture')
    }

    environment {
        REPO_NAME = 'rocm/megatron-lm-private'
        CONTAINER_NAME = "megatron-lm-container"
        DOCKER_RUN_ARGS = "-v \$(pwd):/workspace/Megatron-LM/output --workdir /workspace/Megatron-LM \
            --entrypoint /workspace/Megatron-LM/run_unit_tests.sh"
        DOCKER_RUN_CMD = "docker run --rm -t --network host -u root --group-add video --cap-add=SYS_PTRACE \
            --cap-add SYS_ADMIN --device /dev/fuse --security-opt seccomp=unconfined --security-opt apparmor=unconfined \
            --ipc=host --device=/dev/kfd --device=/dev/dri"
    }

    stages {
        stage('Build Docker Image') {
            steps {
                clean_docker_build_cache()
                script {
                    // Generate a unique UUID for the Docker image name
                    def uuid = sh(script: 'uuidgen', returnStdout: true).trim()
                    env.DOCKER_IMAGE = "${REPO_NAME}:${uuid}"

                    // Build Docker image
                    sh "docker build --no-cache -f Dockerfile_rocm.ci --build-arg PYTORCH_ROCM_ARCH_OVERRIDE=${params.GPU_ARCH} -t ${env.DOCKER_IMAGE} ."

                    withCredentials([usernamePassword(credentialsId: 'docker-hub-credentials', usernameVariable: 'DOCKER_USERNAME', passwordVariable: 'DOCKER_PASSWORD')]) {
                        sh "docker push ${env.DOCKER_IMAGE}"
                    }
                }
            }
            post {
                always {
                    clean_up_docker_images()
                }
            }
        }

        stage('Run Unit Tests') {
            agent {
                node {
                    label "${params.TEST_NODE_LABEL}"
                }
            }

            steps {
                script {
                    // Pull the Docker image from the repository on the test node
                    withCredentials([usernamePassword(credentialsId: 'docker-hub-credentials', usernameVariable: 'DOCKER_USERNAME', passwordVariable: 'DOCKER_PASSWORD')]) {
                        sh "docker pull ${env.DOCKER_IMAGE}"
                    }

                    wrap([$class: 'AnsiColorBuildWrapper', 'colorMapName': 'xterm']) {
                        sh "${DOCKER_RUN_CMD} ${DOCKER_RUN_ARGS} --name ${env.CONTAINER_NAME} ${env.DOCKER_IMAGE}"
                    }
                }
            }
            post {
                always {
                    // Archive test results
                    script {
                        archiveArtifacts artifacts: 'test_report.csv', allowEmptyArchive: true
                        clean_up_docker_images()
                    }
                }
            }
        }
    }
}
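The pipeline tags every build with a fresh UUID so concurrent runs never collide on an image name. The same scheme can be sketched outside Groovy (repo name copied from the `environment` block; the `/proc` fallback is our addition for machines without `uuidgen`):

```shell
# Hedged sketch of the Jenkinsfile's unique image-tag scheme.
REPO_NAME="rocm/megatron-lm-private"
# Use uuidgen as the pipeline does; fall back to the kernel's UUID source.
UUID="$(uuidgen 2>/dev/null || cat /proc/sys/kernel/random/uuid)"
DOCKER_IMAGE="${REPO_NAME}:${UUID}"
echo "$DOCKER_IMAGE"
```

Because the tag is unique per build, `clean_up_docker_images` can safely `docker rmi` it in the `post { always { ... } }` blocks without touching images belonging to other runs.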
examples/llama/prepare_bookcorpus_megatron_dataset.py (new file, 14 additions)
import argparse
from pathlib import Path

from datasets import load_dataset

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--out-dir", type=str, required=False, default="tmp/data",
                        help="Output directory for the JSON dump")
    args = parser.parse_args()
    out_dir = Path(args.out_dir)
    out_dir.mkdir(exist_ok=True, parents=True)

    dataset = load_dataset("bookcorpus", split="train", trust_remote_code=True)
    dataset.to_json(out_dir / "bookcorpus_megatron.json")
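`Dataset.to_json` writes JSON Lines: one object per line, and `tools/preprocess_data.py` reads the `text` field of each record by default (the key name is our assumption from Megatron's `--json-keys` default). A tiny fabricated sample of that shape:

```shell
# Hedged sketch: fabricate a single JSONL record of the shape the
# preprocessing step consumes. The sentence is made up; a real dump
# of BookCorpus has millions of such lines.
SAMPLE='{"text": "he would be tearing around the living room , playing with his toys ."}'
OUT="$(mktemp)"
printf '%s\n' "$SAMPLE" > "$OUT"
wc -l < "$OUT"      # one record per line
head -n 1 "$OUT"
rm -f "$OUT"
```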
examples/llama/prepare_dataset.sh (new file, 18 additions)
#!/bin/bash

TMP_DIR="tmp"
DATA_PATH="${TMP_DIR}/data"
TOKENIZER_MODEL="${TMP_DIR}/tokenizer.model"

mkdir -p "${DATA_PATH}"

# Download the tokenizer model
if ! [ -f "$TOKENIZER_MODEL" ]; then
    wget -O "$TOKENIZER_MODEL" https://huggingface.co/NousResearch/Llama-2-7b-chat-hf/resolve/main/tokenizer.model
fi

python3 prepare_bookcorpus_megatron_dataset.py --out-dir "${DATA_PATH}"
python3 tools/preprocess_data.py --input "${DATA_PATH}/bookcorpus_megatron.json" --tokenizer-type GPTSentencePieceTokenizer \
    --tokenizer-model "${TOKENIZER_MODEL}" --output-prefix "${DATA_PATH}/bookcorpus" --workers "$(nproc)" --split-sentences
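After preprocessing, the indexed dataset should appear under the output prefix. A hedged check, assuming Megatron's usual `{prefix}_{json_key}_{level}` naming (so `bookcorpus_text_sentence.*` here, given the default `text` key and `--split-sentences`):

```shell
# Hedged sketch: verify the expected preprocessing outputs exist. The
# exact filenames are our assumption about preprocess_data.py's naming.
for f in tmp/data/bookcorpus_text_sentence.bin tmp/data/bookcorpus_text_sentence.idx; do
    if [ -f "$f" ]; then
        echo "found: $f"
    else
        echo "missing: $f (run prepare_dataset.sh first)"
    fi
done
```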