From 7517eba5f349a7a214d492996cf2230f7fd0f630 Mon Sep 17 00:00:00 2001
From: Vladislav
Date: Mon, 23 Sep 2024 16:31:24 -0700
Subject: [PATCH 1/4] Add JetStream to MaxText container

---
Review notes (commentary between the three-dash line and the diffstat is
ignored when the patch is applied):

* Both MaxText Dockerfiles clone JetStream into SRC_PATH_JETSTREAM and append
  its requirements.txt to /opt/pip-tools.d/requirements-jetstream.in.
* _ci.yaml gains a test-jetstream job (amd64 only) that runs inside the image
  produced by build-maxtext.
* Known issue, addressed in 2/4: `bash -ec` is handed the script as several
  separate quoted arguments, so only the first one ("cd /opt/jetstream && ")
  is used as the command string.
* The first Dockerfile.maxtext.amd64 hunk header is sized to the five context
  lines present in this copy of the patch.

 .github/container/Dockerfile.maxtext.amd64 | 14 ++++++++++
 .github/container/Dockerfile.maxtext.arm64 | 11 ++++++++
 .github/workflows/_ci.yaml                 | 32 ++++++++++++++++++++++
 3 files changed, 57 insertions(+)

diff --git a/.github/container/Dockerfile.maxtext.amd64 b/.github/container/Dockerfile.maxtext.amd64
index 63c6767c0..c3337b9e6 100644
--- a/.github/container/Dockerfile.maxtext.amd64
+++ b/.github/container/Dockerfile.maxtext.amd64
@@ -3,5 +3,8 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax
 ARG URLREF_MAXTEXT=https://github.com/google/maxtext.git#main
 ARG SRC_PATH_MAXTEXT=/opt/maxtext
 
+ARG URLREF_JETSTREAM=https://github.com/google/jetstream.git#main
+ARG SRC_PATH_JETSTREAM=/opt/jetstream
+
 ###############################################################################
 ## Download source and add auxiliary scripts
@@ -30,6 +33,17 @@ RUN cd "${SRC_PATH_MAXTEXT}" && patch -p1 < /opt/maxtext-mha.patch && git diff
 
 ADD test-maxtext.sh /usr/local/bin
 
+###############################################################################
+## Add JetStream
+###############################################################################
+ARG URLREF_JETSTREAM
+ARG SRC_PATH_JETSTREAM
+
+RUN <<"EOF" bash -ex
+git-clone.sh ${URLREF_JETSTREAM} ${SRC_PATH_JETSTREAM}
+echo "-r ${SRC_PATH_JETSTREAM}/requirements.txt" >> /opt/pip-tools.d/requirements-jetstream.in
+EOF
+
 ###############################################################################
 ## Install accumulated packages from the base image and the previous stage
 ###############################################################################
diff --git a/.github/container/Dockerfile.maxtext.arm64 b/.github/container/Dockerfile.maxtext.arm64
index a971d2405..ad0f43b69 100644
--- a/.github/container/Dockerfile.maxtext.arm64
+++ b/.github/container/Dockerfile.maxtext.arm64
@@ -71,6 +71,17 @@ RUN cd "${SRC_PATH_MAXTEXT}" && patch -p1 < /opt/maxtext-mha.patch && git diff
 
 ADD test-maxtext.sh /usr/local/bin
 
+###############################################################################
+## Add JetStream
+###############################################################################
+ARG URLREF_JETSTREAM
+ARG SRC_PATH_JETSTREAM
+
+RUN <<"EOF" bash -ex
+git-clone.sh ${URLREF_JETSTREAM} ${SRC_PATH_JETSTREAM}
+echo "-r ${SRC_PATH_JETSTREAM}/requirements.txt" >> /opt/pip-tools.d/requirements-jetstream.in
+EOF
+
 ###############################################################################
 ## Install accumulated packages from the base image and the previous stage
 ###############################################################################
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index fc04b83ab..6974b8c4c 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -581,3 +581,35 @@ jobs:
     with:
       MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
     secrets: inherit
+
+  test-jetstream:
+    needs: build-maxtext
+    if: inputs.ARCHITECTURE == 'amd64'  # no arm64 gpu runners
+    uses: ./.github/workflows/_test_unit.yaml
+    with:
+      TEST_NAME: jetstream
+      EXECUTE: |
+        docker run --shm-size=1g --gpus all ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} \
+          bash -ec \
+          "cd /opt/jetstream && " \
+          "pip install -r requirements.txt && " \
+          "export CUDA_VISIBLE_DEVICES=0 && " \
+          "python -m unittest -v jetstream.tests.core.test_orchestrator && " \
+          "python -m jetstream.engine.mock_engine_test && " \
+          "python -m jetstream.core.orchestrator_test && " \
+          "python -m jetstream.core.server_test" \
+          | tee test-jetstream.log
+      STATISTICS_SCRIPT: |
+        summary_line=$(tail -n1 test-jetstream.log)
+        errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
+        failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
+        passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
+        total_tests=$((failed_tests + passed_tests))
+        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+      ARTIFACTS: |
+        test-jetstream.log
+    secrets: inherit
+

From c952cb320c6e8b319245512fd1f4a6702199ab00 Mon Sep 17 00:00:00 2001
From: Vladislav
Date: Wed, 25 Sep 2024 14:51:38 -0700
Subject: [PATCH 2/4] Sandbox test

---
Review notes (ignored when the patch is applied):

* _ci.yaml: the test-jetstream script is now passed to the container's bash
  through a heredoc instead of a chain of quoted `bash -ec` arguments.
* Fix folded into this patch: `docker run` gets `-i`. Without it the
  container's stdin is not attached, so bash would never receive the heredoc
  and none of the test commands would run.
* Fix folded into this patch: bash runs with `-e`, restoring the
  stop-at-first-failure behaviour that the `bash -ec ... && ...` form in 1/4
  asked for.
* The _ci.yaml hunk header and the diffstat reflect those edits; the `index`
  hashes were not regenerated.
* _sandbox.yaml temporarily carries a copy of the job pinned to a prebuilt
  image tag. That copy is left exactly as submitted because 4/4 removes it
  verbatim.
* NOTE(review): STATISTICS_SCRIPT greps the last log line for `N passed`,
  `N failed` and `N error`, while the tests are started through
  `python -m unittest` and module entry points, whose summary normally reads
  `Ran N tests` / `OK` / `FAILED (failures=N)`. Please confirm the reported
  counts are not always 0.

 .github/workflows/_ci.yaml      | 22 ++++++-----
 .github/workflows/_sandbox.yaml | 65 +++++++++++++++------------------
 2 files changed, 42 insertions(+), 45 deletions(-)

diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index fd2525649..0492aa388 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -596,15 +596,17 @@ jobs:
       TEST_NAME: jetstream
+      # `docker run -i` keeps the container's stdin open so the heredoc script
+      # reaches bash; `bash -e` stops the script at the first failing command.
       EXECUTE: |
-        docker run --shm-size=1g --gpus all ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} \
-          bash -ec \
-          "cd /opt/jetstream && " \
-          "pip install -r requirements.txt && " \
-          "export CUDA_VISIBLE_DEVICES=0 && " \
-          "python -m unittest -v jetstream.tests.core.test_orchestrator && " \
-          "python -m jetstream.engine.mock_engine_test && " \
-          "python -m jetstream.core.orchestrator_test && " \
-          "python -m jetstream.core.server_test" \
-          | tee test-jetstream.log
+        docker run -i --shm-size=1g --gpus all ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} \
+          bash -e <<"EOF" |& tee test-jetstream.log
+          cd /opt/jetstream
+          pip install -r requirements.txt
+          export CUDA_VISIBLE_DEVICES=0
+          python -m unittest -v jetstream.tests.core.test_orchestrator
+          python -m jetstream.engine.mock_engine_test
+          python -m jetstream.core.orchestrator_test
+          python -m jetstream.core.server_test
+        EOF
       STATISTICS_SCRIPT: |
         summary_line=$(tail -n1 test-jetstream.log)
         errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml
index 7b90b72ca..e120a39de 100644
--- a/.github/workflows/_sandbox.yaml
+++ b/.github/workflows/_sandbox.yaml
@@ -4,38 +4,33 @@ on:
   workflow_dispatch:
 
 jobs:
-  sandbox:
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Print usage
-        run: |
-          cat << EOF
-          This is an empty workflow file located in the main branch of your
-          repository. It serves as a testing ground for new GitHub Actions on
-          development branches before merging them to the main branch. By
-          defining and overloading this workflow on your development branch,
-          you can test new actions without affecting your main branch, ensuring
-          a smooth integration process once the changes are ready to be merged.
-
-          Usage:
-
-          1. In your development branch, modify the sandbox.yml workflow file
-             to include the new actions you want to test. Make sure to commit
-             the changes to the development branch.
-          2. Navigate to the 'Actions' tab in your repository, select the
-             '~Sandbox' workflow, and choose your development branch from the
-             branch dropdown menu. Click on 'Run workflow' to trigger the
-             workflow on your development branch.
-          3. Once you have tested and verified the new actions in the Sandbox
-             workflow, you can incorporate them into your main workflow(s) and
-             merge the development branch into the main branch. Remember to
-             revert the changes to the sandbox.yml file in the main branch to
-             keep it empty for future testing.
-          EOF
+  test-jetstream:
+    needs: build-maxtext
+    if: inputs.ARCHITECTURE == 'amd64'  # no arm64 gpu runners
+    uses: ./.github/workflows/_test_unit.yaml
+    with:
+      TEST_NAME: jetstream
+      EXECUTE: |
+        docker run --shm-size=1g --gpus all ghcr.io/nvidia/jax-toolbox-internal:11038994821-maxtext-amd64 \
+          bash <<"EOF" |& tee test-jetstream.log
+          cd /opt/jetstream
+          pip install -r requirements.txt
+          export CUDA_VISIBLE_DEVICES=0
+          python -m unittest -v jetstream.tests.core.test_orchestrator
+          python -m jetstream.engine.mock_engine_test
+          python -m jetstream.core.orchestrator_test
+          python -m jetstream.core.server_test
+        EOF
+      STATISTICS_SCRIPT: |
+        summary_line=$(tail -n1 test-jetstream.log)
+        errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
+        failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
+        passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
+        total_tests=$((failed_tests + passed_tests))
+        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+      ARTIFACTS: |
+        test-jetstream.log
+    secrets: inherit
\ No newline at end of file

From ddc412475180e4eb43a37782cafffed0ae245960 Mon Sep 17 00:00:00 2001
From: Vladislav
Date: Wed, 25 Sep 2024 16:31:05 -0700
Subject: [PATCH 3/4] Sandbox fix

---
Review notes (ignored when the patch is applied):

* Drops `needs:` and `if:` from the temporary sandbox job; the sandbox
  workflow defines no build-maxtext job for `needs:` to point at.

 .github/workflows/_sandbox.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml
index e120a39de..6d86a44e4 100644
--- a/.github/workflows/_sandbox.yaml
+++ b/.github/workflows/_sandbox.yaml
@@ -5,8 +5,6 @@ on:
 
 jobs:
   test-jetstream:
-    needs: build-maxtext
-    if: inputs.ARCHITECTURE == 'amd64'  # no arm64 gpu runners
     uses: ./.github/workflows/_test_unit.yaml
     with:
       TEST_NAME: jetstream

From ab21dae070f41c4976691d9960b53c77d0a65701 Mon Sep 17 00:00:00 2001
From: Vladislav
Date: Wed, 25 Sep 2024 16:47:18 -0700
Subject: [PATCH 4/4] Revert sandbox

---
Review notes (ignored when the patch is applied):

* Restores the placeholder sandbox job that 2/4 replaced, so over the whole
  series _sandbox.yaml ends up unchanged (index 7b90b72ca before 2/4 and
  after this patch).

 .github/workflows/_sandbox.yaml | 63 ++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml
index 6d86a44e4..7b90b72ca 100644
--- a/.github/workflows/_sandbox.yaml
+++ b/.github/workflows/_sandbox.yaml
@@ -4,31 +4,38 @@ on:
   workflow_dispatch:
 
 jobs:
-  test-jetstream:
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: jetstream
-      EXECUTE: |
-        docker run --shm-size=1g --gpus all ghcr.io/nvidia/jax-toolbox-internal:11038994821-maxtext-amd64 \
-          bash <<"EOF" |& tee test-jetstream.log
-          cd /opt/jetstream
-          pip install -r requirements.txt
-          export CUDA_VISIBLE_DEVICES=0
-          python -m unittest -v jetstream.tests.core.test_orchestrator
-          python -m jetstream.engine.mock_engine_test
-          python -m jetstream.core.orchestrator_test
-          python -m jetstream.core.server_test
-        EOF
-      STATISTICS_SCRIPT: |
-        summary_line=$(tail -n1 test-jetstream.log)
-        errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-        failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
-        passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
-        total_tests=$((failed_tests + passed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        test-jetstream.log
-    secrets: inherit
\ No newline at end of file
+  sandbox:
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Print usage
+        run: |
+          cat << EOF
+          This is an empty workflow file located in the main branch of your
+          repository. It serves as a testing ground for new GitHub Actions on
+          development branches before merging them to the main branch. By
+          defining and overloading this workflow on your development branch,
+          you can test new actions without affecting your main branch, ensuring
+          a smooth integration process once the changes are ready to be merged.
+
+          Usage:
+
+          1. In your development branch, modify the sandbox.yml workflow file
+             to include the new actions you want to test. Make sure to commit
+             the changes to the development branch.
+          2. Navigate to the 'Actions' tab in your repository, select the
+             '~Sandbox' workflow, and choose your development branch from the
+             branch dropdown menu. Click on 'Run workflow' to trigger the
+             workflow on your development branch.
+          3. Once you have tested and verified the new actions in the Sandbox
+             workflow, you can incorporate them into your main workflow(s) and
+             merge the development branch into the main branch. Remember to
+             revert the changes to the sandbox.yml file in the main branch to
+             keep it empty for future testing.
+          EOF