From 5e2f1c690864567c26e0d4a8e33fff7ad81b79e1 Mon Sep 17 00:00:00 2001 From: Courtney Pacheco <6019922+courtneypacheco@users.noreply.github.com> Date: Thu, 16 Jan 2025 09:15:51 -0500 Subject: [PATCH] Add XL e2e nightly CI job Add a new XL e2e nightly CI job that triggers every day at 6am UTC. Also update the existing large CI job so that uploaded files are not overwritten. Signed-off-by: Courtney Pacheco <6019922+courtneypacheco@users.noreply.github.com> --- .github/actions/free-disk-space/action.yml | 70 ++++ .github/workflows/e2e-nvidia-l40s-x4.yml | 36 +- .github/workflows/e2e-nvidia-l40s-x8.yml | 446 +++++++++++++++++++++ 3 files changed, 538 insertions(+), 14 deletions(-) create mode 100644 .github/actions/free-disk-space/action.yml create mode 100644 .github/workflows/e2e-nvidia-l40s-x8.yml diff --git a/.github/actions/free-disk-space/action.yml b/.github/actions/free-disk-space/action.yml new file mode 100644 index 00000000..43f8bccc --- /dev/null +++ b/.github/actions/free-disk-space/action.yml @@ -0,0 +1,70 @@ +name: 'Free Disk Space' +description: 'Frees disk space on the runner' +runs: + using: "composite" + steps: + - name: Print disk space before cleanup + run: | + df -h + shell: bash + - name: Free Disk Space Linux + if: runner.os == 'Linux' + run: | + # Determine if we have Ubuntu, CentOS, or other distro as our runner OS + os_id=$(grep '^ID=' /etc/os-release | cut -d "=" -f2) + echo "Detected OS distro as: ${os_id}" + + # Sometimes `docker` is not installed, so only remove images if we need to. + if command -v docker 2>&1 >/dev/null ; then + sudo docker rmi "$(docker image ls -aq) -f" >/dev/null 2>&1 || true + fi + + # Remove Android, .NET, and Haskell runtimes + sudo rm -rf \ + /usr/local/lib/android \ + /usr/share/dotnet \ + /opt/ghc \ + /usr/local/.ghcup \ + /usr/local/share/powershell \ + /usr/share/swift \ + /usr/lib/jvm || true + + printWarningMessage () { + echo "[warning] Failed to remove '$1', perhaps because it doesn't exist. Ignoring..." 
+ }
+
+ # Remove large packages we don't use.
+ echo "Attempting to remove unused ${os_id} packages..."
+ if [[ "${os_id}" == "ubuntu" ]]; then
+ sudo apt-get remove -y '^mysql-.*' || printWarningMessage '^mysql-.*'
+ sudo apt-get remove -y '^dotnet-.*' --fix-missing || printWarningMessage '^dotnet-.*'
+ sudo apt-get remove -y 'php.*' --fix-missing || printWarningMessage 'php.*'
+ sudo apt-get remove -y '^mongodb-.*' --fix-missing || printWarningMessage '^mongodb-.*'
+ sudo apt-get remove -y '^llvm-.*' --fix-missing || printWarningMessage '^llvm-.*'
+ sudo apt-get remove -y google-cloud-sdk --fix-missing || printWarningMessage 'google-cloud-sdk'
+ sudo apt-get remove -y google-cloud-cli --fix-missing || printWarningMessage 'google-cloud-cli'
+ sudo apt-get autoremove -y >/dev/null 2>&1
+ sudo apt-get autoclean -y >/dev/null 2>&1
+ elif [[ "${os_id}" == "centos" ]]; then
+ sudo dnf -y remove 'mysql-*' || printWarningMessage 'mysql-*'
+ sudo dnf -y remove 'dotnet-*' || printWarningMessage 'dotnet-*'
+ sudo dnf -y remove 'php-*' || printWarningMessage 'php-*'
+ sudo dnf -y remove 'mongodb-*' || printWarningMessage 'mongodb-*'
+ sudo dnf -y remove 'llvm-*' || printWarningMessage 'llvm-*'
+ sudo dnf -y remove google-cloud-sdk || printWarningMessage 'google-cloud-sdk'
+ sudo dnf -y remove google-cloud-cli || printWarningMessage 'google-cloud-cli'
+ sudo dnf clean all
+ sudo rm -rf /var/cache/dnf*
+ else
+ echo "Unrecognized OS '${os_id}'. Skipping large package cleanup, as this logic has not been implemented for ${os_id}." 
+ fi + shell: bash + - name: Free Disk Space MacOS + if: runner.os == 'macOS' + run: | + sudo rm -rf /System/Volumes/Data/Applications/Xcode_15* + shell: bash + - name: Print disk space after cleanup + run: | + df -h + shell: bash diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml index 0e3ec259..3f83c53f 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -3,17 +3,25 @@ name: E2E (NVIDIA L40S x4) on: - schedule: - - cron: '0 16 * * *' # Runs at 4PM UTC every day - workflow_dispatch: - inputs: - pr_or_branch: - description: 'pull request number or branch name' - required: true - default: 'main' +### WILL BE UNCOMMENTED BEFORE MERGE +# schedule: +# - cron: '0 16 * * *' # Runs at 4PM UTC every day +# workflow_dispatch: +# inputs: +# pr_or_branch: +# description: 'pull request number or branch name' +# required: true +# default: 'main' + + # FOR TESTING ON GITHUB ONLY. WILL BE REMOVED. + push: + branches: + - courtneypacheco-add-xl-e2e-job env: TMPDIR: /home/tmp + PHASE1_TRAINING_LOG_NAME: "phase-1-training-log-large.jsonl" + PHASE2_TRAINING_LOG_NAME: "phase-2-training-log-large.jsonl" jobs: start-large-ec2-runner: @@ -206,7 +214,7 @@ jobs: - name: Upload training logs Phase 1 uses: actions/upload-artifact@v4 with: - name: phase-1-training-log.jsonl + name: ${{ env.PHASE1_TRAINING_LOG_NAME }} path: ./instructlab/phase-1-training-log.jsonl retention-days: 1 overwrite: true @@ -214,7 +222,7 @@ jobs: - name: Upload training logs Phase 2 uses: actions/upload-artifact@v4 with: - name: phase-2-training-log.jsonl + name: ${{ env.PHASE2_TRAINING_LOG_NAME }} path: ./instructlab/phase-2-training-log.jsonl retention-days: 1 overwrite: true @@ -338,14 +346,14 @@ jobs: id: phase-1-download-logs uses: actions/download-artifact@v4 with: - name: phase-1-training-log.jsonl + name: ${{ env.PHASE1_TRAINING_LOG_NAME }} path: downloaded-data - name: Download loss data Phase 2 id: 
phase-2-download-logs
        uses: actions/download-artifact@v4
        with:
-          name: phase-2-training-log.jsonl
+          name: ${{ env.PHASE2_TRAINING_LOG_NAME }}
          path: downloaded-data
 
      - name: Checkout instructlab/training
@@ -366,7 +374,7 @@
      continue-on-error: true
      run: |
        python training/scripts/create-loss-graph.py \
-          --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
+          --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
          --output-file "./phase-1-test.md" \
          --phase "1" \
          --aws-region "${{ vars.AWS_REGION }}" \
@@ -381,7 +389,7 @@
      continue-on-error: true
      run: |
        python training/scripts/create-loss-graph.py \
-          --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
+          --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
          --output-file "./phase-2-test.md" \
          --phase "2" \
          --aws-region "${{ vars.AWS_REGION }}" \
diff --git a/.github/workflows/e2e-nvidia-l40s-x8.yml b/.github/workflows/e2e-nvidia-l40s-x8.yml
new file mode 100644
index 00000000..425faa61
--- /dev/null
+++ b/.github/workflows/e2e-nvidia-l40s-x8.yml
@@ -0,0 +1,446 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: E2E (NVIDIA L40S x8)
+
+on:
+ ### WILL BE UNCOMMENTED BEFORE MERGE
+ # schedule:
+ # - cron: '0 6 * * *' # Runs at 6AM UTC every day
+ # workflow_dispatch:
+ # inputs:
+ # pr_or_branch:
+ # description: 'pull request number or branch name'
+ # required: true
+ # default: 'main'
+
+ # FOR TESTING ON GITHUB ONLY. WILL BE REMOVED. 
+ push: + branches: + - courtneypacheco-add-xl-e2e-job + +env: + TMPDIR: /home/tmp + PHASE1_TRAINING_LOG_NAME: "phase-1-training-log-xlarge.jsonl" + PHASE2_TRAINING_LOG_NAME: "phase-2-training-log-xlarge.jsonl" + +jobs: + start-xlarge-ec2-runner: + runs-on: ubuntu-latest + outputs: + label: ${{ steps.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + steps: + - name: "Harden Runner" + # v2.10.1 + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f + with: + egress-policy: audit + + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + # We need to free disk space to avoid the "No space left on device" error on our GH runner + - name: Free disk space + uses: ./.github/actions/free-disk-space + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ vars.AWS_REGION }} + + - name: Start EC2 runner + id: start-ec2-runner + uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7 + with: + mode: start + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + ec2-image-id: ${{ vars.AWS_EC2_AMI }} + ec2-instance-type: g6e.48xlarge + subnet-id: subnet-024298cefa3bedd61 + security-group-id: sg-06300447c4a5fbef3 + iam-role-name: instructlab-ci-runner + aws-resource-tags: > + [ + {"Key": "Name", "Value": "instructlab-ci-github-xlarge-runner"}, + {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, + {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, + {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} + ] + + e2e-xlarge-test: + needs: + - start-xlarge-ec2-runner + runs-on: ${{ needs.start-xlarge-ec2-runner.outputs.label 
}} + + permissions: + pull-requests: write + + steps: + - name: "Harden Runner" + # v2.10.1 + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f + with: + egress-policy: audit + + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + # Remove unused packages that come with CentOS by default + - name: Free disk space + uses: ./.github/actions/free-disk-space + + - name: Install Packages + run: | + cat /etc/os-release + mkdir -p "${TMPDIR}" + sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel + + - name: Checkout instructlab/instructlab + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: "instructlab/instructlab" + path: "instructlab" + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + - name: Checkout instructlab/training + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: "instructlab/training" + path: "training" + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + - name: Determine if pr_or_branch is a PR number + id: check_pr + run: | + PR_OR_BRANCH=${{ github.event.inputs.pr_or_branch || 'main' }} # Default to 'main' if not set + if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then + echo "is_pr=true" >> "$GITHUB_OUTPUT" + else + echo "is_pr=false" >> "$GITHUB_OUTPUT" + fi + echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT" + + - name: Check if gh cli is installed + id: gh_cli + run: | + if command -v gh &> /dev/null ; then + echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT" + else + echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT" + fi + + - name: Install gh CLI + if: steps.gh_cli.outputs.gh_cli_installed == 'false' + run: | + sudo dnf install 'dnf-command(config-manager)' -y + sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo + sudo dnf install gh 
--repo gh-cli -y + + - name: test gh CLI + run: | + gh --version + + - name: set default repo + working-directory: ./training + run: | + gh repo set-default ${{ github.server_url }}/${{ github.repository }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Add comment to PR + if: steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training + run: | + gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Fetch and checkout PR + if: steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training + run: | + gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Checkout branch + if: steps.check_pr.outputs.is_pr == 'false' + working-directory: ./training + run: | + git checkout ${{ steps.check_pr.outputs.pr_or_branch }} + + + - name: Install ilab + working-directory: ./instructlab + run: | + export CUDA_HOME="/usr/local/cuda" + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" + export PATH="$PATH:$CUDA_HOME/bin" + python3.11 -m venv --upgrade-deps venv + . venv/bin/activate + nvidia-smi + python3.11 -m pip cache remove llama_cpp_python + + CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install . + + # https://github.com/instructlab/instructlab/issues/1821 + # install with Torch and build dependencies installed + python3.11 -m pip install packaging wheel setuptools-scm + python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt + + - name: Update instructlab-training library + working-directory: ./training + run: | + . ../instructlab/venv/bin/activate + pip install . 
+ pip install .[cuda]
+
+ - name: Check disk before tests
+ run: |
+ df -h
+
+ - name: Run e2e test
+ working-directory: ./instructlab
+ env:
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
+ run: |
+ . venv/bin/activate
+
+ # set preserve to true so we can retain the logs
+ ./scripts/e2e-ci.sh -xp
+
+ # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
+ # and we know that it will be written into a directory created by `mktemp -d`.
+ # Given this information, we can use the following command to find the file:
+ log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
+ phase_num=1;
+ for log_file in $log_files; do
+ mv "${log_file}" phase-${phase_num}-training-log.jsonl
+ ((phase_num++))
+ done
+
+ - name: Check disk after tests
+ run: |
+ df -h
+
+ - name: Upload training logs Phase 1
+ uses: actions/upload-artifact@v4
+ with:
+ name: ${{ env.PHASE1_TRAINING_LOG_NAME }}
+ path: ./instructlab/phase-1-training-log.jsonl
+ retention-days: 1
+ overwrite: true
+
+ - name: Upload training logs Phase 2
+ uses: actions/upload-artifact@v4
+ with:
+ name: ${{ env.PHASE2_TRAINING_LOG_NAME }}
+ path: ./instructlab/phase-2-training-log.jsonl
+ retention-days: 1
+ overwrite: true
+
+ - name: Add comment to PR if the workflow failed
+ if: failure() && steps.check_pr.outputs.is_pr == 'true'
+ working-directory: ./training
+ run: |
+ gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate." 
+ env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Add comment to PR if the workflow succeeded + if: success() && steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training + run: | + gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Post job results to Slack if the workflow failed + if: failure() && steps.check_pr.outputs.is_pr == 'false' + id: slack-report-failure + uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + with: + token: ${{ secrets.SON_OF_JEEVES_TOKEN }} + method: chat.postMessage + payload: | + # Slack channel id, channel name, or user id to post message. + # See also: https://api.slack.com/methods/chat.postMessage#channels + # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs. + channel: 'e2e-ci-results' + text: "*e2e-nvidia-l40s-x8* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + - name: Post job results to Slack if the workflow succeeded + if: success() && steps.check_pr.outputs.is_pr == 'false' + id: slack-report-success + uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + with: + token: ${{ secrets.SON_OF_JEEVES_TOKEN }} + method: chat.postMessage + payload: | + # Slack channel id, channel name, or user id to post message. + # See also: https://api.slack.com/methods/chat.postMessage#channels + # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs. 
+ channel: 'e2e-ci-results' + text: "*e2e-nvidia-l40s-x8* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + - name: Send Discord notification for failure + if: failure() && steps.check_pr.outputs.is_pr == 'false' + uses: sarisia/actions-status-discord@65843b6a7d18626c252a055e247ccad1f41b4004 # v1.15.1 + with: + webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }} + status: ${{ job.status }} + title: "e2e-nvidia-l40s-x8" + description: | + Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **with failures** ❌ + Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details. + color: 0xCB2431 # Red color for failure + + - name: Send Discord notification for success + if: success() && steps.check_pr.outputs.is_pr == 'false' + uses: sarisia/actions-status-discord@65843b6a7d18626c252a055e247ccad1f41b4004 # v1.15.1 + with: + webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }} + status: ${{ job.status }} + title: "e2e-nvidia-l40s-x8" + description: | + Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **successfully** ✅ + Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details. 
+ color: 0x28A745 # Green color for success
+
+ stop-xlarge-ec2-runner:
+ needs:
+ - start-xlarge-ec2-runner
+ - e2e-xlarge-test
+ runs-on: ubuntu-latest
+ if: ${{ always() }}
+ steps:
+ - name: "Harden Runner"
+ # v2.10.1
+ uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f
+ with:
+ egress-policy: audit
+
+ - name: Configure AWS credentials
+ uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+ with:
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws-region: ${{ vars.AWS_REGION }}
+
+ - name: Stop EC2 runner
+ uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
+ with:
+ mode: stop
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+ label: ${{ needs.start-xlarge-ec2-runner.outputs.label }}
+ ec2-instance-id: ${{ needs.start-xlarge-ec2-runner.outputs.ec2-instance-id }}
+
+ loss-graphs:
+ needs:
+ - stop-xlarge-ec2-runner
+ runs-on: ubuntu-latest
+ if: ${{ always() }}
+ steps:
+ - name: "Harden Runner"
+ # v2.10.1
+ uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f
+ with:
+ egress-policy: audit
+
+ - name: Configure AWS credentials
+ uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+ with:
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws-region: ${{ vars.AWS_REGION }}
+
+ - name: Download loss data Phase 1
+ id: phase-1-download-logs
+ uses: actions/download-artifact@v4
+ with:
+ name: ${{ env.PHASE1_TRAINING_LOG_NAME }}
+ path: downloaded-data
+
+ - name: Download loss data Phase 2
+ id: phase-2-download-logs
+ uses: actions/download-artifact@v4
+ with:
+ name: ${{ env.PHASE2_TRAINING_LOG_NAME }}
+ path: downloaded-data
+
+ - name: Checkout instructlab/training
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ with:
+ 
repository: "instructlab/training"
+ path: "training"
+ fetch-depth: 0
+
+ - name: Install dependencies
+ working-directory: ./training
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements-dev.txt
+
+ - name: Try to upload Phase 1 to s3
+ id: phase-1-upload-s3
+ continue-on-error: true
+ run: |
+ python training/scripts/create-loss-graph.py \
+ --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
+ --output-file "./phase-1-test.md" \
+ --phase "1" \
+ --aws-region "${{ vars.AWS_REGION }}" \
+ --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+ --base-branch "${GITHUB_REF##*/}" \
+ --head-sha "${{ github.sha }}" \
+ --pr-number "${{ github.event.number }}" \
+ --origin-repository "${{ github.repository }}"
+
+ - name: Try to upload Phase 2 to s3
+ id: phase-2-upload-s3
+ continue-on-error: true
+ run: |
+ python training/scripts/create-loss-graph.py \
+ --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
+ --output-file "./phase-2-test.md" \
+ --phase "2" \
+ --aws-region "${{ vars.AWS_REGION }}" \
+ --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+ --base-branch "${GITHUB_REF##*/}" \
+ --head-sha "${{ github.sha }}" \
+ --pr-number "${{ github.event.number }}" \
+ --origin-repository "${{ github.repository }}"
+
+ - name: Check Phase 1 S3 upload status for success
+ if: steps.phase-1-upload-s3.outcome == 'success'
+ run: |
+ echo "Uploaded Phase 1 loss graph to S3."
+ cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"
+
+ - name: Check Phase 2 S3 upload status for success
+ if: steps.phase-2-upload-s3.outcome == 'success'
+ run: |
+ echo "Uploaded Phase 2 loss graph to S3."
+ cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"
+
+ - name: Check Phase 1 S3 upload status for failure
+ if: steps.phase-1-upload-s3.outcome == 'failure'
+ run: |
+ echo "::warning::Failed to upload Phase 1 loss graph to S3. 
This won't block the workflow, but you may want to investigate." + echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" + + - name: Check Phase 2 S3 upload status for failure + if: steps.phase-2-upload-s3.outcome == 'failure' + run: | + echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate." + echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"