From 5e2f1c690864567c26e0d4a8e33fff7ad81b79e1 Mon Sep 17 00:00:00 2001 From: Courtney Pacheco <6019922+courtneypacheco@users.noreply.github.com> Date: Thu, 16 Jan 2025 09:15:51 -0500 Subject: [PATCH] Add XL e2e nightly CI job Add a new XL e2e nightly CI job that triggers every day at 6am UTC. Also update the existing large CI job so that uploaded files are not overwritten. Signed-off-by: Courtney Pacheco <6019922+courtneypacheco@users.noreply.github.com> --- .github/actions/free-disk-space/action.yml | 70 ++++ .github/workflows/e2e-nvidia-l40s-x4.yml | 36 +- .github/workflows/e2e-nvidia-l40s-x8.yml | 446 +++++++++++++++++++++ 3 files changed, 538 insertions(+), 14 deletions(-) create mode 100644 .github/actions/free-disk-space/action.yml create mode 100644 .github/workflows/e2e-nvidia-l40s-x8.yml diff --git a/.github/actions/free-disk-space/action.yml b/.github/actions/free-disk-space/action.yml new file mode 100644 index 00000000..43f8bccc --- /dev/null +++ b/.github/actions/free-disk-space/action.yml @@ -0,0 +1,70 @@ +name: 'Free Disk Space' +description: 'Frees disk space on the runner' +runs: + using: "composite" + steps: + - name: Print disk space before cleanup + run: | + df -h + shell: bash + - name: Free Disk Space Linux + if: runner.os == 'Linux' + run: | + # Determine if we have Ubuntu, CentOS, or other distro as our runner OS + os_id=$(grep '^ID=' /etc/os-release | cut -d "=" -f2) + echo "Detected OS distro as: ${os_id}" + + # Sometimes `docker` is not installed, so only remove images if we need to. + if command -v docker 2>&1 >/dev/null ; then + sudo docker rmi "$(docker image ls -aq) -f" >/dev/null 2>&1 || true + fi + + # Remove Android, .NET, and Haskell runtimes + sudo rm -rf \ + /usr/local/lib/android \ + /usr/share/dotnet \ + /opt/ghc \ + /usr/local/.ghcup \ + /usr/local/share/powershell \ + /usr/share/swift \ + /usr/lib/jvm || true + + printWarningMessage () { + echo "[warning] Failed to remove '$1', perhaps because it doesn't exist. Ignoring..." 
+ }
+
+ # Remove large packages we don't use.
+ echo "Attempting to remove unused ${os_id} packages..."
+ if [[ "${os_id}" == "ubuntu" ]]; then
+ sudo apt-get remove -y '^mysql-.*' || printWarningMessage '^mysql-.*'
+ sudo apt-get remove -y '^dotnet-.*' --fix-missing || printWarningMessage '^dotnet-.*'
+ sudo apt-get remove -y 'php.*' --fix-missing || printWarningMessage 'php.*'
+ sudo apt-get remove -y '^mongodb-.*' --fix-missing || printWarningMessage '^mongodb-.*'
+ sudo apt-get remove -y '^llvm-.*' --fix-missing || printWarningMessage '^llvm-.*'
+ sudo apt-get remove -y google-cloud-sdk --fix-missing || printWarningMessage 'google-cloud-sdk'
+ sudo apt-get remove -y google-cloud-cli --fix-missing || printWarningMessage 'google-cloud-cli'
+ sudo apt-get autoremove -y >/dev/null 2>&1
+ sudo apt-get autoclean -y >/dev/null 2>&1
+ elif [[ "${os_id}" == "centos" ]]; then
+ sudo dnf -y remove 'mysql-*' || printWarningMessage 'mysql-*'
+ sudo dnf -y remove 'dotnet-*' || printWarningMessage 'dotnet-*'
+ sudo dnf -y remove 'php-*' || printWarningMessage 'php-*'
+ sudo dnf -y remove 'mongodb-*' || printWarningMessage 'mongodb-*'
+ sudo dnf -y remove 'llvm-*' || printWarningMessage 'llvm-*'
+ sudo dnf -y remove google-cloud-sdk || printWarningMessage 'google-cloud-sdk'
+ sudo dnf -y remove google-cloud-cli || printWarningMessage 'google-cloud-cli'
+ sudo dnf clean all
+ sudo rm -rf /var/cache/dnf*
+ else
+ echo "Unrecognized OS '${os_id}'. Skipping large package cleanup, as this logic has not been implemented for ${os_id}." 
+ fi + shell: bash + - name: Free Disk Space MacOS + if: runner.os == 'macOS' + run: | + sudo rm -rf /System/Volumes/Data/Applications/Xcode_15* + shell: bash + - name: Print disk space after cleanup + run: | + df -h + shell: bash diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml index 0e3ec259..3f83c53f 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -3,17 +3,25 @@ name: E2E (NVIDIA L40S x4) on: - schedule: - - cron: '0 16 * * *' # Runs at 4PM UTC every day - workflow_dispatch: - inputs: - pr_or_branch: - description: 'pull request number or branch name' - required: true - default: 'main' +### WILL BE UNCOMMENTED BEFORE MERGE +# schedule: +# - cron: '0 16 * * *' # Runs at 4PM UTC every day +# workflow_dispatch: +# inputs: +# pr_or_branch: +# description: 'pull request number or branch name' +# required: true +# default: 'main' + + # FOR TESTING ON GITHUB ONLY. WILL BE REMOVED. + push: + branches: + - courtneypacheco-add-xl-e2e-job env: TMPDIR: /home/tmp + PHASE1_TRAINING_LOG_NAME: "phase-1-training-log-large.jsonl" + PHASE2_TRAINING_LOG_NAME: "phase-2-training-log-large.jsonl" jobs: start-large-ec2-runner: @@ -206,7 +214,7 @@ jobs: - name: Upload training logs Phase 1 uses: actions/upload-artifact@v4 with: - name: phase-1-training-log.jsonl + name: ${{ env.PHASE1_TRAINING_LOG_NAME }} path: ./instructlab/phase-1-training-log.jsonl retention-days: 1 overwrite: true @@ -214,7 +222,7 @@ jobs: - name: Upload training logs Phase 2 uses: actions/upload-artifact@v4 with: - name: phase-2-training-log.jsonl + name: ${{ env.PHASE2_TRAINING_LOG_NAME }} path: ./instructlab/phase-2-training-log.jsonl retention-days: 1 overwrite: true @@ -338,14 +346,14 @@ jobs: id: phase-1-download-logs uses: actions/download-artifact@v4 with: - name: phase-1-training-log.jsonl + name: ${{ env.PHASE1_TRAINING_LOG_NAME }} path: downloaded-data - name: Download loss data Phase 2 id: 
phase-2-download-logs
        uses: actions/download-artifact@v4
        with:
-          name: phase-2-training-log.jsonl
+          name: ${{ env.PHASE2_TRAINING_LOG_NAME }}
          path: downloaded-data
 
      - name: Checkout instructlab/training
@@ -366,7 +374,7 @@
      continue-on-error: true
      run: |
        python training/scripts/create-loss-graph.py \
-          --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
+          --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
          --output-file "./phase-1-test.md" \
          --phase "1" \
          --aws-region "${{ vars.AWS_REGION }}" \
@@ -381,7 +389,7 @@
      continue-on-error: true
      run: |
        python training/scripts/create-loss-graph.py \
-          --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
+          --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
          --output-file "./phase-2-test.md" \
          --phase "2" \
          --aws-region "${{ vars.AWS_REGION }}" \
diff --git a/.github/workflows/e2e-nvidia-l40s-x8.yml b/.github/workflows/e2e-nvidia-l40s-x8.yml
new file mode 100644
index 00000000..425faa61
--- /dev/null
+++ b/.github/workflows/e2e-nvidia-l40s-x8.yml
@@ -0,0 +1,446 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: E2E (NVIDIA L40S x8)
+
+on:
+ ### WILL BE UNCOMMENTED BEFORE MERGE
+ # schedule:
+ # - cron: '0 6 * * *' # Runs at 6AM UTC every day
+ # workflow_dispatch:
+ # inputs:
+ # pr_or_branch:
+ # description: 'pull request number or branch name'
+ # required: true
+ # default: 'main'
+
+ # FOR TESTING ON GITHUB ONLY. WILL BE REMOVED. 
+ push: + branches: + - courtneypacheco-add-xl-e2e-job + +env: + TMPDIR: /home/tmp + PHASE1_TRAINING_LOG_NAME: "phase-1-training-log-xlarge.jsonl" + PHASE2_TRAINING_LOG_NAME: "phase-2-training-log-xlarge.jsonl" + +jobs: + start-xlarge-ec2-runner: + runs-on: ubuntu-latest + outputs: + label: ${{ steps.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + steps: + - name: "Harden Runner" + # v2.10.1 + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f + with: + egress-policy: audit + + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + # We need to free disk space to avoid the "No space left on device" error on our GH runner + - name: Free disk space + uses: ./.github/actions/free-disk-space + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ vars.AWS_REGION }} + + - name: Start EC2 runner + id: start-ec2-runner + uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7 + with: + mode: start + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + ec2-image-id: ${{ vars.AWS_EC2_AMI }} + ec2-instance-type: g6e.48xlarge + subnet-id: subnet-024298cefa3bedd61 + security-group-id: sg-06300447c4a5fbef3 + iam-role-name: instructlab-ci-runner + aws-resource-tags: > + [ + {"Key": "Name", "Value": "instructlab-ci-github-xlarge-runner"}, + {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, + {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, + {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} + ] + + e2e-xlarge-test: + needs: + - start-xlarge-ec2-runner + runs-on: ${{ needs.start-xlarge-ec2-runner.outputs.label 
}} + + permissions: + pull-requests: write + + steps: + - name: "Harden Runner" + # v2.10.1 + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f + with: + egress-policy: audit + + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + # Remove unused packages that come with CentOS by default + - name: Free disk space + uses: ./.github/actions/free-disk-space + + - name: Install Packages + run: | + cat /etc/os-release + mkdir -p "${TMPDIR}" + sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel + + - name: Checkout instructlab/instructlab + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: "instructlab/instructlab" + path: "instructlab" + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + - name: Checkout instructlab/training + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: "instructlab/training" + path: "training" + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + - name: Determine if pr_or_branch is a PR number + id: check_pr + run: | + PR_OR_BRANCH=${{ github.event.inputs.pr_or_branch || 'main' }} # Default to 'main' if not set + if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then + echo "is_pr=true" >> "$GITHUB_OUTPUT" + else + echo "is_pr=false" >> "$GITHUB_OUTPUT" + fi + echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT" + + - name: Check if gh cli is installed + id: gh_cli + run: | + if command -v gh &> /dev/null ; then + echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT" + else + echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT" + fi + + - name: Install gh CLI + if: steps.gh_cli.outputs.gh_cli_installed == 'false' + run: | + sudo dnf install 'dnf-command(config-manager)' -y + sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo + sudo dnf install gh 
--repo gh-cli -y + + - name: test gh CLI + run: | + gh --version + + - name: set default repo + working-directory: ./training + run: | + gh repo set-default ${{ github.server_url }}/${{ github.repository }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Add comment to PR + if: steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training + run: | + gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Fetch and checkout PR + if: steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training + run: | + gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Checkout branch + if: steps.check_pr.outputs.is_pr == 'false' + working-directory: ./training + run: | + git checkout ${{ steps.check_pr.outputs.pr_or_branch }} + + + - name: Install ilab + working-directory: ./instructlab + run: | + export CUDA_HOME="/usr/local/cuda" + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" + export PATH="$PATH:$CUDA_HOME/bin" + python3.11 -m venv --upgrade-deps venv + . venv/bin/activate + nvidia-smi + python3.11 -m pip cache remove llama_cpp_python + + CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install . + + # https://github.com/instructlab/instructlab/issues/1821 + # install with Torch and build dependencies installed + python3.11 -m pip install packaging wheel setuptools-scm + python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt + + - name: Update instructlab-training library + working-directory: ./training + run: | + . ../instructlab/venv/bin/activate + pip install . 
+ pip install .[cuda]
+
+ - name: Check disk before tests
+ run: |
+ df -h
+
+ - name: Run e2e test
+ working-directory: ./instructlab
+ env:
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
+ run: |
+ . venv/bin/activate
+
+ # set preserve to true so we can retain the logs
+ ./scripts/e2e-ci.sh -xp
+
+ # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
+ # and we know that it will be written into a directory created by `mktemp -d`.
+ # Given this information, we can use the following command to find the file:
+ log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
+ phase_num=1;
+ for log_file in $log_files; do
+ mv "${log_file}" phase-${phase_num}-training-log.jsonl
+ ((phase_num++))
+ done
+
+ - name: Check disk after tests
+ run: |
+ df -h
+
+ - name: Upload training logs Phase 1
+ uses: actions/upload-artifact@v4
+ with:
+ name: ${{ env.PHASE1_TRAINING_LOG_NAME }}
+ path: ./instructlab/phase-1-training-log.jsonl
+ retention-days: 1
+ overwrite: true
+
+ - name: Upload training logs Phase 2
+ uses: actions/upload-artifact@v4
+ with:
+ name: ${{ env.PHASE2_TRAINING_LOG_NAME }}
+ path: ./instructlab/phase-2-training-log.jsonl
+ retention-days: 1
+ overwrite: true
+
+ - name: Add comment to PR if the workflow failed
+ if: failure() && steps.check_pr.outputs.is_pr == 'true'
+ working-directory: ./training
+ run: |
+ gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate." 
+ env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Add comment to PR if the workflow succeeded + if: success() && steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training + run: | + gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Post job results to Slack if the workflow failed + if: failure() && steps.check_pr.outputs.is_pr == 'false' + id: slack-report-failure + uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + with: + token: ${{ secrets.SON_OF_JEEVES_TOKEN }} + method: chat.postMessage + payload: | + # Slack channel id, channel name, or user id to post message. + # See also: https://api.slack.com/methods/chat.postMessage#channels + # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs. + channel: 'e2e-ci-results' + text: "*e2e-nvidia-l40s-x8* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + - name: Post job results to Slack if the workflow succeeded + if: success() && steps.check_pr.outputs.is_pr == 'false' + id: slack-report-success + uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + with: + token: ${{ secrets.SON_OF_JEEVES_TOKEN }} + method: chat.postMessage + payload: | + # Slack channel id, channel name, or user id to post message. + # See also: https://api.slack.com/methods/chat.postMessage#channels + # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs. 
+ channel: 'e2e-ci-results' + text: "*e2e-nvidia-l40s-x8* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + - name: Send Discord notification for failure + if: failure() && steps.check_pr.outputs.is_pr == 'false' + uses: sarisia/actions-status-discord@65843b6a7d18626c252a055e247ccad1f41b4004 # v1.15.1 + with: + webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }} + status: ${{ job.status }} + title: "e2e-nvidia-l40s-x8" + description: | + Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **with failures** ❌ + Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details. + color: 0xCB2431 # Red color for failure + + - name: Send Discord notification for success + if: success() && steps.check_pr.outputs.is_pr == 'false' + uses: sarisia/actions-status-discord@65843b6a7d18626c252a055e247ccad1f41b4004 # v1.15.1 + with: + webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }} + status: ${{ job.status }} + title: "e2e-nvidia-l40s-x8" + description: | + Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **successfully** ✅ + Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details. 
+ color: 0x28A745 # Green color for success
+
+ stop-xlarge-ec2-runner:
+ needs:
+ - start-xlarge-ec2-runner
+ - e2e-xlarge-test
+ runs-on: ubuntu-latest
+ if: ${{ always() }}
+ steps:
+ - name: "Harden Runner"
+ # v2.10.1
+ uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f
+ with:
+ egress-policy: audit
+
+ - name: Configure AWS credentials
+ uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+ with:
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws-region: ${{ vars.AWS_REGION }}
+
+ - name: Stop EC2 runner
+ uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
+ with:
+ mode: stop
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+ label: ${{ needs.start-xlarge-ec2-runner.outputs.label }}
+ ec2-instance-id: ${{ needs.start-xlarge-ec2-runner.outputs.ec2-instance-id }}
+
+ loss-graphs:
+ needs:
+ - stop-xlarge-ec2-runner
+ runs-on: ubuntu-latest
+ if: ${{ always() }}
+ steps:
+ - name: "Harden Runner"
+ # v2.10.1
+ uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f
+ with:
+ egress-policy: audit
+
+ - name: Configure AWS credentials
+ uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+ with:
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws-region: ${{ vars.AWS_REGION }}
+
+ - name: Download loss data Phase 1
+ id: phase-1-download-logs
+ uses: actions/download-artifact@v4
+ with:
+ name: ${{ env.PHASE1_TRAINING_LOG_NAME }}
+ path: downloaded-data
+
+ - name: Download loss data Phase 2
+ id: phase-2-download-logs
+ uses: actions/download-artifact@v4
+ with:
+ name: ${{ env.PHASE2_TRAINING_LOG_NAME }}
+ path: downloaded-data
+
+ - name: Checkout instructlab/training
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ with:
+ 
repository: "instructlab/training"
+ path: "training"
+ fetch-depth: 0
+
+ - name: Install dependencies
+ working-directory: ./training
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements-dev.txt
+
+ - name: Try to upload Phase 1 to s3
+ id: phase-1-upload-s3
+ continue-on-error: true
+ run: |
+ python training/scripts/create-loss-graph.py \
+ --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
+ --output-file "./phase-1-test.md" \
+ --phase "1" \
+ --aws-region "${{ vars.AWS_REGION }}" \
+ --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+ --base-branch "${GITHUB_REF##*/}" \
+ --head-sha "${{ github.sha }}" \
+ --pr-number "${{ github.event.number }}" \
+ --origin-repository "${{ github.repository }}"
+
+ - name: Try to upload Phase 2 to s3
+ id: phase-2-upload-s3
+ continue-on-error: true
+ run: |
+ python training/scripts/create-loss-graph.py \
+ --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
+ --output-file "./phase-2-test.md" \
+ --phase "2" \
+ --aws-region "${{ vars.AWS_REGION }}" \
+ --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+ --base-branch "${GITHUB_REF##*/}" \
+ --head-sha "${{ github.sha }}" \
+ --pr-number "${{ github.event.number }}" \
+ --origin-repository "${{ github.repository }}"
+
+ - name: Check Phase 1 S3 upload status for success
+ if: steps.phase-1-upload-s3.outcome == 'success'
+ run: |
+ echo "Uploaded Phase 1 loss graph to S3."
+ cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"
+
+ - name: Check Phase 2 S3 upload status for success
+ if: steps.phase-2-upload-s3.outcome == 'success'
+ run: |
+ echo "Uploaded Phase 2 loss graph to S3."
+ cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"
+
+ - name: Check Phase 1 S3 upload status for failure
+ if: steps.phase-1-upload-s3.outcome == 'failure'
+ run: |
+ echo "::warning::Failed to upload Phase 1 loss graph to S3. 
This won't block the workflow, but you may want to investigate." + echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" + + - name: Check Phase 2 S3 upload status for failure + if: steps.phase-2-upload-s3.outcome == 'failure' + run: | + echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate." + echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"