From a8a0b67b84631b20e4250cd329ac86dc89f013ff Mon Sep 17 00:00:00 2001
From: Karel Suta <ksuta@redhat.com>
Date: Tue, 9 Jul 2024 15:27:43 +0200
Subject: [PATCH] Adjust e2e tests to use GPU

---
 .github/workflows/e2e_tests.yaml            | 35 ++++++++-------------
 tests/e2e/mnist.py                          |  5 ++-
 tests/e2e/mnist_raycluster_sdk_kind_test.py |  5 ++-
 tests/e2e/support.py                        |  2 +-
 4 files changed, 22 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml
index b83afb4b8..1a4842062 100644
--- a/.github/workflows/e2e_tests.yaml
+++ b/.github/workflows/e2e_tests.yaml
@@ -24,28 +24,18 @@ concurrency:
   group: ${{ github.head_ref }}-${{ github.workflow }}
   cancel-in-progress: true
 
+env:
+  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
+
 jobs:
   kubernetes:
 
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-20.04-4core-gpu
 
     steps:
-      - name: Cleanup
+      - name: Install yq
         run: |
-          ls -lart
-          echo "Initial status:"
-          df -h
-          echo "Cleaning up resources:"
-          sudo swapoff -a
-          sudo rm -f /swapfile
-          sudo apt clean
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf "/usr/local/share/boost"
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-          docker rmi $(docker image ls -aq)
-          echo "Final status:"
-          df -h
+          sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && sudo chmod +x /usr/bin/yq
 
       - name: Checkout code
         uses: actions/checkout@v4
@@ -82,9 +72,15 @@ jobs:
           python-version: '3.9'
           cache: 'pip' # caching pip dependencies
 
+      - name: Setup NVidia GPU environment for KinD
+        uses: ./common/github-actions/nvidia-gpu-setup
+
       - name: Setup and start KinD cluster
         uses: ./common/github-actions/kind
 
+      - name: Install NVidia GPU operator for KinD
+        uses: ./common/github-actions/nvidia-gpu-operator
+
       - name: Deploy CodeFlare stack
         id: deploy
         run: |
@@ -92,9 +88,7 @@ jobs:
           echo Setting up CodeFlare stack
           make setup-e2e
           echo Deploying CodeFlare operator
-          IMG="${REGISTRY_ADDRESS}"/codeflare-operator
-          make image-push -e IMG="${IMG}"
-          make deploy -e IMG="${IMG}" -e ENV="e2e"
+          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
           kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
           cd ..
 
@@ -103,9 +97,6 @@ jobs:
         with:
           user-name: sdk-user
 
-      - name: Add kueue resources
-        run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml"
-
       - name: Configure RBAC for sdk user with limited permissions
         run: |
           kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
diff --git a/tests/e2e/mnist.py b/tests/e2e/mnist.py
index 2971d9c98..55ed91eaa 100644
--- a/tests/e2e/mnist.py
+++ b/tests/e2e/mnist.py
@@ -32,6 +32,9 @@
 print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
 print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))
 
+ACCELERATOR = os.getenv("ACCELERATOR", "auto")  # fall back to "auto" (the previous hard-coded value) when unset
+print("ACCELERATOR: is ", ACCELERATOR)
+
 
 class LitMNIST(LightningModule):
     def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
@@ -149,7 +152,7 @@ def test_dataloader(self):
 
 # Initialize a trainer
 trainer = Trainer(
-    accelerator="auto",
+    accelerator=ACCELERATOR,
     # devices=1 if torch.cuda.is_available() else None,  # limiting got iPython runs
     max_epochs=3,
     callbacks=[TQDMProgressBar(refresh_rate=20)],
diff --git a/tests/e2e/mnist_raycluster_sdk_kind_test.py b/tests/e2e/mnist_raycluster_sdk_kind_test.py
index d85397221..37665c6f5 100644
--- a/tests/e2e/mnist_raycluster_sdk_kind_test.py
+++ b/tests/e2e/mnist_raycluster_sdk_kind_test.py
@@ -41,7 +41,7 @@ def run_mnist_raycluster_sdk_kind(self):
                 worker_cpu_limits=1,
                 worker_memory_requests=1,
                 worker_memory_limits=2,
-                num_worker_gpus=0,
+                num_worker_gpus=1,
                 image=ray_image,
                 write_to_file=True,
                 verify_tls=False,
@@ -71,7 +71,10 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
             runtime_env={
                 "working_dir": "./tests/e2e/",
                 "pip": "./tests/e2e/mnist_pip_requirements.txt",
+                "env_vars": {"ACCELERATOR": "gpu"},
             },
+            # NOTE(review): entrypoint_num_gpus reportedly doesn't work properly on a KinD cluster with GPU (entrypoint_num_cpus seemed reliable), yet it is set below - confirm this is intended
+            entrypoint_num_gpus=1,
         )
         print(f"Submitted job with ID: {submission_id}")
         done = False
diff --git a/tests/e2e/support.py b/tests/e2e/support.py
index d8a06bb70..3eb241536 100644
--- a/tests/e2e/support.py
+++ b/tests/e2e/support.py
@@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
                             "resources": [
                                 {"name": "cpu", "nominalQuota": 9},
                                 {"name": "memory", "nominalQuota": "36Gi"},
-                                {"name": "nvidia.com/gpu", "nominalQuota": 0},
+                                {"name": "nvidia.com/gpu", "nominalQuota": 1},
                             ],
                         }
                     ],