Adjust e2e tests to use GPU

project-codeflare · Jul 10, 2024 · 7f7c7c4 · 7f7c7c4
1 parent 5ce0b2c
commit 7f7c7c4
Show file tree

Hide file tree

Showing 4 changed files with 23 additions and 29 deletions.
diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml
@@ -24,29 +24,15 @@ concurrency:
   group: ${{ github.head_ref }}-${{ github.workflow }}
   cancel-in-progress: true
 
+env:
+  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
+
 jobs:
   kubernetes:
 
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-20.04-4core-gpu
 
     steps:
-      - name: Cleanup
-        run: |
-          ls -lart
-          echo "Initial status:"
-          df -h
-          echo "Cleaning up resources:"
-          sudo swapoff -a
-          sudo rm -f /swapfile
-          sudo apt clean
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf "/usr/local/share/boost"
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-          docker rmi $(docker image ls -aq)
-          echo "Final status:"
-          df -h
-
       - name: Checkout code
         uses: actions/checkout@v4
         with:
@@ -55,8 +41,8 @@ jobs:
       - name: Checkout common repo code
         uses: actions/checkout@v4
         with:
-          repository: 'project-codeflare/codeflare-common'
-          ref: 'main'
+          repository: 'sutaakar/codeflare-common'
+          ref: 'add-user-fix'
           path: 'common'
 
       - name: Checkout CodeFlare operator repository
@@ -82,19 +68,23 @@ jobs:
           python-version: '3.9'
           cache: 'pip' # caching pip dependencies
 
+      - name: Setup NVidia GPU environment for KinD
+        uses: ./common/github-actions/nvidia-gpu-setup
+
       - name: Setup and start KinD cluster
         uses: ./common/github-actions/kind
 
+      - name: Install NVidia GPU operator for KinD
+        uses: ./common/github-actions/nvidia-gpu-operator
+
       - name: Deploy CodeFlare stack
         id: deploy
         run: |
           cd codeflare-operator
           echo Setting up CodeFlare stack
           make setup-e2e
           echo Deploying CodeFlare operator
-          IMG="${REGISTRY_ADDRESS}"/codeflare-operator
-          make image-push -e IMG="${IMG}"
-          make deploy -e IMG="${IMG}" -e ENV="e2e"
+          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
           kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
           cd ..
 
@@ -103,9 +93,6 @@ jobs:
         with:
           user-name: sdk-user
 
-      - name: Add kueue resources
-        run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml"
-
       - name: Configure RBAC for sdk user with limited permissions
         run: |
           kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses

diff --git a/tests/e2e/mnist.py b/tests/e2e/mnist.py
@@ -32,6 +32,9 @@
 print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
 print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))
 
+print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
+ACCELERATOR = os.getenv("ACCELERATOR")
+
 
 class LitMNIST(LightningModule):
     def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
@@ -149,7 +152,7 @@ def test_dataloader(self):
 
 # Initialize a trainer
 trainer = Trainer(
-    accelerator="auto",
+    accelerator=ACCELERATOR,
     # devices=1 if torch.cuda.is_available() else None,  # limiting for iPython runs
     max_epochs=3,
     callbacks=[TQDMProgressBar(refresh_rate=20)],

diff --git a/tests/e2e/mnist_raycluster_sdk_kind_test.py b/tests/e2e/mnist_raycluster_sdk_kind_test.py
@@ -40,7 +40,8 @@ def run_mnist_raycluster_sdk_kind(self):
                 worker_cpu_requests="500m",
                 worker_cpu_limits=1,
                 worker_memory_requests=1,
-                worker_memory_limits=2,
+                worker_memory_limits=4,
+                worker_extended_resource_requests={"nvidia.com/gpu": 1},
                 image=ray_image,
                 write_to_file=True,
                 verify_tls=False,
@@ -70,7 +71,10 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
             runtime_env={
                 "working_dir": "./tests/e2e/",
                 "pip": "./tests/e2e/mnist_pip_requirements.txt",
+                "env_vars": {"ACCELERATOR": "gpu"},
             },
+            # NOTE(review): EntrypointNumGpus was reported as not working properly on a KinD cluster with GPU (EntrypointNumCpus seemed reliable), yet entrypoint_num_gpus is what is passed below - confirm which is intended
+            entrypoint_num_gpus=1,
         )
         print(f"Submitted job with ID: {submission_id}")
         done = False

diff --git a/tests/e2e/support.py b/tests/e2e/support.py
@@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
                             "resources": [
                                 {"name": "cpu", "nominalQuota": 9},
                                 {"name": "memory", "nominalQuota": "36Gi"},
-                                {"name": "nvidia.com/gpu", "nominalQuota": 0},
+                                {"name": "nvidia.com/gpu", "nominalQuota": 1},
                             ],
                         }
                     ],