From 45a4ad359cc6cd06325d5d86eb3bedb89c0e0db4 Mon Sep 17 00:00:00 2001
From: Karel Suta
Date: Tue, 9 Jul 2024 15:27:43 +0200
Subject: [PATCH] Adjust e2e tests to use GPU

---
 .github/workflows/e2e_tests.yaml              | 41 +++++++------------
 pyproject.toml                                |  3 +-
 tests/e2e/local_interactive_sdk_kind_test.py  | 16 ++++++--
 tests/e2e/mnist.py                            |  5 ++-
 .../e2e/mnist_raycluster_sdk_aw_kind_test.py  | 29 +++++++++----
 tests/e2e/mnist_raycluster_sdk_kind_test.py   | 22 +++++++---
 tests/e2e/support.py                          |  2 +-
 7 files changed, 71 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml
index b83afb4b8..4699fca19 100644
--- a/.github/workflows/e2e_tests.yaml
+++ b/.github/workflows/e2e_tests.yaml
@@ -24,29 +24,15 @@ concurrency:
   group: ${{ github.head_ref }}-${{ github.workflow }}
   cancel-in-progress: true

+env:
+  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
+
 jobs:
   kubernetes:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-20.04-4core-gpu

     steps:
-      - name: Cleanup
-        run: |
-          ls -lart
-          echo "Initial status:"
-          df -h
-          echo "Cleaning up resources:"
-          sudo swapoff -a
-          sudo rm -f /swapfile
-          sudo apt clean
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf "/usr/local/share/boost"
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-          docker rmi $(docker image ls -aq)
-          echo "Final status:"
-          df -h
-
       - name: Checkout code
         uses: actions/checkout@v4
         with:
@@ -55,8 +41,8 @@ jobs:
       - name: Checkout common repo code
         uses: actions/checkout@v4
         with:
-          repository: 'project-codeflare/codeflare-common'
-          ref: 'main'
+          repository: 'sutaakar/codeflare-common'
+          ref: 'add-user-fix'
           path: 'common'

       - name: Checkout CodeFlare operator repository
@@ -82,9 +68,15 @@ jobs:
           python-version: '3.9'
           cache: 'pip' # caching pip dependencies

+      - name: Setup NVidia GPU environment for KinD
+        uses: ./common/github-actions/nvidia-gpu-setup
+
       - name: Setup and start KinD cluster
         uses: ./common/github-actions/kind

+      - name: Install NVidia GPU operator for KinD
+        uses: ./common/github-actions/nvidia-gpu-operator
+
       - name: Deploy CodeFlare stack
         id: deploy
         run: |
@@ -92,9 +84,7 @@ jobs:
           echo Setting up CodeFlare stack
           make setup-e2e
           echo Deploying CodeFlare operator
-          IMG="${REGISTRY_ADDRESS}"/codeflare-operator
-          make image-push -e IMG="${IMG}"
-          make deploy -e IMG="${IMG}" -e ENV="e2e"
+          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
           kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
           cd ..

@@ -103,9 +93,6 @@ jobs:
         with:
           user-name: sdk-user

-      - name: Add kueue resources
-        run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml"
-
       - name: Configure RBAC for sdk user with limited permissions
         run: |
           kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
@@ -135,7 +122,7 @@ jobs:
           pip install poetry
           poetry install --with test,docs
           echo "Running e2e tests..."
-          poetry run pytest -v -s ./tests/e2e -m kind > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
+          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
         env:
           GRPC_DNS_RESOLVER: "native"

diff --git a/pyproject.toml b/pyproject.toml
index 457e6de95..be225e908 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,7 @@ filterwarnings = [
 ]
 markers = [
     "kind",
-    "openshift"
+    "openshift",
+    "nvidia_gpu"
 ]
 addopts = "--timeout=900"
diff --git a/tests/e2e/local_interactive_sdk_kind_test.py b/tests/e2e/local_interactive_sdk_kind_test.py
index 8ca0bdac9..a3eb3d429 100644
--- a/tests/e2e/local_interactive_sdk_kind_test.py
+++ b/tests/e2e/local_interactive_sdk_kind_test.py
@@ -27,7 +27,16 @@ def test_local_interactives(self):
         create_kueue_resources(self)
         self.run_local_interactives()

-    def run_local_interactives(self):
+    @pytest.mark.nvidia_gpu
+    def test_local_interactives_nvidia_gpu(self):
+        self.setup_method()
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_local_interactives(number_of_gpus=1)
+
+    def run_local_interactives(
+        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
         ray_image = get_ray_image()

         cluster_name = "test-ray-cluster-li"
@@ -43,6 +52,7 @@ def run_local_interactives(self):
             worker_cpu_limits=1,
             worker_memory_requests=1,
             worker_memory_limits=2,
+            worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
             image=ray_image,
             write_to_file=True,
             verify_tls=False,
@@ -59,7 +69,7 @@ def run_local_interactives(self):
         ray.shutdown()
         ray.init(address=cluster.local_client_url(), logging_level="DEBUG")

-        @ray.remote
+        @ray.remote(num_gpus=number_of_gpus)
         def heavy_calculation_part(num_iterations):
             result = 0.0
             for i in range(num_iterations):
@@ -68,7 +78,7 @@ def heavy_calculation_part(num_iterations):
                         result += math.sin(i) * math.cos(j) * math.tan(k)
             return result

-        @ray.remote
+        @ray.remote(num_gpus=number_of_gpus)
         def heavy_calculation(num_iterations):
             results = ray.get(
                 [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
diff --git a/tests/e2e/mnist.py b/tests/e2e/mnist.py
index 2971d9c98..55ed91eaa 100644
--- a/tests/e2e/mnist.py
+++ b/tests/e2e/mnist.py
@@ -32,6 +32,9 @@
 print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
 print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))

+print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
+ACCELERATOR = os.getenv("ACCELERATOR")
+

 class LitMNIST(LightningModule):
     def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
@@ -149,7 +152,7 @@ def test_dataloader(self):

 # Initialize a trainer
 trainer = Trainer(
-    accelerator="auto",
+    accelerator=ACCELERATOR,
     # devices=1 if torch.cuda.is_available() else None,  # limiting got iPython runs
     max_epochs=3,
     callbacks=[TQDMProgressBar(refresh_rate=20)],
diff --git a/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
index 2aa5da16d..39bd25fda 100644
--- a/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
+++ b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
@@ -24,9 +24,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
         self.setup_method()
         create_namespace(self)
         create_kueue_resources(self)
-        self.run_mnist_raycluster_sdk_kind()
+        self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

-    def run_mnist_raycluster_sdk_kind(self):
+    @pytest.mark.nvidia_gpu
+    def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
+        self.setup_method()
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)
+
+    def run_mnist_raycluster_sdk_kind(
+        self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
         ray_image = get_ray_image()

         cluster = Cluster(
@@ -36,11 +45,11 @@ def run_mnist_raycluster_sdk_kind(self):
             num_workers=1,
             head_cpus="500m",
             head_memory=2,
-            min_cpus="500m",
-            max_cpus=1,
-            min_memory=1,
-            max_memory=2,
-            num_gpus=0,
+            worker_cpu_requests="500m",
+            worker_cpu_limits=1,
+            worker_memory_requests=1,
+            worker_memory_limits=4,
+            worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
             image=ray_image,
             write_to_file=True,
             verify_tls=False,
@@ -58,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

         cluster.details()

-        self.assert_jobsubmit_withoutlogin_kind(cluster)
+        self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

     # Assertions

-    def assert_jobsubmit_withoutlogin_kind(self, cluster):
+    def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
         ray_dashboard = cluster.cluster_dashboard_uri()
         client = RayJobClient(address=ray_dashboard, verify=False)

@@ -71,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
             runtime_env={
                 "working_dir": "./tests/e2e/",
                 "pip": "./tests/e2e/mnist_pip_requirements.txt",
+                "env_vars": {"ACCELERATOR": accelerator},
             },
+            entrypoint_num_gpus=number_of_gpus,
         )
         print(f"Submitted job with ID: {submission_id}")
         done = False
diff --git a/tests/e2e/mnist_raycluster_sdk_kind_test.py b/tests/e2e/mnist_raycluster_sdk_kind_test.py
index af5fcc1f8..356d56f98 100644
--- a/tests/e2e/mnist_raycluster_sdk_kind_test.py
+++ b/tests/e2e/mnist_raycluster_sdk_kind_test.py
@@ -25,9 +25,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
         self.setup_method()
         create_namespace(self)
         create_kueue_resources(self)
-        self.run_mnist_raycluster_sdk_kind()
+        self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

-    def run_mnist_raycluster_sdk_kind(self):
+    @pytest.mark.nvidia_gpu
+    def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
+        self.setup_method()
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)
+
+    def run_mnist_raycluster_sdk_kind(
+        self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
         ray_image = get_ray_image()

         cluster = Cluster(
@@ -40,7 +49,8 @@ def run_mnist_raycluster_sdk_kind(self):
             worker_cpu_requests="500m",
             worker_cpu_limits=1,
             worker_memory_requests=1,
-            worker_memory_limits=2,
+            worker_memory_limits=4,
+            worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
             image=ray_image,
             write_to_file=True,
             verify_tls=False,
@@ -57,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

         cluster.details()

-        self.assert_jobsubmit_withoutlogin_kind(cluster)
+        self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

     # Assertions

-    def assert_jobsubmit_withoutlogin_kind(self, cluster):
+    def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
         ray_dashboard = cluster.cluster_dashboard_uri()
         client = RayJobClient(address=ray_dashboard, verify=False)

@@ -70,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
             runtime_env={
                 "working_dir": "./tests/e2e/",
                 "pip": "./tests/e2e/mnist_pip_requirements.txt",
+                "env_vars": {"ACCELERATOR": accelerator},
             },
+            entrypoint_num_gpus=number_of_gpus,
         )
         print(f"Submitted job with ID: {submission_id}")
         done = False
diff --git a/tests/e2e/support.py b/tests/e2e/support.py
index d8a06bb70..3eb241536 100644
--- a/tests/e2e/support.py
+++ b/tests/e2e/support.py
@@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
                             "resources": [
                                 {"name": "cpu", "nominalQuota": 9},
                                 {"name": "memory", "nominalQuota": "36Gi"},
-                                {"name": "nvidia.com/gpu", "nominalQuota": 0},
+                                {"name": "nvidia.com/gpu", "nominalQuota": 1},
                             ],
                         }
                     ],
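
Taken together, the changes above wire GPU support end to end: the nvidia_gpu pytest marker selects the test (the workflow now runs pytest -m 'kind and nvidia_gpu'), worker_extended_resource_requests reserves nvidia.com/gpu on the Ray workers, entrypoint_num_gpus assigns a GPU to the submitted job, and the ACCELERATOR env var tells mnist.py which Lightning accelerator to use. Below is a minimal sketch, not part of the patch, of how these pieces combine in one test; the test name, namespace, and import paths are assumptions modeled on the existing e2e tests, while the parameters themselves come from the diff.

import pytest
from codeflare_sdk import Cluster, ClusterConfiguration
from codeflare_sdk.job import RayJobClient  # import path assumed to match the existing e2e tests


@pytest.mark.kind
@pytest.mark.nvidia_gpu  # selected by: pytest -m 'kind and nvidia_gpu'
def test_gpu_sketch():
    cluster = Cluster(
        ClusterConfiguration(
            name="gpu-sketch",    # hypothetical name
            namespace="default",  # the real tests create a dedicated namespace first
            num_workers=1,
            worker_cpu_requests="500m",
            worker_cpu_limits=1,
            worker_memory_requests=1,
            worker_memory_limits=4,
            # Reserve one NVIDIA GPU per worker, as in the updated tests:
            worker_extended_resource_requests={"nvidia.com/gpu": 1},
            write_to_file=True,
            verify_tls=False,
        )
    )
    cluster.up()
    cluster.wait_ready()

    # Give the job itself a GPU and tell mnist.py which Lightning accelerator to use.
    client = RayJobClient(address=cluster.cluster_dashboard_uri(), verify=False)
    client.submit_job(
        entrypoint="python mnist.py",
        runtime_env={
            "working_dir": "./tests/e2e/",
            "pip": "./tests/e2e/mnist_pip_requirements.txt",
            "env_vars": {"ACCELERATOR": "gpu"},
        },
        entrypoint_num_gpus=1,
    )
    cluster.down()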