From 45a4ad359cc6cd06325d5d86eb3bedb89c0e0db4 Mon Sep 17 00:00:00 2001
From: Karel Suta
Date: Tue, 9 Jul 2024 15:27:43 +0200
Subject: [PATCH] Adjust e2e tests to use GPU

---
 .github/workflows/e2e_tests.yaml              | 41 +++++++------------
 pyproject.toml                                |  3 +-
 tests/e2e/local_interactive_sdk_kind_test.py  | 16 ++++++--
 tests/e2e/mnist.py                            |  5 ++-
 .../e2e/mnist_raycluster_sdk_aw_kind_test.py  | 29 +++++++++----
 tests/e2e/mnist_raycluster_sdk_kind_test.py   | 22 +++++++---
 tests/e2e/support.py                          |  2 +-
 7 files changed, 71 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml
index b83afb4b8..4699fca19 100644
--- a/.github/workflows/e2e_tests.yaml
+++ b/.github/workflows/e2e_tests.yaml
@@ -24,29 +24,15 @@ concurrency:
   group: ${{ github.head_ref }}-${{ github.workflow }}
   cancel-in-progress: true

+env:
+  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
+
 jobs:
   kubernetes:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-20.04-4core-gpu

     steps:
-      - name: Cleanup
-        run: |
-          ls -lart
-          echo "Initial status:"
-          df -h
-          echo "Cleaning up resources:"
-          sudo swapoff -a
-          sudo rm -f /swapfile
-          sudo apt clean
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf "/usr/local/share/boost"
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-          docker rmi $(docker image ls -aq)
-          echo "Final status:"
-          df -h
-
       - name: Checkout code
         uses: actions/checkout@v4
         with:
@@ -55,8 +41,8 @@ jobs:
       - name: Checkout common repo code
         uses: actions/checkout@v4
         with:
-          repository: 'project-codeflare/codeflare-common'
-          ref: 'main'
+          repository: 'sutaakar/codeflare-common'
+          ref: 'add-user-fix'
           path: 'common'

       - name: Checkout CodeFlare operator repository
@@ -82,9 +68,15 @@ jobs:
           python-version: '3.9'
           cache: 'pip' # caching pip dependencies

+      - name: Setup NVidia GPU environment for KinD
+        uses: ./common/github-actions/nvidia-gpu-setup
+
       - name: Setup and start KinD cluster
         uses: ./common/github-actions/kind

+      - name: Install NVidia GPU operator for KinD
+        uses: ./common/github-actions/nvidia-gpu-operator
+
       - name: Deploy CodeFlare stack
         id: deploy
         run: |
@@ -92,9 +84,7 @@ jobs:
           echo Setting up CodeFlare stack
           make setup-e2e
           echo Deploying CodeFlare operator
-          IMG="${REGISTRY_ADDRESS}"/codeflare-operator
-          make image-push -e IMG="${IMG}"
-          make deploy -e IMG="${IMG}" -e ENV="e2e"
+          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
           kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
           cd ..

@@ -103,9 +93,6 @@ jobs:
         with:
           user-name: sdk-user

-      - name: Add kueue resources
-        run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml"
-
       - name: Configure RBAC for sdk user with limited permissions
         run: |
           kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
@@ -135,7 +122,7 @@ jobs:
           pip install poetry
           poetry install --with test,docs
           echo "Running e2e tests..."
-          poetry run pytest -v -s ./tests/e2e -m kind > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
+          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
         env:
           GRPC_DNS_RESOLVER: "native"

diff --git a/pyproject.toml b/pyproject.toml
index 457e6de95..be225e908 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,7 @@ filterwarnings = [
 ]
 markers = [
     "kind",
-    "openshift"
+    "openshift",
+    "nvidia_gpu"
 ]
 addopts = "--timeout=900"
diff --git a/tests/e2e/local_interactive_sdk_kind_test.py b/tests/e2e/local_interactive_sdk_kind_test.py
index 8ca0bdac9..a3eb3d429 100644
--- a/tests/e2e/local_interactive_sdk_kind_test.py
+++ b/tests/e2e/local_interactive_sdk_kind_test.py
@@ -27,7 +27,16 @@ def test_local_interactives(self):
         create_kueue_resources(self)
         self.run_local_interactives()

-    def run_local_interactives(self):
+    @pytest.mark.nvidia_gpu
+    def test_local_interactives_nvidia_gpu(self):
+        self.setup_method()
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_local_interactives(number_of_gpus=1)
+
+    def run_local_interactives(
+        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
         ray_image = get_ray_image()

         cluster_name = "test-ray-cluster-li"
@@ -43,6 +52,7 @@ def run_local_interactives(self):
             worker_cpu_limits=1,
             worker_memory_requests=1,
             worker_memory_limits=2,
+            worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
             image=ray_image,
             write_to_file=True,
             verify_tls=False,
@@ -59,7 +69,7 @@ def run_local_interactives(self):
         ray.shutdown()
         ray.init(address=cluster.local_client_url(), logging_level="DEBUG")

-        @ray.remote
+        @ray.remote(num_gpus=number_of_gpus)
         def heavy_calculation_part(num_iterations):
             result = 0.0
             for i in range(num_iterations):
@@ -68,7 +78,7 @@ def heavy_calculation_part(num_iterations):
                         result += math.sin(i) * math.cos(j) * math.tan(k)
             return result

-        @ray.remote
+        @ray.remote(num_gpus=number_of_gpus)
         def heavy_calculation(num_iterations):
             results = ray.get(
                 [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
diff --git a/tests/e2e/mnist.py b/tests/e2e/mnist.py
index 2971d9c98..55ed91eaa 100644
--- a/tests/e2e/mnist.py
+++ b/tests/e2e/mnist.py
@@ -32,6 +32,9 @@
 print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
 print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))

+print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
+ACCELERATOR = os.getenv("ACCELERATOR")
+

 class LitMNIST(LightningModule):
     def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
@@ -149,7 +152,7 @@ def test_dataloader(self):

 # Initialize a trainer
 trainer = Trainer(
-    accelerator="auto",
+    accelerator=ACCELERATOR,
     # devices=1 if torch.cuda.is_available() else None,  # limiting got iPython runs
     max_epochs=3,
     callbacks=[TQDMProgressBar(refresh_rate=20)],
diff --git a/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
index 2aa5da16d..39bd25fda 100644
--- a/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
+++ b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
@@ -24,9 +24,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
         self.setup_method()
         create_namespace(self)
         create_kueue_resources(self)
-        self.run_mnist_raycluster_sdk_kind()
+        self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

-    def run_mnist_raycluster_sdk_kind(self):
+    @pytest.mark.nvidia_gpu
+    def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
+        self.setup_method()
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)
+
+    def run_mnist_raycluster_sdk_kind(
+        self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
         ray_image = get_ray_image()

         cluster = Cluster(
@@ -36,11 +45,11 @@ def run_mnist_raycluster_sdk_kind(self):
             num_workers=1,
             head_cpus="500m",
             head_memory=2,
-            min_cpus="500m",
-            max_cpus=1,
-            min_memory=1,
-            max_memory=2,
-            num_gpus=0,
+            worker_cpu_requests="500m",
+            worker_cpu_limits=1,
+            worker_memory_requests=1,
+            worker_memory_limits=4,
+            worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
             image=ray_image,
             write_to_file=True,
             verify_tls=False,
@@ -58,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

         cluster.details()

-        self.assert_jobsubmit_withoutlogin_kind(cluster)
+        self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

     # Assertions

-    def assert_jobsubmit_withoutlogin_kind(self, cluster):
+    def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
         ray_dashboard = cluster.cluster_dashboard_uri()
         client = RayJobClient(address=ray_dashboard, verify=False)

@@ -71,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
             runtime_env={
                 "working_dir": "./tests/e2e/",
                 "pip": "./tests/e2e/mnist_pip_requirements.txt",
+                "env_vars": {"ACCELERATOR": accelerator},
             },
+            entrypoint_num_gpus=number_of_gpus,
         )
         print(f"Submitted job with ID: {submission_id}")
         done = False
diff --git a/tests/e2e/mnist_raycluster_sdk_kind_test.py b/tests/e2e/mnist_raycluster_sdk_kind_test.py
index af5fcc1f8..356d56f98 100644
--- a/tests/e2e/mnist_raycluster_sdk_kind_test.py
+++ b/tests/e2e/mnist_raycluster_sdk_kind_test.py
@@ -25,9 +25,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
         self.setup_method()
         create_namespace(self)
         create_kueue_resources(self)
-        self.run_mnist_raycluster_sdk_kind()
+        self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

-    def run_mnist_raycluster_sdk_kind(self):
+    @pytest.mark.nvidia_gpu
+    def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
+        self.setup_method()
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)
+
+    def run_mnist_raycluster_sdk_kind(
+        self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
         ray_image = get_ray_image()

         cluster = Cluster(
@@ -40,7 +49,8 @@ def run_mnist_raycluster_sdk_kind(self):
             worker_cpu_requests="500m",
             worker_cpu_limits=1,
             worker_memory_requests=1,
-            worker_memory_limits=2,
+            worker_memory_limits=4,
+            worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
             image=ray_image,
             write_to_file=True,
             verify_tls=False,
@@ -57,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

         cluster.details()

-        self.assert_jobsubmit_withoutlogin_kind(cluster)
+        self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

     # Assertions

-    def assert_jobsubmit_withoutlogin_kind(self, cluster):
+    def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
         ray_dashboard = cluster.cluster_dashboard_uri()
         client = RayJobClient(address=ray_dashboard, verify=False)

@@ -70,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
             runtime_env={
                 "working_dir": "./tests/e2e/",
                 "pip": "./tests/e2e/mnist_pip_requirements.txt",
+                "env_vars": {"ACCELERATOR": accelerator},
             },
+            entrypoint_num_gpus=number_of_gpus,
         )
         print(f"Submitted job with ID: {submission_id}")
         done = False
diff --git a/tests/e2e/support.py b/tests/e2e/support.py
index d8a06bb70..3eb241536 100644
--- a/tests/e2e/support.py
+++ b/tests/e2e/support.py
@@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
                             "resources": [
                                 {"name": "cpu", "nominalQuota": 9},
                                 {"name": "memory", "nominalQuota": "36Gi"},
-                                {"name": "nvidia.com/gpu", "nominalQuota": 0},
+                                {"name": "nvidia.com/gpu", "nominalQuota": 1},
                             ],
                         }
                     ],
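
Taken together, the changes above wire GPU support end to end: the nvidia_gpu pytest marker selects the test (the workflow now runs pytest -m 'kind and nvidia_gpu'), worker_extended_resource_requests reserves nvidia.com/gpu on the Ray workers, entrypoint_num_gpus assigns a GPU to the submitted job, and the ACCELERATOR env var tells mnist.py which Lightning accelerator to use. Below is a minimal sketch, not part of the patch, of how these pieces combine in one test; the test name, namespace, and import paths are assumptions modeled on the existing e2e tests, while the parameters themselves come from the diff.

import pytest
from codeflare_sdk import Cluster, ClusterConfiguration
from codeflare_sdk.job import RayJobClient  # import path assumed to match the existing e2e tests


@pytest.mark.kind
@pytest.mark.nvidia_gpu  # selected by: pytest -m 'kind and nvidia_gpu'
def test_gpu_sketch():
    cluster = Cluster(
        ClusterConfiguration(
            name="gpu-sketch",    # hypothetical name
            namespace="default",  # the real tests create a dedicated namespace first
            num_workers=1,
            worker_cpu_requests="500m",
            worker_cpu_limits=1,
            worker_memory_requests=1,
            worker_memory_limits=4,
            # Reserve one NVIDIA GPU per worker, as in the updated tests:
            worker_extended_resource_requests={"nvidia.com/gpu": 1},
            write_to_file=True,
            verify_tls=False,
        )
    )
    cluster.up()
    cluster.wait_ready()

    # Give the job itself a GPU and tell mnist.py which Lightning accelerator to use.
    client = RayJobClient(address=cluster.cluster_dashboard_uri(), verify=False)
    client.submit_job(
        entrypoint="python mnist.py",
        runtime_env={
            "working_dir": "./tests/e2e/",
            "pip": "./tests/e2e/mnist_pip_requirements.txt",
            "env_vars": {"ACCELERATOR": "gpu"},
        },
        entrypoint_num_gpus=1,
    )
    cluster.down()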