From a8a0b67b84631b20e4250cd329ac86dc89f013ff Mon Sep 17 00:00:00 2001 From: Karel Suta Date: Tue, 9 Jul 2024 15:27:43 +0200 Subject: [PATCH] Adjust e2e tests to use GPU --- .github/workflows/e2e_tests.yaml | 35 ++++++++------------- tests/e2e/mnist.py | 5 ++- tests/e2e/mnist_raycluster_sdk_kind_test.py | 5 ++- tests/e2e/support.py | 2 +- 4 files changed, 22 insertions(+), 25 deletions(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index b83afb4b8..1a4842062 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -24,28 +24,18 @@ concurrency: group: ${{ github.head_ref }}-${{ github.workflow }} cancel-in-progress: true +env: + CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev" + jobs: kubernetes: - runs-on: ubuntu-20.04 + runs-on: ubuntu-20.04-4core-gpu steps: - - name: Cleanup + - name: Install yq run: | - ls -lart - echo "Initial status:" - df -h - echo "Cleaning up resources:" - sudo swapoff -a - sudo rm -f /swapfile - sudo apt clean - sudo rm -rf /usr/share/dotnet - sudo rm -rf /opt/ghc - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - docker rmi $(docker image ls -aq) - echo "Final status:" - df -h + sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && sudo chmod +x /usr/bin/yq - name: Checkout code uses: actions/checkout@v4 @@ -82,9 +72,15 @@ jobs: python-version: '3.9' cache: 'pip' # caching pip dependencies + - name: Setup NVidia GPU environment for KinD + uses: ./common/github-actions/nvidia-gpu-setup + - name: Setup and start KinD cluster uses: ./common/github-actions/kind + - name: Install NVidia GPU operator for KinD + uses: ./common/github-actions/nvidia-gpu-operator + - name: Deploy CodeFlare stack id: deploy run: | @@ -92,9 +88,7 @@ jobs: echo Setting up CodeFlare stack make setup-e2e echo Deploying CodeFlare operator - IMG="${REGISTRY_ADDRESS}"/codeflare-operator - make image-push -e IMG="${IMG}" - make deploy -e IMG="${IMG}" -e ENV="e2e" + make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager cd .. @@ -103,9 +97,6 @@ jobs: with: user-name: sdk-user - - name: Add kueue resources - run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml" - - name: Configure RBAC for sdk user with limited permissions run: | kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses diff --git a/tests/e2e/mnist.py b/tests/e2e/mnist.py index 2971d9c98..55ed91eaa 100644 --- a/tests/e2e/mnist.py +++ b/tests/e2e/mnist.py @@ -32,6 +32,9 @@ print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) print("MASTER_PORT: is ", os.getenv("MASTER_PORT")) +print("ACCELERATOR: is ", os.getenv("ACCELERATOR")) +ACCELERATOR = os.getenv("ACCELERATOR") + class LitMNIST(LightningModule): def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): @@ -149,7 +152,7 @@ def test_dataloader(self): # Initialize a trainer trainer = Trainer( - accelerator="auto", + accelerator=ACCELERATOR, # devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs max_epochs=3, callbacks=[TQDMProgressBar(refresh_rate=20)], diff --git a/tests/e2e/mnist_raycluster_sdk_kind_test.py b/tests/e2e/mnist_raycluster_sdk_kind_test.py index d85397221..37665c6f5 100644 --- a/tests/e2e/mnist_raycluster_sdk_kind_test.py +++ b/tests/e2e/mnist_raycluster_sdk_kind_test.py @@ -41,7 +41,7 @@ def run_mnist_raycluster_sdk_kind(self): worker_cpu_limits=1, worker_memory_requests=1, worker_memory_limits=2, - num_worker_gpus=0, + num_worker_gpus=1, image=ray_image, write_to_file=True, verify_tls=False, @@ -71,7 +71,10 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster): runtime_env={ "working_dir": "./tests/e2e/", "pip": "./tests/e2e/mnist_pip_requirements.txt", + "env_vars": {"ACCELERATOR": "gpu"}, }, + # Using EntrypointNumGpus doesn't seem to work properly on KinD cluster with GPU, EntrypointNumCpus seems reliable + entrypoint_num_gpus=1, ) print(f"Submitted job with ID: {submission_id}") done = False diff --git a/tests/e2e/support.py b/tests/e2e/support.py index d8a06bb70..3eb241536 100644 --- a/tests/e2e/support.py +++ b/tests/e2e/support.py @@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor): "resources": [ {"name": "cpu", "nominalQuota": 9}, {"name": "memory", "nominalQuota": "36Gi"}, - {"name": "nvidia.com/gpu", "nominalQuota": 0}, + {"name": "nvidia.com/gpu", "nominalQuota": 1}, ], } ],