diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml
index b83afb4b8..2c05687f6 100644
--- a/.github/workflows/e2e_tests.yaml
+++ b/.github/workflows/e2e_tests.yaml
@@ -24,29 +24,15 @@ concurrency:
   group: ${{ github.head_ref }}-${{ github.workflow }}
   cancel-in-progress: true
 
+env:
+  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
+
 jobs:
   kubernetes:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-20.04-4core-gpu
 
     steps:
-      - name: Cleanup
-        run: |
-          ls -lart
-          echo "Initial status:"
-          df -h
-          echo "Cleaning up resources:"
-          sudo swapoff -a
-          sudo rm -f /swapfile
-          sudo apt clean
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf "/usr/local/share/boost"
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-          docker rmi $(docker image ls -aq)
-          echo "Final status:"
-          df -h
-
       - name: Checkout code
         uses: actions/checkout@v4
         with:
@@ -55,8 +41,8 @@ jobs:
       - name: Checkout common repo code
         uses: actions/checkout@v4
         with:
-          repository: 'project-codeflare/codeflare-common'
-          ref: 'main'
+          repository: 'sutaakar/codeflare-common'
+          ref: 'add-user-fix'
           path: 'common'
 
       - name: Checkout CodeFlare operator repository
@@ -82,9 +68,15 @@ jobs:
           python-version: '3.9'
           cache: 'pip' # caching pip dependencies
 
+      - name: Setup NVidia GPU environment for KinD
+        uses: ./common/github-actions/nvidia-gpu-setup
+
       - name: Setup and start KinD cluster
         uses: ./common/github-actions/kind
 
+      - name: Install NVidia GPU operator for KinD
+        uses: ./common/github-actions/nvidia-gpu-operator
+
       - name: Deploy CodeFlare stack
         id: deploy
         run: |
@@ -92,9 +84,7 @@ jobs:
           echo Setting up CodeFlare stack
           make setup-e2e
           echo Deploying CodeFlare operator
-          IMG="${REGISTRY_ADDRESS}"/codeflare-operator
-          make image-push -e IMG="${IMG}"
-          make deploy -e IMG="${IMG}" -e ENV="e2e"
+          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
           kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
           cd ..
@@ -103,9 +93,6 @@ jobs:
         with:
           user-name: sdk-user
 
-      - name: Add kueue resources
-        run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml"
-
       - name: Configure RBAC for sdk user with limited permissions
         run: |
           kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
diff --git a/tests/e2e/mnist.py b/tests/e2e/mnist.py
index 2971d9c98..55ed91eaa 100644
--- a/tests/e2e/mnist.py
+++ b/tests/e2e/mnist.py
@@ -32,6 +32,9 @@
 print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
 print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))
 
+print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
+ACCELERATOR = os.getenv("ACCELERATOR")
+
 
 class LitMNIST(LightningModule):
     def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
@@ -149,7 +152,7 @@ def test_dataloader(self):
 
 # Initialize a trainer
 trainer = Trainer(
-    accelerator="auto",
+    accelerator=ACCELERATOR,
     # devices=1 if torch.cuda.is_available() else None,  # limiting got iPython runs
     max_epochs=3,
     callbacks=[TQDMProgressBar(refresh_rate=20)],
diff --git a/tests/e2e/mnist_raycluster_sdk_kind_test.py b/tests/e2e/mnist_raycluster_sdk_kind_test.py
index af5fcc1f8..0cec16a10 100644
--- a/tests/e2e/mnist_raycluster_sdk_kind_test.py
+++ b/tests/e2e/mnist_raycluster_sdk_kind_test.py
@@ -25,9 +25,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
         self.setup_method()
         create_namespace(self)
         create_kueue_resources(self)
-        self.run_mnist_raycluster_sdk_kind()
+        self.run_mnist_raycluster_sdk_kind(accelerator="cpu")
 
-    def run_mnist_raycluster_sdk_kind(self):
+    @pytest.mark.nvidia_gpu
+    def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
+        self.setup_method()
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)
+
+    def run_mnist_raycluster_sdk_kind(
+        self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
         ray_image = get_ray_image()
 
         cluster = Cluster(
@@ -40,7 +49,8 @@ def run_mnist_raycluster_sdk_kind(self):
                 worker_cpu_requests="500m",
                 worker_cpu_limits=1,
                 worker_memory_requests=1,
-                worker_memory_limits=2,
+                worker_memory_limits=4,
+                worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
                 image=ray_image,
                 write_to_file=True,
                 verify_tls=False,
@@ -57,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):
 
         cluster.details()
 
-        self.assert_jobsubmit_withoutlogin_kind(cluster)
+        self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)
 
     # Assertions
 
-    def assert_jobsubmit_withoutlogin_kind(self, cluster):
+    def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
         ray_dashboard = cluster.cluster_dashboard_uri()
         client = RayJobClient(address=ray_dashboard, verify=False)
 
@@ -70,7 +80,10 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
             runtime_env={
                 "working_dir": "./tests/e2e/",
                 "pip": "./tests/e2e/mnist_pip_requirements.txt",
+                "env_vars": {"ACCELERATOR": accelerator},
             },
+            # Using EntrypointNumGpus doesn't seem to work properly on KinD cluster with GPU, EntrypointNumCpus seems reliable
+            entrypoint_num_gpus=number_of_gpus,
         )
         print(f"Submitted job with ID: {submission_id}")
         done = False
diff --git a/tests/e2e/support.py b/tests/e2e/support.py
index d8a06bb70..3eb241536 100644
--- a/tests/e2e/support.py
+++ b/tests/e2e/support.py
@@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
                             "resources": [
                                 {"name": "cpu", "nominalQuota": 9},
                                 {"name": "memory", "nominalQuota": "36Gi"},
-                                {"name": "nvidia.com/gpu", "nominalQuota": 0},
+                                {"name": "nvidia.com/gpu", "nominalQuota": 1},
                             ],
                         }
                     ],