Skip to content

Commit

Permalink
Refactor e2e tests to use kueue (#505)
Browse files Browse the repository at this point in the history
Co-authored-by: Shilpa Chugh <[email protected]>
  • Loading branch information
ChughShilpa and Shilpa Chugh authored Apr 15, 2024
1 parent b553e92 commit 979ff43
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 84 deletions.
21 changes: 17 additions & 4 deletions docs/e2e.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@ Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127
```
make kind-e2e
export CLUSTER_HOSTNAME=kind
make deploy -e IMG=quay.io/project-codeflare/codeflare-operator:v1.1.0
make setup-e2e
make deploy -e IMG=quay.io/project-codeflare/codeflare-operator:v1.3.0
# To run the tests locally on a KinD cluster, disable `rayDashboardOAuthEnabled` in the `codeflare-operator-config` ConfigMap and then restart the CodeFlare Operator.
```

- **(Optional)** - Create and add `sdk-user` with limited permissions to the cluster to run through the e2e tests:
Expand Down Expand Up @@ -53,9 +55,13 @@ Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127
```

- Install the latest development version of kueue
```
kubectl apply --server-side -k "github.com/opendatahub-io/kueue/config/rhoai?ref=dev"
```

- Test Phase:
- Once we have the codeflare-operator and kuberay-operator running and ready, we can run the e2e test on the codeflare-sdk repository:
- Once we have the codeflare-operator, kuberay-operator and kueue running and ready, we can run the e2e test on the codeflare-sdk repository:
```
poetry install --with test,docs
poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_test.py
Expand All @@ -67,11 +73,18 @@ Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127
- Setup Phase:
- Pull the [codeflare-operator repo](https://github.com/project-codeflare/codeflare-operator) and run the following make targets:
```
make deploy -e IMG=quay.io/project-codeflare/codeflare-operator:v1.1.0
make setup-e2e
make deploy -e IMG=quay.io/project-codeflare/codeflare-operator:v1.3.0
```

- Install the latest development version of kueue
```
kubectl apply --server-side -k "github.com/opendatahub-io/kueue/config/rhoai?ref=dev"
```

- Test Phase:
- Once we have the codeflare-operator and kuberay-operator running and ready, we can run the e2e test on the codeflare-sdk repository:
- Once we have the codeflare-operator, kuberay-operator and kueue running and ready, we can run the e2e test on the codeflare-sdk repository:
```
poetry install --with test,docs
poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_test.py
Expand Down
53 changes: 53 additions & 0 deletions tests/e2e/kueue_resources_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash
#
# Applies the Kueue resources used by the MNIST e2e tests: a ClusterQueue, a
# ResourceFlavor, and a LocalQueue (annotated as the default queue) in the
# given namespace.
#
# Usage: kueue_resources_setup.sh <namespace>
#
# Optional environment overrides:
#   name              ClusterQueue name    (default: cluster-queue-mnist)
#   flavor            ResourceFlavor name  (default: default-flavor-mnist)
#   local_queue_name  LocalQueue name      (default: local-queue-mnist)

# Stop at the first failing command so a rejected manifest is never followed
# by an "applied!" message; pipefail makes a kubectl failure fail the pipeline.
set -euo pipefail

name=${name:-cluster-queue-mnist}
flavor=${flavor:-default-flavor-mnist}
local_queue_name=${local_queue_name:-local-queue-mnist}
namespace=${1:-}

# The LocalQueue is namespaced, so refuse to run without a target namespace.
if [ -z "$namespace" ]; then
  echo "Usage: $0 <namespace>" >&2
  exit 1
fi

echo "Applying Cluster Queue"

# The heredoc delimiter is unquoted on purpose: $name / $flavor must expand.
cat <<EOF | kubectl apply --server-side -f -
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: $name
spec:
  namespaceSelector: {}
  resourceGroups:
  - coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
    flavors:
    - name: "$flavor"
      resources:
      - name: "cpu"
        nominalQuota: 9
      - name: "memory"
        nominalQuota: 36Gi
      - name: "nvidia.com/gpu"
        nominalQuota: 0
EOF
echo "Cluster Queue $name applied!"

echo "Applying Resource flavor"
cat <<EOF | kubectl apply --server-side -f -
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: $flavor
EOF
echo "Resource flavor $flavor applied!"

echo "Applying local queue"

cat <<EOF | kubectl apply --server-side -f -
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  namespace: $namespace
  name: $local_queue_name
  annotations:
    "kueue.x-k8s.io/default-queue": "true"
spec:
  clusterQueue: $name
EOF
echo "Local Queue $local_queue_name applied!"
Original file line number Diff line number Diff line change
@@ -1,38 +1,32 @@
from kubernetes import client, config
import kubernetes.client

import os
import requests

from time import sleep

import ray

from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication
from codeflare_sdk.job import RayJobClient

import pytest

from support import *

# Creates a Ray cluster, and trains the MNIST dataset using the CodeFlare SDK.
# Asserts creation of AppWrapper, RayCluster, and successful completion of the training job.
# Covers successful installation of CodeFlare-SDK
# This test creates a Ray Cluster and covers the Ray Job submission functionality on Kind Cluster


@pytest.mark.kind
@pytest.mark.openshift
class TestMNISTRayClusterSDK:
class TestRayClusterSDKKind:
def setup_method(self):
initialize_kubernetes_client(self)

def teardown_method(self):
delete_namespace(self)

def test_mnist_ray_cluster_sdk(self):
def test_mnist_ray_cluster_sdk_kind(self):
self.setup_method()
create_namespace(self)
self.run_mnist_raycluster_sdk()
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_kind()

def run_mnist_raycluster_sdk(self):
def run_mnist_raycluster_sdk_kind(self):
ray_image = get_ray_image()

cluster = Cluster(
Expand All @@ -47,31 +41,30 @@ def run_mnist_raycluster_sdk(self):
min_memory=1,
max_memory=2,
num_gpus=0,
instascale=False,
image=ray_image,
write_to_file=True,
mcad=True,
verify_tls=False,
)
)

cluster.up()
self.assert_appwrapper_exists()

cluster.status()

cluster.wait_ready()
self.assert_raycluster_exists()

cluster.status()

cluster.details()

auth_token = run_oc_command(["whoami", "--show-token=true"])
self.assert_jobsubmit_withoutlogin_kind(cluster)

# Assertions

def assert_jobsubmit_withoutlogin_kind(self, cluster):
ray_dashboard = cluster.cluster_dashboard_uri()
header = {"Authorization": f"Bearer {auth_token}"}
client = RayJobClient(address=ray_dashboard, headers=header, verify=True)
client = RayJobClient(address=ray_dashboard, verify=False)

# Submit the job
submission_id = client.submit_job(
entrypoint="python mnist.py",
runtime_env={
Expand Down Expand Up @@ -100,38 +93,8 @@ def run_mnist_raycluster_sdk(self):
self.assert_job_completion(status)

client.delete_job(submission_id)
cluster.down()

# Assertions
def assert_appwrapper_exists(self):
try:
self.custom_api.get_namespaced_custom_object(
"workload.codeflare.dev",
"v1beta1",
self.namespace,
"appwrappers",
"mnist",
)
print(
f"AppWrapper 'mnist' has been created in the namespace: '{self.namespace}'"
)
assert True
except Exception as e:
print(f"AppWrapper 'mnist' has not been created. Error: {e}")
assert False

def assert_raycluster_exists(self):
try:
self.custom_api.get_namespaced_custom_object(
"ray.io", "v1", self.namespace, "rayclusters", "mnist"
)
print(
f"RayCluster 'mnist' created successfully in the namespace: '{self.namespace}'"
)
assert True
except Exception as e:
print(f"RayCluster 'mnist' has not been created. Error: {e}")
assert False
cluster.down()

def assert_job_completion(self, status):
if status == "SUCCEEDED":
Expand Down
32 changes: 6 additions & 26 deletions tests/e2e/mnist_raycluster_sdk_oauth_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from support import *

# This test creates a Ray cluster with openshift_oauth enabled and covers Ray Job submission both with and without authentication
# This test creates a Ray Cluster and covers the Ray Job submission with authentication and without authentication functionality on Openshift Cluster


@pytest.mark.openshift
Expand All @@ -23,6 +23,7 @@ def teardown_method(self):
def test_mnist_ray_cluster_sdk_auth(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_oauth()

def run_mnist_raycluster_sdk_oauth(self):
Expand All @@ -47,15 +48,13 @@ def run_mnist_raycluster_sdk_oauth(self):
min_memory=1,
max_memory=2,
num_gpus=0,
instascale=False,
image=ray_image,
write_to_file=True,
mcad=True,
verify_tls=False,
)
)

cluster.up()
self.assert_appwrapper_exists()

cluster.status()

Expand All @@ -66,7 +65,6 @@ def run_mnist_raycluster_sdk_oauth(self):
cluster.details()

self.assert_jobsubmit_withoutLogin(cluster)

self.assert_jobsubmit_withlogin(cluster)

# Assertions
Expand Down Expand Up @@ -95,18 +93,16 @@ def assert_jobsubmit_withoutLogin(self, cluster):
assert False

def assert_jobsubmit_withlogin(self, cluster):
self.assert_appwrapper_exists()
auth_token = run_oc_command(["whoami", "--show-token=true"])
ray_dashboard = cluster.cluster_dashboard_uri()
header = {"Authorization": f"Bearer {auth_token}"}
client = RayJobClient(address=ray_dashboard, headers=header, verify=True)
client = RayJobClient(address=ray_dashboard, headers=header, verify=False)

# Submit the job
submission_id = client.submit_job(
entrypoint="python mnist.py",
runtime_env={
"working_dir": "./tests/e2e/",
"pip": "mnist_pip_requirements.txt",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
},
)
print(f"Submitted job with ID: {submission_id}")
Expand All @@ -130,24 +126,8 @@ def assert_jobsubmit_withlogin(self, cluster):
self.assert_job_completion(status)

client.delete_job(submission_id)
cluster.down()

def assert_appwrapper_exists(self):
try:
self.custom_api.get_namespaced_custom_object(
"workload.codeflare.dev",
"v1beta1",
self.namespace,
"appwrappers",
"mnist",
)
print(
f"AppWrapper 'mnist' has been created in the namespace: '{self.namespace}'"
)
assert True
except Exception as e:
print(f"AppWrapper 'mnist' has not been created. Error: {e}")
assert False
cluster.down()

def assert_job_completion(self, status):
if status == "SUCCEEDED":
Expand Down
7 changes: 7 additions & 0 deletions tests/e2e/support.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import subprocess
from kubernetes import client, config
import kubernetes.client
import subprocess


def get_ray_image():
Expand Down Expand Up @@ -45,3 +46,9 @@ def run_oc_command(args):
except subprocess.CalledProcessError as e:
print(f"Error executing 'oc {' '.join(args)}': {e}")
return None


def create_kueue_resources(self):
    """Apply the Kueue resources for the test namespace via the setup script.

    Runs ``tests/e2e/kueue_resources_setup.sh`` through ``bash`` with
    ``self.namespace`` as its only argument. The script path is relative, so
    it is resolved against the current working directory.

    Args:
        self: Object exposing a ``namespace`` attribute (the test instance).

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
    """
    script = "tests/e2e/kueue_resources_setup.sh"
    # Set executable permissions
    os.chmod(script, 0o755)
    # check=True surfaces a failed setup here instead of letting the test
    # continue and fail later with an unrelated symptom.
    subprocess.run(["bash", script, self.namespace], check=True)

0 comments on commit 979ff43

Please sign in to comment.