Skip to content

Commit

Permalink
Add e2e tests using GPU to execute current test scenarios
Browse files Browse the repository at this point in the history
  • Loading branch information
sutaakar committed Jul 11, 2024
1 parent 130e003 commit 70c8707
Show file tree
Hide file tree
Showing 8 changed files with 73 additions and 43 deletions.
30 changes: 8 additions & 22 deletions .github/workflows/e2e_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,26 +30,9 @@ env:
jobs:
kubernetes:

runs-on: ubuntu-20.04
runs-on: ubuntu-20.04-4core-gpu

steps:
- name: Cleanup
run: |
ls -lart
echo "Initial status:"
df -h
echo "Cleaning up resources:"
sudo swapoff -a
sudo rm -f /swapfile
sudo apt clean
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
docker rmi $(docker image ls -aq)
echo "Final status:"
df -h
- name: Checkout code
uses: actions/checkout@v4
with:
Expand Down Expand Up @@ -85,9 +68,15 @@ jobs:
python-version: '3.9'
cache: 'pip' # caching pip dependencies

- name: Setup NVidia GPU environment for KinD
uses: ./common/github-actions/nvidia-gpu-setup

- name: Setup and start KinD cluster
uses: ./common/github-actions/kind

- name: Install NVidia GPU operator for KinD
uses: ./common/github-actions/nvidia-gpu-operator

- name: Deploy CodeFlare stack
id: deploy
run: |
Expand All @@ -104,9 +93,6 @@ jobs:
with:
user-name: sdk-user

- name: Add kueue resources
run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml"

- name: Configure RBAC for sdk user with limited permissions
run: |
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
Expand Down Expand Up @@ -136,7 +122,7 @@ jobs:
pip install poetry
poetry install --with test,docs
echo "Running e2e tests..."
poetry run pytest -v -s ./tests/e2e -m kind > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
env:
GRPC_DNS_RESOLVER: "native"

Expand Down
9 changes: 8 additions & 1 deletion docs/e2e.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
## On KinD clusters
Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127.0.0.1 kind`. This will map your localhost IP address to the KinD cluster's hostname. This is already performed on [GitHub Actions](https://github.com/project-codeflare/codeflare-common/blob/1edd775e2d4088a5a0bfddafb06ff3a773231c08/github-actions/kind/action.yml#L70-L72)

If the system you run on contains an NVIDIA GPU, then you can enable GPU support in KinD; this will also allow you to run the GPU tests.
To enable GPU support on KinD, follow [these instructions](https://www.substratus.ai/blog/kind-with-gpus).

- Setup Phase:
- Pull the [codeflare-operator repo](https://github.com/project-codeflare/codeflare-operator) and run the following make targets:
```
Expand Down Expand Up @@ -66,7 +69,11 @@ Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127
poetry install --with test,docs
poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_test.py
```

- If the cluster doesn't have NVIDIA GPU support, then the NVIDIA GPU tests need to be disabled by providing the proper marker:
```
poetry install --with test,docs
poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_test.py -m 'kind and not nvidia_gpu'
```


## On OpenShift clusters
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ filterwarnings = [
]
markers = [
"kind",
"openshift"
"openshift",
"nvidia_gpu"
]
addopts = "--timeout=900"
16 changes: 13 additions & 3 deletions tests/e2e/local_interactive_sdk_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,16 @@ def test_local_interactives(self):
create_kueue_resources(self)
self.run_local_interactives()

def run_local_interactives(self):
@pytest.mark.nvidia_gpu
def test_local_interactives_nvidia_gpu(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_local_interactives(number_of_gpus=1)

def run_local_interactives(
self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
):
ray_image = get_ray_image()

cluster_name = "test-ray-cluster-li"
Expand All @@ -43,6 +52,7 @@ def run_local_interactives(self):
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand All @@ -59,7 +69,7 @@ def run_local_interactives(self):
ray.shutdown()
ray.init(address=cluster.local_client_url(), logging_level="DEBUG")

@ray.remote
@ray.remote(num_gpus=number_of_gpus / 2)
def heavy_calculation_part(num_iterations):
result = 0.0
for i in range(num_iterations):
Expand All @@ -68,7 +78,7 @@ def heavy_calculation_part(num_iterations):
result += math.sin(i) * math.cos(j) * math.tan(k)
return result

@ray.remote
@ray.remote(num_gpus=number_of_gpus / 2)
def heavy_calculation(num_iterations):
results = ray.get(
[heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
Expand Down
5 changes: 4 additions & 1 deletion tests/e2e/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))

print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
ACCELERATOR = os.getenv("ACCELERATOR")


class LitMNIST(LightningModule):
def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
Expand Down Expand Up @@ -149,7 +152,7 @@ def test_dataloader(self):

# Initialize a trainer
trainer = Trainer(
accelerator="auto",
accelerator=ACCELERATOR,
# devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs
max_epochs=3,
callbacks=[TQDMProgressBar(refresh_rate=20)],
Expand Down
29 changes: 20 additions & 9 deletions tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_kind()
self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

def run_mnist_raycluster_sdk_kind(self):
@pytest.mark.nvidia_gpu
def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)

def run_mnist_raycluster_sdk_kind(
self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
):
ray_image = get_ray_image()

cluster = Cluster(
Expand All @@ -36,11 +45,11 @@ def run_mnist_raycluster_sdk_kind(self):
num_workers=1,
head_cpus="500m",
head_memory=2,
min_cpus="500m",
max_cpus=1,
min_memory=1,
max_memory=2,
num_gpus=0,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=4,
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand All @@ -58,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

cluster.details()

self.assert_jobsubmit_withoutlogin_kind(cluster)
self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

# Assertions

def assert_jobsubmit_withoutlogin_kind(self, cluster):
def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
ray_dashboard = cluster.cluster_dashboard_uri()
client = RayJobClient(address=ray_dashboard, verify=False)

Expand All @@ -71,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
runtime_env={
"working_dir": "./tests/e2e/",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
"env_vars": {"ACCELERATOR": accelerator},
},
entrypoint_num_gpus=number_of_gpus,
)
print(f"Submitted job with ID: {submission_id}")
done = False
Expand Down
22 changes: 17 additions & 5 deletions tests/e2e/mnist_raycluster_sdk_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_kind()
self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

def run_mnist_raycluster_sdk_kind(self):
@pytest.mark.nvidia_gpu
def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)

def run_mnist_raycluster_sdk_kind(
self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
):
ray_image = get_ray_image()

cluster = Cluster(
Expand All @@ -40,7 +49,8 @@ def run_mnist_raycluster_sdk_kind(self):
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
worker_memory_limits=4,
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand All @@ -57,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

cluster.details()

self.assert_jobsubmit_withoutlogin_kind(cluster)
self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

# Assertions

def assert_jobsubmit_withoutlogin_kind(self, cluster):
def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
ray_dashboard = cluster.cluster_dashboard_uri()
client = RayJobClient(address=ray_dashboard, verify=False)

Expand All @@ -70,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
runtime_env={
"working_dir": "./tests/e2e/",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
"env_vars": {"ACCELERATOR": accelerator},
},
entrypoint_num_gpus=number_of_gpus,
)
print(f"Submitted job with ID: {submission_id}")
done = False
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/support.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
"resources": [
{"name": "cpu", "nominalQuota": 9},
{"name": "memory", "nominalQuota": "36Gi"},
{"name": "nvidia.com/gpu", "nominalQuota": 0},
{"name": "nvidia.com/gpu", "nominalQuota": 1},
],
}
],
Expand Down

0 comments on commit 70c8707

Please sign in to comment.