Skip to content

Commit

Permalink
Adjust e2e tests to use GPU
Browse files Browse the repository at this point in the history
  • Loading branch information
sutaakar committed Jul 10, 2024
1 parent 5ce0b2c commit 18ead0f
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 45 deletions.
37 changes: 12 additions & 25 deletions .github/workflows/e2e_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,29 +24,15 @@ concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
cancel-in-progress: true

env:
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"

jobs:
kubernetes:

runs-on: ubuntu-20.04
runs-on: ubuntu-20.04-4core-gpu

steps:
- name: Cleanup
run: |
ls -lart
echo "Initial status:"
df -h
echo "Cleaning up resources:"
sudo swapoff -a
sudo rm -f /swapfile
sudo apt clean
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
docker rmi $(docker image ls -aq)
echo "Final status:"
df -h
- name: Checkout code
uses: actions/checkout@v4
with:
Expand Down Expand Up @@ -82,19 +68,23 @@ jobs:
python-version: '3.9'
cache: 'pip' # caching pip dependencies

- name: Setup NVidia GPU environment for KinD
uses: ./common/github-actions/nvidia-gpu-setup

- name: Setup and start KinD cluster
uses: ./common/github-actions/kind

- name: Install NVidia GPU operator for KinD
uses: ./common/github-actions/nvidia-gpu-operator

- name: Deploy CodeFlare stack
id: deploy
run: |
cd codeflare-operator
echo Setting up CodeFlare stack
make setup-e2e
echo Deploying CodeFlare operator
IMG="${REGISTRY_ADDRESS}"/codeflare-operator
make image-push -e IMG="${IMG}"
make deploy -e IMG="${IMG}" -e ENV="e2e"
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
cd ..
Expand All @@ -103,9 +93,6 @@ jobs:
with:
user-name: sdk-user

- name: Add kueue resources
run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml"

- name: Configure RBAC for sdk user with limited permissions
run: |
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
Expand Down Expand Up @@ -135,7 +122,7 @@ jobs:
pip install poetry
poetry install --with test,docs
echo "Running e2e tests..."
poetry run pytest -v -s ./tests/e2e -m kind > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
env:
GRPC_DNS_RESOLVER: "native"

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ filterwarnings = [
]
markers = [
"kind",
"openshift"
"openshift",
"nvidia_gpu"
]
addopts = "--timeout=900"
16 changes: 13 additions & 3 deletions tests/e2e/local_interactive_sdk_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,16 @@ def test_local_interactives(self):
create_kueue_resources(self)
self.run_local_interactives()

def run_local_interactives(self):
# GPU variant of the local-interactive test: performs the same setup calls
# (setup_method, create_namespace, create_kueue_resources) and then runs
# run_local_interactives with number_of_gpus=1. Tagged with the nvidia_gpu
# pytest marker.
# NOTE(review): the method name mentions "mnist_ray_cluster" although the body
# runs the local-interactive flow; it looks copied from the MNIST tests —
# confirm whether a rename (e.g. test_local_interactives_nvidia_gpu) is wanted.
@pytest.mark.nvidia_gpu
def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_local_interactives(number_of_gpus=1)

def run_local_interactives(
self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
):
ray_image = get_ray_image()

cluster_name = "test-ray-cluster-li"
Expand All @@ -43,6 +52,7 @@ def run_local_interactives(self):
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand All @@ -59,7 +69,7 @@ def run_local_interactives(self):
ray.shutdown()
ray.init(address=cluster.local_client_url(), logging_level="DEBUG")

@ray.remote
@ray.remote(num_gpus=number_of_gpus/2)
def heavy_calculation_part(num_iterations):
result = 0.0
for i in range(num_iterations):
Expand All @@ -68,7 +78,7 @@ def heavy_calculation_part(num_iterations):
result += math.sin(i) * math.cos(j) * math.tan(k)
return result

@ray.remote
@ray.remote(num_gpus=number_of_gpus/2)
def heavy_calculation(num_iterations):
results = ray.get(
[heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
Expand Down
5 changes: 4 additions & 1 deletion tests/e2e/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))

# Lightning accelerator passed to the Trainer below, read from the ACCELERATOR
# environment variable. Falls back to "auto" (the value the Trainer used before
# this variable was introduced) so the script still works when it is launched
# without ACCELERATOR set; os.getenv would otherwise return None.
ACCELERATOR = os.getenv("ACCELERATOR", "auto")
print("ACCELERATOR: is ", ACCELERATOR)


class LitMNIST(LightningModule):
def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
Expand Down Expand Up @@ -149,7 +152,7 @@ def test_dataloader(self):

# Initialize a trainer
trainer = Trainer(
accelerator="auto",
accelerator=ACCELERATOR,
# devices=1 if torch.cuda.is_available() else None, # limiting for iPython runs
max_epochs=3,
callbacks=[TQDMProgressBar(refresh_rate=20)],
Expand Down
29 changes: 20 additions & 9 deletions tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_kind()
self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

def run_mnist_raycluster_sdk_kind(self):
# GPU variant of the MNIST RayCluster test: performs the same setup calls
# (setup_method, create_namespace, create_kueue_resources) and then runs
# run_mnist_raycluster_sdk_kind with accelerator="gpu" and number_of_gpus=1.
# Tagged with the nvidia_gpu pytest marker.
@pytest.mark.nvidia_gpu
def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)

def run_mnist_raycluster_sdk_kind(
self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
):
ray_image = get_ray_image()

cluster = Cluster(
Expand All @@ -36,11 +45,11 @@ def run_mnist_raycluster_sdk_kind(self):
num_workers=1,
head_cpus="500m",
head_memory=2,
min_cpus="500m",
max_cpus=1,
min_memory=1,
max_memory=2,
num_gpus=0,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=4,
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand All @@ -58,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

cluster.details()

self.assert_jobsubmit_withoutlogin_kind(cluster)
self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

# Assertions

def assert_jobsubmit_withoutlogin_kind(self, cluster):
def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
ray_dashboard = cluster.cluster_dashboard_uri()
client = RayJobClient(address=ray_dashboard, verify=False)

Expand All @@ -71,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
runtime_env={
"working_dir": "./tests/e2e/",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
"env_vars": {"ACCELERATOR": accelerator},
},
entrypoint_num_gpus=number_of_gpus,
)
print(f"Submitted job with ID: {submission_id}")
done = False
Expand Down
22 changes: 17 additions & 5 deletions tests/e2e/mnist_raycluster_sdk_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_kind()
self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

def run_mnist_raycluster_sdk_kind(self):
# GPU variant of the MNIST RayCluster test: performs the same setup calls
# (setup_method, create_namespace, create_kueue_resources) and then runs
# run_mnist_raycluster_sdk_kind with accelerator="gpu" and number_of_gpus=1.
# Tagged with the nvidia_gpu pytest marker.
@pytest.mark.nvidia_gpu
def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)

def run_mnist_raycluster_sdk_kind(
self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
):
ray_image = get_ray_image()

cluster = Cluster(
Expand All @@ -40,7 +49,8 @@ def run_mnist_raycluster_sdk_kind(self):
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
worker_memory_limits=4,
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand All @@ -57,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

cluster.details()

self.assert_jobsubmit_withoutlogin_kind(cluster)
self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

# Assertions

def assert_jobsubmit_withoutlogin_kind(self, cluster):
def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
ray_dashboard = cluster.cluster_dashboard_uri()
client = RayJobClient(address=ray_dashboard, verify=False)

Expand All @@ -70,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
runtime_env={
"working_dir": "./tests/e2e/",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
"env_vars": {"ACCELERATOR": accelerator},
},
entrypoint_num_gpus=number_of_gpus,
)
print(f"Submitted job with ID: {submission_id}")
done = False
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/support.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
"resources": [
{"name": "cpu", "nominalQuota": 9},
{"name": "memory", "nominalQuota": "36Gi"},
{"name": "nvidia.com/gpu", "nominalQuota": 0},
{"name": "nvidia.com/gpu", "nominalQuota": 1},
],
}
],
Expand Down

0 comments on commit 18ead0f

Please sign in to comment.