Skip to content

Commit

Permalink
Adjust e2e tests to use GPU
Browse files Browse the repository at this point in the history
  • Loading branch information
sutaakar committed Jul 9, 2024
1 parent 1ab5421 commit a8a0b67
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 25 deletions.
35 changes: 13 additions & 22 deletions .github/workflows/e2e_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,28 +24,18 @@ concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
cancel-in-progress: true

env:
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"

jobs:
kubernetes:

runs-on: ubuntu-20.04
runs-on: ubuntu-20.04-4core-gpu

steps:
- name: Cleanup
- name: Install yq
run: |
ls -lart
echo "Initial status:"
df -h
echo "Cleaning up resources:"
sudo swapoff -a
sudo rm -f /swapfile
sudo apt clean
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
docker rmi $(docker image ls -aq)
echo "Final status:"
df -h
sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && sudo chmod +x /usr/bin/yq
- name: Checkout code
uses: actions/checkout@v4
Expand Down Expand Up @@ -82,19 +72,23 @@ jobs:
python-version: '3.9'
cache: 'pip' # caching pip dependencies

- name: Setup NVidia GPU environment for KinD
uses: ./common/github-actions/nvidia-gpu-setup

- name: Setup and start KinD cluster
uses: ./common/github-actions/kind

- name: Install NVidia GPU operator for KinD
uses: ./common/github-actions/nvidia-gpu-operator

- name: Deploy CodeFlare stack
id: deploy
run: |
cd codeflare-operator
echo Setting up CodeFlare stack
make setup-e2e
echo Deploying CodeFlare operator
IMG="${REGISTRY_ADDRESS}"/codeflare-operator
make image-push -e IMG="${IMG}"
make deploy -e IMG="${IMG}" -e ENV="e2e"
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
cd ..
Expand All @@ -103,9 +97,6 @@ jobs:
with:
user-name: sdk-user

- name: Add kueue resources
run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml"

- name: Configure RBAC for sdk user with limited permissions
run: |
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
Expand Down
5 changes: 4 additions & 1 deletion tests/e2e/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))

print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
ACCELERATOR = os.getenv("ACCELERATOR")


class LitMNIST(LightningModule):
def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
Expand Down Expand Up @@ -149,7 +152,7 @@ def test_dataloader(self):

# Initialize a trainer
trainer = Trainer(
accelerator="auto",
accelerator=ACCELERATOR,
# devices=1 if torch.cuda.is_available() else None, # limiting for iPython runs
max_epochs=3,
callbacks=[TQDMProgressBar(refresh_rate=20)],
Expand Down
5 changes: 4 additions & 1 deletion tests/e2e/mnist_raycluster_sdk_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def run_mnist_raycluster_sdk_kind(self):
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
num_worker_gpus=0,
num_worker_gpus=1,
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand Down Expand Up @@ -71,7 +71,10 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
runtime_env={
"working_dir": "./tests/e2e/",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
"env_vars": {"ACCELERATOR": "gpu"},
},
# Using EntrypointNumGpus doesn't seem to work properly on KinD cluster with GPU, EntrypointNumCpus seems reliable
entrypoint_num_gpus=1,
)
print(f"Submitted job with ID: {submission_id}")
done = False
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/support.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
"resources": [
{"name": "cpu", "nominalQuota": 9},
{"name": "memory", "nominalQuota": "36Gi"},
{"name": "nvidia.com/gpu", "nominalQuota": 0},
{"name": "nvidia.com/gpu", "nominalQuota": 1},
],
}
],
Expand Down

0 comments on commit a8a0b67

Please sign in to comment.