Skip to content

Commit

Permalink
Adjust e2e tests to use GPU
Browse files Browse the repository at this point in the history
  • Loading branch information
sutaakar committed Jul 10, 2024
1 parent 5ce0b2c commit 7f7c7c4
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 29 deletions.
39 changes: 13 additions & 26 deletions .github/workflows/e2e_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,29 +24,15 @@ concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
cancel-in-progress: true

env:
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"

jobs:
kubernetes:

runs-on: ubuntu-20.04
runs-on: ubuntu-20.04-4core-gpu

steps:
- name: Cleanup
run: |
ls -lart
echo "Initial status:"
df -h
echo "Cleaning up resources:"
sudo swapoff -a
sudo rm -f /swapfile
sudo apt clean
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
docker rmi $(docker image ls -aq)
echo "Final status:"
df -h
- name: Checkout code
uses: actions/checkout@v4
with:
Expand All @@ -55,8 +41,8 @@ jobs:
- name: Checkout common repo code
uses: actions/checkout@v4
with:
repository: 'project-codeflare/codeflare-common'
ref: 'main'
repository: 'sutaakar/codeflare-common'
ref: 'add-user-fix'
path: 'common'

- name: Checkout CodeFlare operator repository
Expand All @@ -82,19 +68,23 @@ jobs:
python-version: '3.9'
cache: 'pip' # caching pip dependencies

- name: Setup NVidia GPU environment for KinD
uses: ./common/github-actions/nvidia-gpu-setup

- name: Setup and start KinD cluster
uses: ./common/github-actions/kind

- name: Install NVidia GPU operator for KinD
uses: ./common/github-actions/nvidia-gpu-operator

- name: Deploy CodeFlare stack
id: deploy
run: |
cd codeflare-operator
echo Setting up CodeFlare stack
make setup-e2e
echo Deploying CodeFlare operator
IMG="${REGISTRY_ADDRESS}"/codeflare-operator
make image-push -e IMG="${IMG}"
make deploy -e IMG="${IMG}" -e ENV="e2e"
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
cd ..
Expand All @@ -103,9 +93,6 @@ jobs:
with:
user-name: sdk-user

- name: Add kueue resources
run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml"

- name: Configure RBAC for sdk user with limited permissions
run: |
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
Expand Down
5 changes: 4 additions & 1 deletion tests/e2e/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))

print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
ACCELERATOR = os.getenv("ACCELERATOR")


class LitMNIST(LightningModule):
def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
Expand Down Expand Up @@ -149,7 +152,7 @@ def test_dataloader(self):

# Initialize a trainer
trainer = Trainer(
accelerator="auto",
accelerator=ACCELERATOR,
# devices=1 if torch.cuda.is_available() else None, # limiting for iPython runs
max_epochs=3,
callbacks=[TQDMProgressBar(refresh_rate=20)],
Expand Down
6 changes: 5 additions & 1 deletion tests/e2e/mnist_raycluster_sdk_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ def run_mnist_raycluster_sdk_kind(self):
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
worker_memory_limits=4,
worker_extended_resource_requests={"nvidia.com/gpu": 1},
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand Down Expand Up @@ -70,7 +71,10 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
runtime_env={
"working_dir": "./tests/e2e/",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
"env_vars": {"ACCELERATOR": "gpu"},
},
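# Using EntrypointNumGpus doesn't seem to work properly on a KinD cluster with GPU; EntrypointNumCpus seems reliable.
# NOTE(review): the call nevertheless passes entrypoint_num_gpus=1 on the next line - confirm which setting is intended.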
entrypoint_num_gpus=1,
)
print(f"Submitted job with ID: {submission_id}")
done = False
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/support.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
"resources": [
{"name": "cpu", "nominalQuota": 9},
{"name": "memory", "nominalQuota": "36Gi"},
{"name": "nvidia.com/gpu", "nominalQuota": 0},
{"name": "nvidia.com/gpu", "nominalQuota": 1},
],
}
],
Expand Down

0 comments on commit 7f7c7c4

Please sign in to comment.