-
Notifications
You must be signed in to change notification settings - Fork 109
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ca2fab1
commit d6ccf07
Showing
10 changed files
with
1,043 additions
and
220 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#!/usr/bin/env python3 | ||
# Copyright (c) Meta Platforms, Inc. and affiliates. | ||
# All rights reserved. | ||
# | ||
# This source code is licensed under the BSD-style license found in the | ||
# LICENSE file in the root directory of this source tree. | ||
|
||
from torchx.components.dist import ddp | ||
from torchx.runner import get_runner | ||
from integ_test_utils import ( | ||
build_images, | ||
BuildInfo, | ||
push_images, | ||
MissingEnvError | ||
) | ||
import argparse | ||
from torchx.specs import AppState | ||
from torchx.util.types import none_throws | ||
|
||
def argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the Kueue integration test runner.

    Returns:
        A parser accepting ``--container_repo`` (a string, ``None`` when
        omitted) and the ``--dryrun`` flag (``False`` when omitted).
    """
    parser = argparse.ArgumentParser(
        description="Kueue dist trainer integration test runner."
    )
    parser.add_argument(
        "--container_repo",
        type=str,
        help="Container repository the built images are pushed to",
    )
    parser.add_argument(
        "--dryrun",
        action="store_true",
        help="Does not actually submit the app, just prints the scheduler request",
    )
    return parser
|
||
def build_and_push_image(container_repo: str) -> BuildInfo:
    """Build the test images and push them to ``container_repo``.

    Args:
        container_repo: Forwarded to ``push_images`` as its
            ``container_repo`` keyword argument.

    Returns:
        The ``BuildInfo`` produced by ``build_images``.
    """
    build_info = build_images()
    push_images(build_info, container_repo=container_repo)
    return build_info
|
||
def run_kueue_test(dryrun: bool = False) -> None:
    """Build the image and run (or dry-run) a DDP app on the Kueue scheduler.

    The container repository is read from the command line via
    ``argparser()``.

    Args:
        dryrun: When true, only print the scheduler request returned by
            ``runner.dryrun`` instead of submitting the app.

    Raises:
        RuntimeError: If the submitted app finishes in any state other than
            ``AppState.SUCCEEDED``.
    """
    # Gather args & build image
    print("Building image")
    args = argparser().parse_args()
    build = build_and_push_image(args.container_repo)
    image = build.torchx_image

    # Create the app definition
    app = ddp(
        name="kueue-test",
        image=image,
        m="torchx.examples.apps.lightning.train",
        cpu=1,
        memMB=4000,
        j="1x1",
    )

    # Pass config variables
    cfg = {"namespace": "torchx-dev", "local_queue": "torchx-local-queue"}
    print("Submitting job")

    # Use the runner as a context manager so it is closed on every exit path,
    # including when the job fails.
    with get_runner("kueue") as runner:
        if dryrun:
            dryrun_info = runner.dryrun(app, "kueue", cfg)
            print(f"Dryrun info: {dryrun_info}")
            return

        app_handle = runner.run(app, "kueue", cfg)
        print(app_handle)
        runner.wait(app_handle)
        final_status = runner.status(app_handle)
        print(f"Final status: {final_status}")
        if none_throws(final_status).state != AppState.SUCCEEDED:
            raise RuntimeError(f"Dist app failed with status: {final_status}")
|
||
def main() -> None:
    """Parse the command line and run the Kueue integration test.

    A ``MissingEnvError`` raised by the test run is reported on stdout and
    swallowed rather than propagated.
    """
    args = argparser().parse_args()

    try:
        run_kueue_test(args.dryrun)
    except MissingEnvError:
        print("Skip running tests, executed only docker build step")
|
||
if __name__ == "__main__": | ||
main() | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
#!/bin/bash | ||
|
||
# Abort on the first failing command (-e), treat unset variables as errors
# (-u) and echo every command as it runs (-x).
set -eux

# Recreate the minikube cluster: two nodes on the docker driver, with the
# registry addon enabled.
minikube delete
minikube start --driver=docker --cpus=max --memory=max --nodes=2
minikube addons enable registry

# setup multi node volumes
# https://github.com/kubernetes/minikube/issues/12360#issuecomment-1430243861
for addon in storage-provisioner default-storageclass; do
  minikube addons disable "${addon}"
done
for addon in volumesnapshots csi-hostpath-driver; do
  minikube addons enable "${addon}"
done
# Mark the CSI hostpath storage class as the cluster default.
kubectl patch storageclass csi-hostpath-sc \
  -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
|
||
# create namespace
kubectl create namespace torchx-dev

# install Kueue and Kueue related resources
# The release defaults to v0.6.0; export KUEUE_VERSION to install another one.
VERSION="${KUEUE_VERSION:-v0.6.0}"
kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/${VERSION}/manifests.yaml"
|
||
# Function to check if the kueue manager pod is running.
# Prints the third column (STATUS) of each `kubectl get pods` line in the
# kueue-system namespace that mentions kueue-controller-manager.
check_pod_status() {
  # Declare and assign separately so `local` does not mask the command's
  # exit status (ShellCheck SC2155).
  local status
  status=$(kubectl get pods -n kueue-system | awk '/kueue-controller-manager/ {print $3}')
  echo "$status"
}
|
||
# Wait until the pod is in the 'Running' state.
# Polls every 5 seconds and gives up after KUEUE_WAIT_TIMEOUT seconds
# (default 600) so a pod that never starts cannot hang the script forever.
echo "Waiting for kueue-controller-manager pod to be running in the kueue-system namespace..."
manager_deadline=$((SECONDS + ${KUEUE_WAIT_TIMEOUT:-600}))
while [[ $(check_pod_status) != "Running" ]]; do
  if ((SECONDS >= manager_deadline)); then
    echo "Timed out waiting for kueue-controller-manager pod to be running" >&2
    exit 1
  fi
  sleep 5
done
# Function to check if the service exists.
# Prints kubectl's listing line for kueue-webhook-service; kubectl's error
# output is discarded, so nothing is printed when the lookup fails.
check_service_existence() {
  kubectl --namespace kueue-system get service kueue-webhook-service \
    --no-headers 2> /dev/null
}
|
||
# Wait until the service exists.
# Polls every 5 seconds and gives up after KUEUE_WAIT_TIMEOUT seconds
# (default 600) so a missing service cannot hang the script forever.
echo "Waiting for kueue-webhook-service to exist in the kueue-system namespace..."
service_deadline=$((SECONDS + ${KUEUE_WAIT_TIMEOUT:-600}))
while [[ -z $(check_service_existence) ]]; do
  if ((SECONDS >= service_deadline)); then
    echo "Timed out waiting for kueue-webhook-service to exist" >&2
    exit 1
  fi
  sleep 5
done
echo "kueue-webhook-service exists in the kueue-system namespace."
# NOTE(review): fixed grace period, presumably to let the webhook start
# serving before the queue resources below are applied - confirm.
sleep 20
# Create Cluster Queue - UPDATE MAX VALUES
# The manifest is fed to kubectl on stdin straight from the here-document.
kubectl apply --server-side -f - <<EOF
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: "cluster-queue"
spec:
  namespaceSelector: {} # match all.
  resourceGroups:
  - coveredResources: ["cpu", "memory", "pods"]
    flavors:
    - name: "default-flavor"
      resources:
      - name: "cpu"
        nominalQuota: 16
      - name: "memory"
        nominalQuota: 64000Mi
      - name: "pods"
        nominalQuota: 5
EOF
echo "Cluster Queue: cluster-queue applied!"
|
||
# Create the ResourceFlavor named default-flavor.
echo "Applying Resource Flavor"
kubectl apply --server-side -f - <<EOF
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: default-flavor
EOF
echo "Resource Flavour: default-flavor applied!"
|
||
# Create the LocalQueue in the torchx-dev namespace, pointing at the
# ClusterQueue named cluster-queue.
kubectl apply --server-side -f - <<EOF
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  namespace: torchx-dev
  name: torchx-local-queue
spec:
  clusterQueue: cluster-queue
EOF
echo "Local Queue: torchx-local-queue applied!"
|
||
# portforwarding
# Forward local port 5000 to port 80 of the registry service in kube-system;
# started in the background so the script does not block on it.
kubectl --namespace kube-system port-forward service/registry 5000:80 &
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.