diff --git a/tests/README.md b/tests/README.md
index e396e92d..d5a0dcda 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -1,118 +1,23 @@
 # Running Tests Manually
 
-Manually running tests requires use of the [opendatahub-io/peak](https://github.com/opendatahub-io/peak) project.
-
 ## Prerequisites
 
 * Admin access to an OpenShift cluster ([CRC](https://developers.redhat.com/products/openshift-local/overview) is fine)
 
-* Mac users may need to do the following:
-
-```bash
-brew install coreutils
-ln -s /usr/local/bin/greadlink /usr/local/bin/readlink
-```
-
-* NOTE: The tests can be flaky when run from macOS; running them from Linux is recommended.
-
 * If you run these tests in a local cluster and have not deployed the Open Data Hub on your OpenShift cluster:
 
 ```bash
-# Install CodeFlare operator
-oc apply -f https://raw.githubusercontent.com/opendatahub-io/distributed-workloads/main/tests/resources/codeflare-subscription.yaml
-
-installPlanName=$(oc get installplans -n openshift-operators -o jsonpath='{.items[?(@.metadata.ownerReferences[0].name=="codeflare-operator")].metadata.name}')
-oc patch installplan $installPlanName -n openshift-operators --type merge -p '{"spec":{"approved":true}}'
-
-# Install ODH operator and wait for the deploy/opendatahub-operator-controller-manager in the openshift-operators namespace to become available
-oc apply -f https://raw.githubusercontent.com/opendatahub-io/distributed-workloads/main/tests/resources/odh-subscription.yaml
-
-# Deploy Open Data Hub core components
-oc new-project opendatahub
-oc apply -f https://raw.githubusercontent.com/opendatahub-io/odh-manifests/master/kfdef/odh-core.yaml -n opendatahub
-
-# Deploy CodeFlare stack
-oc apply -f https://raw.githubusercontent.com/opendatahub-io/distributed-workloads/main/codeflare-stack-kfdef.yaml
-```
-
-## Bash tests - Setup
-
-Clone the [opendatahub-io/peak](https://github.com/opendatahub-io/peak) project anywhere you like in your working environment, but do not clone it into the `distributed-workloads` directory.
-
-```bash
-git clone https://github.com/opendatahub-io/peak
-cd peak
-```
-
-Next, update the peak project with its submodule dependencies, specifically [opendatahub-io/openshift-test-kit](https://github.com/opendatahub-io/openshift-test-kit/tree/0e469c4bf967b531780eb05d6b96463214288db7), which is defined in the `.gitmodules` file.
-
-```bash
-git submodule update --init
-```
-
-Now we need to pull the `distributed-workloads` project into the peak repo for testing. This is done by creating a file, `my-list`, that contains the repository name you want to use, the channel, the repo's location (either a GitHub URL or a relative path to a local directory), and the branch name.
-
-For example, if you cloned peak at the same directory level as `distributed-workloads`, you would create the `my-list` file as follows:
-
-```bash
-echo distributed-workloads nil ../distributed-workloads main > my-list
-```
-
-Now we can set up the tests.
-
-```bash
-./setup.sh -t my-list
-```
-
-This should create a `distributed-workloads` directory inside the `operator-tests` directory of the peak repo.
-
-## Bash tests - Running Tests
-
-`run.sh` searches the `operator-tests` directory for the `*.sh` file name we pass to it as an argument. In this case, we want to run the `distributed-workloads.sh` script.
-
-```bash
-./run.sh distributed-workloads.sh
-```
-
-If everything is working correctly, you should see output similar to the following:
-
-```bash
-Running example test
-
-Running operator-tests/distributed-workloads/tests/basictests/distributed-workloads.sh:15: executing 'oc project opendatahub' expecting success...
-
-
-✔ SUCCESS after 0.184s: operator-tests/distributed-workloads/tests/basictests/distributed-workloads.sh:15: executing 'oc project opendatahub' expecting success
-
-Running operator-tests/distributed-workloads/tests/basictests/distributed-workloads.sh:16: executing 'oc get pods' expecting success...
-
-
-✔ SUCCESS after 0.127s: operator-tests/distributed-workloads/tests/basictests/distributed-workloads.sh:16: executing 'oc get pods' expecting success
+# Go to the root folder of the repository
+cd ..
+# Install CodeFlare operator
+make install-codeflare-operator
-Installing Codeflare Operator
-
-
-Installing distributed workloads kfdef
-
-
-Testing MCAD TorchX Functionality
-
-
-Testing MCAD Ray Functionality
-
-
-Uninstalling distributed workloads kfdef
-
-
-Uninstalling Codeflare Operator
-
-```
-
-In some cases, your cluster may not have the default user/password combination (admin/admin). In that situation, you can pass in a custom user and password when running the tests:
+# Install ODH operator
+make install-opendatahub-operator
-```bash
-OPENSHIFT_TESTUSER_NAME= OPENSHIFT_TESTUSER_PASS= ./run.sh distributed-workloads.sh
+# Deploy ODH and CodeFlare stack
+make deploy-codeflare
 ```
 
 ## Go tests - Setup
diff --git a/tests/basictests/distributed-workloads.sh b/tests/basictests/distributed-workloads.sh
deleted file mode 100755
index 9cd119ae..00000000
--- a/tests/basictests/distributed-workloads.sh
+++ /dev/null
@@ -1,165 +0,0 @@
-#!/bin/bash
-
-source $TEST_DIR/common
-
-MY_DIR=$(readlink -f `dirname "${BASH_SOURCE[0]}"`)
-
-RESOURCEDIR="${MY_DIR}/../resources"
-
-source ${MY_DIR}/../util
-
-TEST_USER=${OPENSHIFT_TESTUSER_NAME:-"admin"}
-TEST_PASS=${OPENSHIFT_TESTUSER_PASS:-"admin"}
-OPENSHIFT_OAUTH_ENDPOINT="https://$(oc get route -n openshift-authentication oauth-openshift -o json | jq -r '.spec.host')"
-
-os::test::junit::declare_suite_start "$MY_SCRIPT"
-
-function check_distributed_workloads_kfdef(){
-    header "Checking distributed workloads stack"
-
-    # Ensure that KubeRay pods start
-    os::cmd::try_until_text "oc get pod -n ${ODHPROJECT} |grep kuberay-operator | awk '{print \$2}'" "1/1" $odhdefaulttimeout $odhdefaultinterval
-
-    # Ensure the codeflare-notebook imagestream is there
-    os::cmd::expect_success_and_text "oc get imagestreams -n ${ODHPROJECT} codeflare-notebook --no-headers=true |awk '{print \$1}'" "codeflare-notebook"
-
-    # Add additional role required by notebook sa
-    oc adm policy add-role-to-user admin -n ${ODHPROJECT} --rolebinding-name "admin-$TEST_USER" $TEST_USER
-    oc adm policy add-role-to-user kuberay-operator -n ${ODHPROJECT} --rolebinding-name "kuberay-operator-$TEST_USER" $TEST_USER
-}
-
-function test_mcad_torchx_functionality() {
-    header "Testing MCAD TorchX Functionality"
-
-    ########### Clean Cluster should be free of these resources ############
-    # Get appwrapper name
-    AW=$(oc get appwrapper.workload.codeflare.dev -n ${ODHPROJECT} | grep mnistjob | cut -d ' ' -f 1) || true
-    # Clean up resources
-    if [[ -n $AW ]]; then
-        os::cmd::expect_success "oc delete appwrapper.workload.codeflare.dev $AW -n ${ODHPROJECT} || true"
-    fi
-    os::cmd::expect_success "oc delete notebook jupyter-nb-kube-3aadmin -n ${ODHPROJECT} || true"
- os::cmd::expect_success "oc delete cm notebooks-mcad -n ${ODHPROJECT} || true" - os::cmd::expect_success "oc delete pvc jupyterhub-nb-kube-3aadmin-pvc -n ${ODHPROJECT} || true" - ############################################################################## - - # Wait for the notebook controller ready - os::cmd::try_until_text "oc get deployment odh-notebook-controller-manager -n ${ODHPROJECT} --no-headers=true | awk '{print \$2}'" "1/1" $odhdefaulttimeout $odhdefaultinterval - - # Create a mnist_ray_mini.ipynb as a configMap - os::cmd::expect_success "oc create configmap notebooks-mcad -n ${ODHPROJECT} --from-file=${RESOURCEDIR}/mnist_mcad_mini.ipynb" - - # Get Token - local TESTUSER_BEARER_TOKEN="$(curl -skiL -u $TEST_USER:$TEST_PASS -H 'X-CSRF-Token: xxx' "$OPENSHIFT_OAUTH_ENDPOINT/oauth/authorize?response_type=token&client_id=openshift-challenging-client" | grep -oP 'access_token=\K[^&]*')" - - # Spawn notebook-server using the codeflare custom nb image - os::cmd::expect_success "cat ${RESOURCEDIR}/custom-nb-small.yaml \ - | sed s/%INGRESS%/$(oc get ingresses.config/cluster -o jsonpath={.spec.domain})/g \ - | sed s/%OCPSERVER%/$(oc whoami --show-server=true|cut -f3 -d "/")/g \ - | sed s/%OCPTOKEN%/${TESTUSER_BEARER_TOKEN}/g \ - | sed s/%NAMESPACE%/${ODHPROJECT}/g \ - | sed s/%JOBTYPE%/mcad/g | oc apply -n ${ODHPROJECT} -f -" - - # Wait for the notebook-server to be ready - os::cmd::try_until_text "oc get pod -n ${ODHPROJECT} | grep "jupyter-nb-kube-3aadmin" | awk '{print \$2}'" "2/2" $odhdefaulttimeout $odhdefaultinterval - - # Wait for appwrapper to exist - os::cmd::try_until_text "oc get appwrapper.workload.codeflare.dev -n ${ODHPROJECT} | grep mnistjob" "mnistjob-*" $odhdefaulttimeout $odhdefaultinterval - - # Get appwrapper name - AW=$(oc get appwrapper.workload.codeflare.dev -n ${ODHPROJECT} | grep mnistjob | cut -d ' ' -f 1) - - # Wait for the mnisttest appwrapper state to become running - os::cmd::try_until_text "oc get appwrapper.workload.codeflare.dev $AW -n ${ODHPROJECT} -ojsonpath='{.status.state}'" "Running" $odhdefaulttimeout $odhdefaultinterval - - # Wait for workload to succeed and clean up - os::cmd::try_until_text "oc get appwrapper.workload.codeflare.dev $AW -n ${ODHPROJECT}" "*NotFound*" $odhdefaulttimeout $odhdefaultinterval - - # Test clean up resources - os::cmd::expect_success "oc delete notebook jupyter-nb-kube-3aadmin -n ${ODHPROJECT}" - os::cmd::expect_failure "oc get notebook jupyter-nb-kube-3aadmin -n ${ODHPROJECT}" - - os::cmd::expect_success "oc delete cm notebooks-mcad -n ${ODHPROJECT} || true" - os::cmd::expect_failure "oc get cm notebooks-mcad -n ${ODHPROJECT}" - - os::cmd::expect_success "oc delete appwrapper.workload.codeflare.dev $AW -n ${ODHPROJECT} || true" - os::cmd::expect_failure "oc get appwrapper.workload.codeflare.dev $AW -n ${ODHPROJECT}" - - os::cmd::expect_success "oc delete pvc jupyterhub-nb-kube-3aadmin-pvc -n ${ODHPROJECT} || true" - os::cmd::expect_failure "oc get pvc jupyterhub-nb-kube-3aadmin-pvc -n ${ODHPROJECT}" -} - -function test_mcad_ray_functionality() { - header "Testing MCAD Ray Functionality" - - ########### ToDo: Clean Cluster should be free of those resources ############ - # Clean up resources - os::cmd::expect_success "oc delete notebook jupyter-nb-kube-3aadmin -n ${ODHPROJECT} || true" - os::cmd::expect_success "oc delete cm notebooks-ray -n ${ODHPROJECT} || true" - os::cmd::expect_success "oc delete appwrapper.workload.codeflare.dev mnisttest -n ${ODHPROJECT} || true" - os::cmd::expect_success "oc delete raycluster 
mnisttest -n ${ODHPROJECT} || true" - os::cmd::expect_success "oc delete pvc jupyterhub-nb-kube-3aadmin-pvc -n ${ODHPROJECT} || true" - ############################################################################## - - # Wait for the notebook controller ready - os::cmd::try_until_text "oc get deployment odh-notebook-controller-manager -n ${ODHPROJECT} --no-headers=true | awk '{print \$2}'" "1/1" $odhdefaulttimeout $odhdefaultinterval - - # Create a mnist_ray_mini.ipynb as a configMap - os::cmd::expect_success "oc create configmap notebooks-ray -n ${ODHPROJECT} --from-file=${RESOURCEDIR}/mnist_ray_mini.ipynb --from-file=${RESOURCEDIR}/mnist.py --from-file=${RESOURCEDIR}/requirements.txt" - - # Get Token - local TESTUSER_BEARER_TOKEN="$(curl -skiL -u $TEST_USER:$TEST_PASS -H 'X-CSRF-Token: xxx' "$OPENSHIFT_OAUTH_ENDPOINT/oauth/authorize?response_type=token&client_id=openshift-challenging-client" | grep -oP 'access_token=\K[^&]*')" - - # Spawn notebook-server using the codeflare custom nb image - os::cmd::expect_success "cat ${RESOURCEDIR}/custom-nb-small.yaml \ - | sed s/%INGRESS%/$(oc get ingresses.config/cluster -o jsonpath={.spec.domain})/g \ - | sed s/%OCPSERVER%/$(oc whoami --show-server=true|cut -f3 -d "/")/g \ - | sed s/%OCPTOKEN%/${TESTUSER_BEARER_TOKEN}/g \ - | sed s/%NAMESPACE%/${ODHPROJECT}/g \ - | sed s/%JOBTYPE%/ray/g | oc apply -n ${ODHPROJECT} -f -" - - # Wait for the notebook-server to be ready - os::cmd::try_until_text "oc get pod -n ${ODHPROJECT} | grep "jupyter-nb-kube-3aadmin" | awk '{print \$2}'" "2/2" $odhdefaulttimeout $odhdefaultinterval - - # Wait for the mnisttest appwrapper state to become running - os::cmd::try_until_text "oc get appwrapper.workload.codeflare.dev mnisttest -n ${ODHPROJECT} -ojsonpath='{.status.state}'" "Running" $odhdefaulttimeout $odhdefaultinterval - - # Wait for Raycluster to be ready - os::cmd::try_until_text "oc get raycluster -n ${ODHPROJECT} mnisttest -ojsonpath='{.status.state}'" "ready" $odhdefaulttimeout $odhdefaultinterval - - # Wait for job to be completed and cleaned up - os::cmd::try_until_text "oc get appwrapper.workload.codeflare.dev mnisttest -n ${ODHPROJECT}" "*NotFound*" $odhdefaulttimeout $odhdefaultinterval - os::cmd::expect_failure "oc get raycluster mnisttest -n ${ODHPROJECT}" - - # Test clean up resources - os::cmd::expect_success "oc delete notebook jupyter-nb-kube-3aadmin -n ${ODHPROJECT}" - os::cmd::expect_failure "oc get notebook jupyter-nb-kube-3aadmin -n ${ODHPROJECT}" - - os::cmd::expect_success "oc delete cm notebooks-ray -n ${ODHPROJECT} || true" - os::cmd::expect_failure "oc get cm notebooks-ray -n ${ODHPROJECT}" - - os::cmd::expect_success "oc delete appwrapper.workload.codeflare.dev mnisttest -n ${ODHPROJECT} || true" - os::cmd::expect_failure "oc get appwrapper.workload.codeflare.dev mnisttest -n ${ODHPROJECT}" - - os::cmd::expect_success "oc delete raycluster mnisttest -n ${ODHPROJECT} || true" - os::cmd::expect_failure "oc get raycluster mnisttest -n ${ODHPROJECT}" - - os::cmd::expect_success "oc delete pvc jupyterhub-nb-kube-3aadmin-pvc -n ${ODHPROJECT} || true" - os::cmd::expect_failure "oc get pvc jupyterhub-nb-kube-3aadmin-pvc -n ${ODHPROJECT}" - -} - -function clean_permissions() { - header "Cleaning extra admin roles" - oc adm policy remove-role-from-user admin -n ${ODHPROJECT} $TEST_USER - oc adm policy remove-role-from-user kuberay-operator -n ${ODHPROJECT} $TEST_USER -} - - -check_distributed_workloads_kfdef -test_mcad_torchx_functionality -test_mcad_ray_functionality -clean_permissions - - 
-os::test::junit::declare_suite_end diff --git a/tests/basictests/ray.sh b/tests/basictests/ray.sh deleted file mode 100755 index f47c3e88..00000000 --- a/tests/basictests/ray.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash - -source $TEST_DIR/common - -MY_DIR=$(readlink -f `dirname "${BASH_SOURCE[0]}"`) - -RESOURCEDIR="${MY_DIR}/../resources" - -source ${MY_DIR}/../util - -os::test::junit::declare_suite_start "$MY_SCRIPT" - -function check_ray_operator() { - header "Testing Ray Operator" - os::cmd::expect_success "oc project ${ODHPROJECT}" - os::cmd::try_until_text "oc get crd rayclusters.ray.io" "rayclusters.ray.io" $odhdefaulttimeout $odhdefaultinterval - os::cmd::try_until_text "oc get role kuberay-operator-leader-election" "kuberay-operator-leader-election" $odhdefaulttimeout $odhdefaultinterval - os::cmd::try_until_text "oc get rolebinding kuberay-operator-leader-election" "kuberay-operator-leader-election" $odhdefaulttimeout $odhdefaultinterval - os::cmd::try_until_text "oc get sa kuberay-operator" "kuberay-operator" $odhdefaulttimeout $odhdefaultinterval - os::cmd::try_until_text "oc get deployment kuberay-operator" "kuberay-operator" $odhdefaulttimeout $odhdefaultinterval - os::cmd::try_until_text "oc get pods -l app.kubernetes.io/component=kuberay-operator --field-selector='status.phase=Running' -o jsonpath='{$.items[*].metadata.name}' | wc -w" "1" $odhdefaulttimeout $odhdefaultinterval -} - -function start_test_ray_cluster(){ - header "Starting Ray Cluster" - os::cmd::expect_success "oc project ${ODHPROJECT}" - os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/ray/ray-test-cluster-test.yaml" - os::cmd::try_until_text "oc get RayCluster kuberay-cluster-test" "kuberay-cluster-test" $odhdefaulttimeout $odhdefaultinterval - os::cmd::try_until_text "oc get pods -l ray.io/identifier=kuberay-cluster-test-head -o jsonpath='{$.items[*].status.phase}'" "Running" $odhdefaulttimeout $odhdefaultinterval - os::cmd::try_until_text "oc get pods -l ray.io/identifier=kuberay-cluster-test-worker -o jsonpath='{$.items[*].status.phase}'" "Running" $odhdefaulttimeout $odhdefaultinterval -} - -function check_functionality(){ - header "Testing Ray Functionality" - os::cmd::expect_success "oc project ${ODHPROJECT}" - os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/ray/ray-simple-test.yaml" - sleep 30 - os::cmd::try_until_text "oc get pods -l app=ray-simple-test -o jsonpath='{$.items[*].status.containerStatuses[0].lastState.terminated.exitCode}'" "" $odhdefaulttimeout $odhdefaultinterval - os::cmd::try_until_text "oc get pods -l app=ray-simple-test -o jsonpath='{$.items[*].status.containerStatuses[0].restartCount}'" "0" $odhdefaulttimeout $odhdefaultinterval - pod_name=($(oc get pods -l app=ray-simple-test -o jsonpath='{$.items[*].metadata.name}')) - os::cmd::try_until_text "oc logs ${pod_name} | grep 'Simple tests passed'" "Simple tests passed" $odhdefaulttimeout $odhdefaultinterval -} - -function clean_up_ray_cluster(){ - header "Cleaning up Ray cluster" - os::cmd::expect_success "oc project ${ODHPROJECT}" - os::cmd::expect_success "oc delete deployment ray-simple-test -n ${ODHPROJECT}" - os::cmd::expect_success "oc delete RayCluster kuberay-cluster-test -n ${ODHPROJECT}" -} - -check_ray_operator -start_test_ray_cluster -check_functionality -clean_up_ray_cluster - -os::test::junit::declare_suite_end diff --git a/tests/resources/codeflare-subscription.yaml b/tests/resources/codeflare-subscription.yaml deleted file mode 100644 index b97f08d4..00000000 --- 
a/tests/resources/codeflare-subscription.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: operators.coreos.com/v1alpha1 -kind: Subscription -metadata: - name: codeflare-operator - namespace: openshift-operators -spec: - channel: alpha - name: codeflare-operator - source: community-operators - sourceNamespace: openshift-marketplace - installPlanApproval: Manual - startingCSV: codeflare-operator.v0.2.3 diff --git a/tests/resources/custom-nb-small.yaml b/tests/resources/custom-nb-small.yaml deleted file mode 100644 index 540f012a..00000000 --- a/tests/resources/custom-nb-small.yaml +++ /dev/null @@ -1,177 +0,0 @@ -# This template maybe used to spin up a custom notebook image -# i.e.: sed s/%INGRESS%/$(oc get ingresses.config/cluster -o jsonpath={.spec.domain})/g tests/resources/custom-nb.template | oc apply -f - -# resources generated: -# pod/jupyter-nb-kube-3aadmin-0 -# service/jupyter-nb-kube-3aadmin -# route.route.openshift.io/jupyter-nb-kube-3aadmin (jupyter-nb-kube-3aadmin-opendatahub.apps.tedbig412.cp.fyre.ibm.com) -# service/jupyter-nb-kube-3aadmin-tls -apiVersion: kubeflow.org/v1 -kind: Notebook -metadata: - annotations: - notebooks.opendatahub.io/inject-oauth: "true" - notebooks.opendatahub.io/last-image-selection: codeflare-notebook:v0.7.1 - notebooks.opendatahub.io/last-size-selection: Small - notebooks.opendatahub.io/oauth-logout-url: https://odh-dashboard-%NAMESPACE%.%INGRESS%/notebookController/kube-3aadmin/home - opendatahub.io/link: https://jupyter-nb-kube-3aadmin-%NAMESPACE%.%INGRESS%/notebook/%NAMESPACE%/jupyter-nb-kube-3aadmin - opendatahub.io/username: kube:admin - generation: 1 - labels: - app: jupyter-nb-kube-3aadmin - opendatahub.io/dashboard: "true" - opendatahub.io/odh-managed: "true" - opendatahub.io/user: kube-3aadmin - name: jupyter-nb-kube-3aadmin - namespace: %NAMESPACE% -spec: - template: - spec: - affinity: - nodeAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - preference: - matchExpressions: - - key: nvidia.com/gpu.present - operator: NotIn - values: - - "true" - weight: 1 - containers: - - env: - - name: NOTEBOOK_ARGS - value: |- - --ServerApp.port=8888 - --ServerApp.token='' - --ServerApp.password='' - --ServerApp.base_url=/notebook/%NAMESPACE%/jupyter-nb-kube-3aadmin - --ServerApp.quit_button=False - --ServerApp.tornado_settings={"user":"kube-3aadmin","hub_host":"https://odh-dashboard-%NAMESPACE%.%INGRESS%","hub_prefix":"/notebookController/kube-3aadmin"} - - name: JUPYTER_IMAGE - value: image-registry.openshift-image-registry.svc:5000/%NAMESPACE%/codeflare-notebook:v0.7.1 - - name: JUPYTER_NOTEBOOK_PORT - value: "8888" - - name: OCP_SERVER - value: https://%OCPSERVER% - - name: OCP_TOKEN - value: %OCPTOKEN% - image: image-registry.openshift-image-registry.svc:5000/%NAMESPACE%/codeflare-notebook:v0.7.1 - command: ["/bin/sh", "-c", "pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks-%JOBTYPE%/mnist_%JOBTYPE%_mini.ipynb /opt/app-root/src/mcad-out.ipynb && sleep infinity"] - # args: ["pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks/mcad.ipynb /opt/app-root/src/mcad-out.ipynb" ] - imagePullPolicy: Always - # livenessProbe: - # failureThreshold: 3 - # httpGet: - # path: /notebook/%NAMESPACE%/jupyter-nb-kube-3aadmin/api - # port: notebook-port - # scheme: HTTP - # initialDelaySeconds: 10 - # periodSeconds: 5 - # successThreshold: 1 - # timeoutSeconds: 1 - 
name: jupyter-nb-kube-3aadmin - ports: - - containerPort: 8888 - name: notebook-port - protocol: TCP - resources: - limits: - cpu: "2" - memory: 3Gi - requests: - cpu: "1" - memory: 3Gi - volumeMounts: - - mountPath: /opt/app-root/src - name: jupyterhub-nb-kube-3aadmin-pvc - - mountPath: /opt/app-root/notebooks-%JOBTYPE% - name: notebooks-%JOBTYPE% - workingDir: /opt/app-root/src - - args: - - --provider=openshift - - --https-address=:8443 - - --http-address= - - --openshift-service-account=jupyter-nb-kube-3aadmin - - --cookie-secret-file=/etc/oauth/config/cookie_secret - - --cookie-expire=24h0m0s - - --tls-cert=/etc/tls/private/tls.crt - - --tls-key=/etc/tls/private/tls.key - - --upstream=http://localhost:8888 - - --upstream-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt - - --skip-auth-regex=^(?:/notebook/$(NAMESPACE)/jupyter-nb-kube-3aadmin)?/api$ - - --email-domain=* - - --skip-provider-button - - --openshift-sar={"verb":"get","resource":"notebooks","resourceAPIGroup":"kubeflow.org","resourceName":"jupyter-nb-kube-3aadmin","namespace":"$(NAMESPACE)"} - - --logout-url=https://odh-dashboard-%NAMESPACE%.%INGRESS%/notebookController/kube-3aadmin/home - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - image: registry.redhat.io/openshift4/ose-oauth-proxy:v4.10 - imagePullPolicy: Always - livenessProbe: - failureThreshold: 3 - httpGet: - path: /oauth/healthz - port: oauth-proxy - scheme: HTTPS - initialDelaySeconds: 30 - periodSeconds: 5 - successThreshold: 1 - timeoutSeconds: 1 - name: oauth-proxy - ports: - - containerPort: 8443 - name: oauth-proxy - protocol: TCP - readinessProbe: - failureThreshold: 3 - httpGet: - path: /oauth/healthz - port: oauth-proxy - scheme: HTTPS - initialDelaySeconds: 5 - periodSeconds: 5 - successThreshold: 1 - timeoutSeconds: 1 - resources: - limits: - cpu: 100m - memory: 64Mi - requests: - cpu: 100m - memory: 64Mi - volumeMounts: - - mountPath: /etc/oauth/config - name: oauth-config - - mountPath: /etc/tls/private - name: tls-certificates - enableServiceLinks: false - serviceAccountName: jupyter-nb-kube-3aadmin - volumes: - - name: jupyterhub-nb-kube-3aadmin-pvc - persistentVolumeClaim: - claimName: jupyterhub-nb-kube-3aadmin-pvc - - name: oauth-config - secret: - defaultMode: 420 - secretName: jupyter-nb-kube-3aadmin-oauth-config - - name: tls-certificates - secret: - defaultMode: 420 - secretName: jupyter-nb-kube-3aadmin-tls - - name: notebooks-%JOBTYPE% - configMap: - name: notebooks-%JOBTYPE% ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: jupyterhub-nb-kube-3aadmin-pvc - namespace: %NAMESPACE% -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 10Gi diff --git a/tests/resources/mcad-mnist-tests-Containerfile b/tests/resources/mcad-mnist-tests-Containerfile deleted file mode 100644 index 38eb064b..00000000 --- a/tests/resources/mcad-mnist-tests-Containerfile +++ /dev/null @@ -1,9 +0,0 @@ -FROM ghcr.io/pytorch/torchx:0.5.0dev0 - -ADD mnist.py /app/mnist.py - -# change group permissions for running in OCP -RUN chgrp -R 0 /app -RUN chmod -R g+w /app - -ENTRYPOINT ["python", "/app/mnist.py"] diff --git a/tests/resources/mnist.py b/tests/resources/mnist.py deleted file mode 100644 index 0c7750d4..00000000 --- a/tests/resources/mnist.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright 2022 IBM, Red Hat -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# In[] -import os - -import torch -from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.callbacks.progress import TQDMProgressBar -from pytorch_lightning.loggers import CSVLogger -from torch import nn -from torch.nn import functional as F -from torch.utils.data import DataLoader, random_split -from torchmetrics import Accuracy -from torchvision import transforms -from torchvision.datasets import MNIST - -PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") -BATCH_SIZE = 256 if torch.cuda.is_available() else 64 -# %% - -print("prior to running the trainer") -print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) -print("MASTER_PORT: is ", os.getenv("MASTER_PORT")) - - -class LitMNIST(LightningModule): - def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): - - super().__init__() - - # Set our init args as class attributes - self.data_dir = data_dir - self.hidden_size = hidden_size - self.learning_rate = learning_rate - - # Hardcode some dataset specific attributes - self.num_classes = 10 - self.dims = (1, 28, 28) - channels, width, height = self.dims - self.transform = transforms.Compose( - [ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)), - ] - ) - - # Define PyTorch model - self.model = nn.Sequential( - nn.Flatten(), - nn.Linear(channels * width * height, hidden_size), - nn.ReLU(), - nn.Dropout(0.1), - nn.Linear(hidden_size, hidden_size), - nn.ReLU(), - nn.Dropout(0.1), - nn.Linear(hidden_size, self.num_classes), - ) - - self.val_accuracy = Accuracy() - self.test_accuracy = Accuracy() - - def forward(self, x): - x = self.model(x) - return F.log_softmax(x, dim=1) - - def training_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - return loss - - def validation_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - preds = torch.argmax(logits, dim=1) - self.val_accuracy.update(preds, y) - - # Calling self.log will surface up scalars for you in TensorBoard - self.log("val_loss", loss, prog_bar=True) - self.log("val_acc", self.val_accuracy, prog_bar=True) - - def test_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - preds = torch.argmax(logits, dim=1) - self.test_accuracy.update(preds, y) - - # Calling self.log will surface up scalars for you in TensorBoard - self.log("test_loss", loss, prog_bar=True) - self.log("test_acc", self.test_accuracy, prog_bar=True) - - def configure_optimizers(self): - optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) - return optimizer - - #################### - # DATA RELATED HOOKS - #################### - - def prepare_data(self): - # download - print("Downloading MNIST dataset...") - MNIST(self.data_dir, train=True, download=True) - MNIST(self.data_dir, train=False, download=True) - - def setup(self, stage=None): - - # Assign train/val datasets for use in dataloaders - if stage == "fit" or stage is None: - mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) - self.mnist_train, self.mnist_val = 
random_split(mnist_full, [55000, 5000]) - - # Assign test dataset for use in dataloader(s) - if stage == "test" or stage is None: - self.mnist_test = MNIST( - self.data_dir, train=False, transform=self.transform - ) - - def train_dataloader(self): - return DataLoader(self.mnist_train, batch_size=BATCH_SIZE) - - def val_dataloader(self): - return DataLoader(self.mnist_val, batch_size=BATCH_SIZE) - - def test_dataloader(self): - return DataLoader(self.mnist_test, batch_size=BATCH_SIZE) - - -# Init DataLoader from MNIST Dataset - -model = LitMNIST() - -print("GROUP: ", int(os.environ.get("GROUP_WORLD_SIZE", 1))) -print("LOCAL: ", int(os.environ.get("LOCAL_WORLD_SIZE", 1))) - -# Initialize a trainer -trainer = Trainer( - accelerator="auto", - # devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs - max_epochs=5, - callbacks=[TQDMProgressBar(refresh_rate=20)], - num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)), - devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)), - strategy="ddp", -) - -# Train the model ⚡ -trainer.fit(model) diff --git a/tests/resources/mnist_mcad_mini.ipynb b/tests/resources/mnist_mcad_mini.ipynb deleted file mode 100644 index 9e3deae8..00000000 --- a/tests/resources/mnist_mcad_mini.ipynb +++ /dev/null @@ -1,80 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Import pieces from codeflare-sdk\n", - "from codeflare_sdk.job.jobs import DDPJobDefinition\n", - "from time import sleep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47ca5c15", - "metadata": {}, - "outputs": [], - "source": [ - "job = DDPJobDefinition(name=\"mnistjob\", script=\"mnist.py\", scheduler_args={\"namespace\": \"opendatahub\"}, j=\"1x1\", gpu=0, cpu=3, memMB=4000, image=\"quay.io/project-codeflare/mnist-job-test:v0.0.1\").submit()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d24e9f95", - "metadata": {}, - "outputs": [], - "source": [ - "finished = False\n", - "while not finished:\n", - " sleep(1)\n", - " try:\n", - " finished = (\"Epoch 4: 100%\" in job.logs())\n", - " except:\n", - " finished = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f078b7cd", - "metadata": {}, - "outputs": [], - "source": [ - "job.cancel()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - }, - "vscode": { - "interpreter": { - "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tests/resources/mnist_ray_mini.ipynb b/tests/resources/mnist_ray_mini.ipynb deleted file mode 100644 index cdc93aa2..00000000 --- a/tests/resources/mnist_ray_mini.ipynb +++ /dev/null @@ -1,142 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Import pieces from codeflare-sdk\n", - "from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n", - "from codeflare_sdk.job.jobs import DDPJobDefinition\n", - "from time 
import sleep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f4bc870-091f-4e11-9642-cba145710159", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Create our cluster and submit appwrapper\n", - "cluster = Cluster(ClusterConfiguration(namespace='opendatahub', name='mnisttest', num_workers=2, min_cpus=2, max_cpus=2, min_memory=4, max_memory=4, num_gpus=0, instascale=False))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Bring up the cluster\n", - "cluster.up()\n", - "sleep(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a99d5aff", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "cluster.wait_ready()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df71c1ed", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "cluster.status()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "cluster.details()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47ca5c15", - "metadata": {}, - "outputs": [], - "source": [ - "job = DDPJobDefinition(name=\"mnisttest\", script=\"mnist.py\", workspace=\"file:///opt/app-root/notebooks-ray/..data\", scheduler_args={\"requirements\": \"/opt/app-root/notebooks-ray/requirements.txt\"}).submit(cluster)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f63a178a", - "metadata": {}, - "outputs": [], - "source": [ - "finished = False\n", - "while not finished:\n", - " sleep(1)\n", - " status = job.status()\n", - " finished = (str(status.state) == \"SUCCEEDED\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6b099777", - "metadata": {}, - "outputs": [], - "source": [ - "cluster.down()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - }, - "vscode": { - "interpreter": { - "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tests/resources/odh-subscription.yaml b/tests/resources/odh-subscription.yaml deleted file mode 100644 index 04eb310a..00000000 --- a/tests/resources/odh-subscription.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: operators.coreos.com/v1alpha1 -kind: Subscription -metadata: - labels: - operators.coreos.com/opendatahub-operator.openshift-operators: "" - name: opendatahub-operator - namespace: openshift-operators -spec: - channel: stable - name: opendatahub-operator - source: community-operators - sourceNamespace: openshift-marketplace diff --git a/tests/resources/requirements.txt b/tests/resources/requirements.txt deleted file mode 100644 index 073e4247..00000000 --- a/tests/resources/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -pytorch_lightning==1.5.10 -ray_lightning -torchmetrics==0.9.1 -torchvision==0.12.0 \ No newline at end of file