From 4d7c43316321fe76b677904c4b42d163fb8efc81 Mon Sep 17 00:00:00 2001 From: Vicente Zepeda Mas Date: Thu, 22 Jun 2023 13:45:32 +0200 Subject: [PATCH 01/26] Removes snappy usage, no longer needed Signed-off-by: Vicente Zepeda Mas --- dags/nocp/scripts/run_ocm_api_load.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dags/nocp/scripts/run_ocm_api_load.sh b/dags/nocp/scripts/run_ocm_api_load.sh index db4b96a6c..2f8afe1ac 100755 --- a/dags/nocp/scripts/run_ocm_api_load.sh +++ b/dags/nocp/scripts/run_ocm_api_load.sh @@ -89,15 +89,12 @@ run_ocm_api_load(){ # Timeout runs ocm-load-test for the specified duration even if airflow killed this script (when user wants to stop benchmark execution). This helps in ocm-load-test to cleanup resources it created. 10 minutes extra timeout is set so that test can prepare results after running for the given duration. # kill-after option needs sudo permissions - timeout --kill-after=60s --preserve-status $(((tduration + 10) * 60)) $TESTDIR/build/ocm-load-test --aws-region $AWS_DEFAULT_REGION --aws-account-id $AWS_ACCOUNT_ID --aws-access-key $AWS_OSDCCADMIN_KEY --aws-access-secret $AWS_OSDCCADMIN_SECRET --cooldown $COOLDOWN --duration $tduration --elastic-index ocm-request-test --elastic-insecure-skip-verify=true --elastic-server $ES_SERVER --gateway-url $GATEWAY_URL --ocm-token $OCM_TOKEN --ocm-token-url $OCM_TOKEN_URL --output-path $TESTDIR/results --rate $trate --test-id $UUID --test-names $tname $rampoptions + timeout --kill-after=60s --preserve-status $(((tduration + 10) * 60)) $TESTDIR/build/ocm-load-test --aws-region $AWS_DEFAULT_REGION --aws-account-id $AWS_ACCOUNT_ID --aws-access-key $AWS_OSDCCADMIN_KEY --aws-access-secret $AWS_OSDCCADMIN_SECRET --cooldown $COOLDOWN --duration $tduration --elastic-index ocm-load-metrics --elastic-insecure-skip-verify=true --elastic-server $ES_SERVER --gateway-url $GATEWAY_URL --ocm-token $OCM_TOKEN --ocm-token-url $OCM_TOKEN_URL --output-path $TESTDIR/results --rate $trate --test-id $UUID --test-names $tname $rampoptions sleep $COOLDOWN done benchmark_rv=$? end_time=$(date +%s) - echo "Uploading Result files..." 
- python3 $TESTDIR/automation.py upload --dir $TESTDIR/results --server ${SNAPPY_DATA_SERVER_URL} --user ${SNAPPY_DATA_SERVER_USERNAME} --password ${SNAPPY_DATA_SERVER_PASSWORD} - # scraping metrics export KUBE_ES_INDEX=ocm-uhc-acct-mngr envsubst < $TESTDIR/ci/templates/kube-burner-config.yaml > $TESTDIR/kube-burner-am-config.yaml From 96e05e21f7f4697276ab40ca4185e5ec83f2269a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Sevilla?= Date: Thu, 22 Jun 2023 15:01:34 +0200 Subject: [PATCH 02/26] Remove cluster-density from DAGs (#328) Signed-off-by: Raul Sevilla --- .../config/benchmarks/large-control-plane-mgs.json | 7 +------ .../config/benchmarks/large-control-plane.json | 7 +------ .../config/benchmarks/medium-control-plane-mgs.json | 7 +------ .../config/benchmarks/medium-control-plane.json | 7 +------ .../config/benchmarks/small-control-plane-mgs.json | 5 ----- .../config/benchmarks/small-control-plane.json | 5 ----- 6 files changed, 4 insertions(+), 34 deletions(-) diff --git a/dags/openshift_nightlies/config/benchmarks/large-control-plane-mgs.json b/dags/openshift_nightlies/config/benchmarks/large-control-plane-mgs.json index 4c1778967..0b86525a0 100644 --- a/dags/openshift_nightlies/config/benchmarks/large-control-plane-mgs.json +++ b/dags/openshift_nightlies/config/benchmarks/large-control-plane-mgs.json @@ -3,12 +3,7 @@ { "name": "node-density", "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=2h --pod-ready-threshold=10s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" - }, - { - "name": "cluster-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density --uuid=${UUID} --iterations=4000 --timeout=6h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=3h --pod-ready-threshold=10s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" }, { "name": "cluster-density-v2", diff --git a/dags/openshift_nightlies/config/benchmarks/large-control-plane.json b/dags/openshift_nightlies/config/benchmarks/large-control-plane.json index 0b3e2771c..62dcd771f 100644 --- a/dags/openshift_nightlies/config/benchmarks/large-control-plane.json +++ b/dags/openshift_nightlies/config/benchmarks/large-control-plane.json @@ -33,12 +33,7 @@ { "name": "node-density", "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=2h --pod-ready-threshold=10s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" - }, - { - "name": "cluster-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density --uuid=${UUID} --iterations=4000 --timeout=6h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=3h --pod-ready-threshold=10s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" }, { "name": "cluster-density-v2", diff --git a/dags/openshift_nightlies/config/benchmarks/medium-control-plane-mgs.json b/dags/openshift_nightlies/config/benchmarks/medium-control-plane-mgs.json index 42831e9a6..c0ffa3198 100644 --- a/dags/openshift_nightlies/config/benchmarks/medium-control-plane-mgs.json +++ b/dags/openshift_nightlies/config/benchmarks/medium-control-plane-mgs.json @@ 
-3,12 +3,7 @@ { "name": "node-density", "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=2h --pod-ready-threshold=5s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" - }, - { - "name": "cluster-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density --uuid=${UUID} --iterations=1000 --timeout=6h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=3h --pod-ready-threshold=5s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" }, { "name": "cluster-density-v2", diff --git a/dags/openshift_nightlies/config/benchmarks/medium-control-plane.json b/dags/openshift_nightlies/config/benchmarks/medium-control-plane.json index 93f1de4f1..3c619e4ed 100644 --- a/dags/openshift_nightlies/config/benchmarks/medium-control-plane.json +++ b/dags/openshift_nightlies/config/benchmarks/medium-control-plane.json @@ -23,12 +23,7 @@ { "name": "node-density", "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=2h --pod-ready-threshold=5s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" - }, - { - "name": "cluster-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density --uuid=${UUID} --iterations=1000 --timeout=6h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=3h --pod-ready-threshold=5s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" }, { "name": "cluster-density-v2", diff --git a/dags/openshift_nightlies/config/benchmarks/small-control-plane-mgs.json b/dags/openshift_nightlies/config/benchmarks/small-control-plane-mgs.json index 239b4e05b..4fb13d562 100644 --- a/dags/openshift_nightlies/config/benchmarks/small-control-plane-mgs.json +++ b/dags/openshift_nightlies/config/benchmarks/small-control-plane-mgs.json @@ -15,11 +15,6 @@ "workload": "kube-burner", "custom_cmd": "kube-burner ocp node-density-cni --uuid=${UUID} --pods-per-node=245 --timeout=2h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" }, - { - "name": "cluster-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density --uuid=${UUID} --iterations=500 --timeout=6h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" - }, { "name": "cluster-density-v2", "workload": "kube-burner", diff --git a/dags/openshift_nightlies/config/benchmarks/small-control-plane.json b/dags/openshift_nightlies/config/benchmarks/small-control-plane.json index d82833074..f34a77254 100644 --- a/dags/openshift_nightlies/config/benchmarks/small-control-plane.json +++ b/dags/openshift_nightlies/config/benchmarks/small-control-plane.json @@ -25,11 +25,6 @@ "workload": "kube-burner", "custom_cmd": "kube-burner ocp node-density-cni --uuid=${UUID} --pods-per-node=245 --timeout=2h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" }, - { - "name": "cluster-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density --uuid=${UUID} --iterations=500 --timeout=6h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" - }, { 
"name": "cluster-density-v2", "workload": "kube-burner", From 163441cc015a839ce04c7d19e1987137d33fd706 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Sevilla?= Date: Mon, 26 Jun 2023 13:35:17 +0200 Subject: [PATCH 03/26] Version v1.7.1 (#332) Signed-off-by: Raul Sevilla --- images/airflow/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/airflow/Dockerfile b/images/airflow/Dockerfile index 52f400766..df1ce250f 100644 --- a/images/airflow/Dockerfile +++ b/images/airflow/Dockerfile @@ -14,7 +14,7 @@ RUN curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | b ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 -RUN curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.6/kube-burner-1.6-Linux-x86_64.tar.gz | tar xz -C /usr/bin kube-burner +RUN curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.7.1/kube-burner-1.7.1-Linux-x86_64.tar.gz | tar xz -C /usr/bin kube-burner RUN curl -L https://github.com/jtaleric/k8s-netperf/releases/download/v0.0.7/k8s-netperf_0.0.7_linux_amd64.tar.gz | tar xz -C /usr/bin k8s-netperf USER airflow RUN pip install prometheus-api-client elasticsearch apache-airflow-providers-elasticsearch apache-airflow-providers-cncf-kubernetes --upgrade From 76df166f344cf1eaf8877a01cff855eb1a0aca10 Mon Sep 17 00:00:00 2001 From: "Joe Talerico (rook)" Date: Mon, 26 Jun 2023 12:24:28 -0400 Subject: [PATCH 04/26] Removing workload nodes from ROSA (#331) - We have removed workload from self-managed, we should remove from ROSA as well. Signed-off-by: Joe Talerico --- .../scripts/install/rosa.sh | 42 ------------------- 1 file changed, 42 deletions(-) diff --git a/dags/openshift_nightlies/scripts/install/rosa.sh b/dags/openshift_nightlies/scripts/install/rosa.sh index 18e644133..5a1037306 100755 --- a/dags/openshift_nightlies/scripts/install/rosa.sh +++ b/dags/openshift_nightlies/scripts/install/rosa.sh @@ -105,38 +105,6 @@ _login_check(){ echo "Failed to login after 100 attempts with 5 sec interval" } -_wait_for_workload_nodes_ready(){ - _download_kubeconfig "$(_get_cluster_id $1)" ./kubeconfig - export KUBECONFIG=./kubeconfig - ALL_READY_ITERATIONS=0 - ITERATIONS=0 - # Node count is number of workload nodes, which is 3 - NODES_COUNT=3 - # 180 seconds per node, waiting 5 times 60 seconds (5*60 = 5 minutes) with all nodes ready to finalize - while [ ${ITERATIONS} -le ${NODES_COUNT} ] ; do - NODES_READY_COUNT=$(oc get nodes | grep -i workload | grep " Ready " | wc -l) - if [ ${NODES_READY_COUNT} -ne ${NODES_COUNT} ] ; then - echo "WARNING: ${ITERATIONS}/${NODES_COUNT} iterations. ${NODES_READY_COUNT}/${NODES_COUNT} nodes ready. Waiting 180 seconds for next check" - ALL_READY_ITERATIONS=0 - ITERATIONS=$((${ITERATIONS}+1)) - sleep 180 - else - if [ ${ALL_READY_ITERATIONS} -eq 5 ] ; then - echo "INFO: ${ALL_READY_ITERATIONS}/5. All nodes ready, continuing process" - return 0 - else - echo "INFO: ${ALL_READY_ITERATIONS}/5. All nodes ready. Waiting 60 seconds for next check" - ALL_READY_ITERATIONS=$((${ALL_READY_ITERATIONS}+1)) - sleep 60 - fi - fi - done - END_CLUSTER_STATUS="Ready. No Workers" - echo "ERROR: Workload nodes (${NODES_READY_COUNT}/${NODES_COUNT}) are ready after about $((${NODES_COUNT}*3)) minutes, dumping oc get nodes..." 
- oc get nodes - exit 1 -} - _wait_for_cluster_ready(){ START_TIMER=$(date +%s) echo "INFO: Installation starts at $(date -d @${START_TIMER})" @@ -437,7 +405,6 @@ install(){ postinstall(){ # sleeping to address issue #324 sleep 120 - export WORKLOAD_TYPE=$(cat ${json_file} | jq -r .openshift_workload_node_instance_type) export EXPIRATION_TIME=$(cat ${json_file} | jq -r .rosa_expiration_time) _download_kubeconfig "$(_get_cluster_id ${CLUSTER_NAME})" ./kubeconfig unset KUBECONFIG @@ -447,10 +414,6 @@ postinstall(){ export PASSWORD=$(echo ${CLUSTER_NAME} | md5sum | awk '{print $1}') ocm create idp -n localauth -t htpasswd --username kubeadmin --password ${PASSWORD} -c ${CLUSTER_NAME} ocm create user kubeadmin -c "$(_get_cluster_id ${CLUSTER_NAME})" --group=cluster-admins - if [[ $WORKLOAD_TYPE != "null" ]]; then - # create machinepool for workload nodes - ocm create machinepool -c ${CLUSTER_NAME} --instance-type ${WORKLOAD_TYPE} --labels 'node-role.kubernetes.io/workload=' --taints 'role=workload:NoSchedule' --replicas 3 workload - fi # set expiration time EXPIRATION_STRING=$(date -d "${EXPIRATION_TIME} minutes" '+{"expiration_timestamp": "%FT%TZ"}') ocm patch /api/clusters_mgmt/v1/clusters/"$(_get_cluster_id ${CLUSTER_NAME})" <<< ${EXPIRATION_STRING} @@ -460,10 +423,6 @@ postinstall(){ URL=$(rosa describe cluster -c $CLUSTER_NAME --output json | jq -r ".api.url") PASSWORD=$(rosa create admin -c "$(_get_cluster_id ${CLUSTER_NAME})" -y 2>/dev/null | grep "oc login" | awk '{print $7}') if [ $HCP == "true" ]; then _login_check $URL $PASSWORD; fi - if [[ $WORKLOAD_TYPE != "null" ]]; then - # create machinepool for workload nodes - rosa create machinepool -c ${CLUSTER_NAME} --instance-type ${WORKLOAD_TYPE} --name workload --labels node-role.kubernetes.io/workload= --taints role=workload:NoSchedule --replicas 3 - fi # set expiration to 24h rosa edit cluster -c "$(_get_cluster_id ${CLUSTER_NAME})" --expiration=${EXPIRATION_TIME}m fi @@ -685,7 +644,6 @@ if [[ "$operation" == "install" ]]; then install index_metadata fi - if [[ $WORKLOAD_TYPE != "null" ]]; then _wait_for_workload_nodes_ready ${CLUSTER_NAME}; fi elif [ "${CLUSTER_STATUS}" == "ready" ] ; then printf "INFO: Cluster ${CLUSTER_NAME} already installed and ready, reusing..." postinstall From 59337ca5ba7be18790f8c85ab79d31f5d868c3d2 Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy <70236227+mukrishn@users.noreply.github.com> Date: Tue, 27 Jun 2023 11:53:54 -0400 Subject: [PATCH 05/26] delete and retry errored rosa cluster (#333) * delete and retry errored rosa cluster * force a failure * corrected indentation --- dags/openshift_nightlies/scripts/install/rosa.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dags/openshift_nightlies/scripts/install/rosa.sh b/dags/openshift_nightlies/scripts/install/rosa.sh index 5a1037306..3676b2455 100755 --- a/dags/openshift_nightlies/scripts/install/rosa.sh +++ b/dags/openshift_nightlies/scripts/install/rosa.sh @@ -646,10 +646,15 @@ if [[ "$operation" == "install" ]]; then fi elif [ "${CLUSTER_STATUS}" == "ready" ] ; then printf "INFO: Cluster ${CLUSTER_NAME} already installed and ready, reusing..." - postinstall + postinstall + elif [ "${CLUSTER_STATUS}" == "error" ] ; then + printf "INFO: Cluster ${CLUSTER_NAME} errored, cleaning them now..." + cleanup + printf "INFO: Fail this install to re-try a fresh install" + exit 1 else printf "INFO: Cluster ${CLUSTER_NAME} already installed but not ready, exiting..." 
- exit 1 + exit 1 fi elif [[ "$operation" == "cleanup" ]]; then From 81a0039b47478aa20501511932ccf59c4c86cb1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Sevilla?= Date: Wed, 28 Jun 2023 11:40:35 +0200 Subject: [PATCH 06/26] The current k8s-netperf statement is breaking the image build process (#334) Signed-off-by: Raul Sevilla --- images/airflow/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/airflow/Dockerfile b/images/airflow/Dockerfile index df1ce250f..166050650 100644 --- a/images/airflow/Dockerfile +++ b/images/airflow/Dockerfile @@ -15,6 +15,6 @@ RUN curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | b ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 RUN curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.7.1/kube-burner-1.7.1-Linux-x86_64.tar.gz | tar xz -C /usr/bin kube-burner -RUN curl -L https://github.com/jtaleric/k8s-netperf/releases/download/v0.0.7/k8s-netperf_0.0.7_linux_amd64.tar.gz | tar xz -C /usr/bin k8s-netperf +RUN curl -L https://github.com/cloud-bulldozer/k8s-netperf/releases/download/v0.1.11/k8s-netperf_Linux_v0.1.11_x86_64.tar.gz | tar xz -C /usr/bin k8s-netperf USER airflow RUN pip install prometheus-api-client elasticsearch apache-airflow-providers-elasticsearch apache-airflow-providers-cncf-kubernetes --upgrade From b154dc160476379019f41349391b3832b8301ace Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy <70236227+mukrishn@users.noreply.github.com> Date: Wed, 5 Jul 2023 13:44:01 -0400 Subject: [PATCH 07/26] `oc login` is not required (#336) It has KUBECONFIG env var set already, oc login is not required again. --- .../scripts/utils/run_scale_ci_diagnosis.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dags/openshift_nightlies/scripts/utils/run_scale_ci_diagnosis.sh b/dags/openshift_nightlies/scripts/utils/run_scale_ci_diagnosis.sh index dd08950c9..3f388b222 100755 --- a/dags/openshift_nightlies/scripts/utils/run_scale_ci_diagnosis.sh +++ b/dags/openshift_nightlies/scripts/utils/run_scale_ci_diagnosis.sh @@ -27,10 +27,6 @@ setup(){ curl -sS https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-client-linux.tar.gz | tar xz oc export PATH=$PATH:$(pwd) - - if [[ ! 
-z "$KUBEADMIN_PASSWORD" ]]; then - oc login -u kubeadmin -p $KUBEADMIN_PASSWORD --insecure-skip-tls-verify - fi } setup From d778a4aabd36171185c409587173e44693842e62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Sevilla?= Date: Thu, 6 Jul 2023 09:48:42 +0200 Subject: [PATCH 08/26] Bump to airflow 2.6.2 (#335) * Bump to airflow 2.6.2 Signed-off-by: Raul Sevilla * Update nocp manifest Signed-off-by: Raul Sevilla * Remove default value Signed-off-by: Raul Sevilla * Golang 1.19 Signed-off-by: Raul Sevilla * Remove conflicting markupsafe Signed-off-by: Raul Sevilla * Bump kube-burner Signed-off-by: Raul Sevilla * Replace kube-burner location Signed-off-by: Raul Sevilla * Fix pydantic issue Signed-off-by: Raul Sevilla --------- Signed-off-by: Raul Sevilla --- Makefile | 2 +- dags/common/models/dag_config.py | 2 +- dags/nocp/manifest.yaml | 2 +- dags/openshift_nightlies/manifest.yaml | 2 +- dags/requirements.txt | 6 ++++++ dags/setup.cfg | 9 --------- dags/tox.ini | 8 +------- images/airflow-ansible/Dockerfile | 2 +- images/airflow-managed-services/Dockerfile | 6 +++--- images/airflow/Dockerfile | 3 +-- scripts/common.sh | 4 ++-- scripts/playground/templates/airflow.yaml | 6 ++++-- scripts/tenant/templates/airflow.yaml | 4 ++-- 13 files changed, 24 insertions(+), 32 deletions(-) create mode 100644 dags/requirements.txt diff --git a/Makefile b/Makefile index 85e7c7cec..88136590c 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ QUAY_ACCOUNT ?= quay.io/cloud-bulldozer IMAGE_BUILDER ?= podman -AIRFLOW_VERSION ?= 2.3.2 +AIRFLOW_VERSION ?= 2.6.2 AIRFLOW_PYTHON_VERSION ?= python3.8 AIRFLOW_IMAGE_TAG ?= $(AIRFLOW_VERSION)-$(AIRFLOW_PYTHON_VERSION) IMAGE_TAG ?= $(AIRFLOW_VERSION) diff --git a/dags/common/models/dag_config.py b/dags/common/models/dag_config.py index e3e3cd587..9611eeccd 100644 --- a/dags/common/models/dag_config.py +++ b/dags/common/models/dag_config.py @@ -20,6 +20,6 @@ class DagConfig: }) executor_image: Optional[dict] = field(default_factory=lambda: { "repository": "quay.io/cloud-bulldozer", - "tag": "2.3.2" + "tag": "2.6.2" }) dependencies: Optional[dict] = field(default_factory=lambda: {}) diff --git a/dags/nocp/manifest.yaml b/dags/nocp/manifest.yaml index 4583c4376..17debd8d9 100644 --- a/dags/nocp/manifest.yaml +++ b/dags/nocp/manifest.yaml @@ -7,4 +7,4 @@ dagConfig: cleanupOnSuccess: true executorImages: repository: quay.io/cloud-bulldozer - tag: 2.3.2 + tag: 2.6.2 diff --git a/dags/openshift_nightlies/manifest.yaml b/dags/openshift_nightlies/manifest.yaml index 85c9e9d89..0f836d8ce 100644 --- a/dags/openshift_nightlies/manifest.yaml +++ b/dags/openshift_nightlies/manifest.yaml @@ -25,7 +25,7 @@ dagConfig: cleanupOnSuccess: true executorImages: repository: quay.io/cloud-bulldozer - tag: 2.3.2 + tag: 2.6.2 dependencies: e2e_benchmarking: repo: https://github.com/cloud-bulldozer/e2e-benchmarking.git diff --git a/dags/requirements.txt b/dags/requirements.txt new file mode 100644 index 000000000..15f170356 --- /dev/null +++ b/dags/requirements.txt @@ -0,0 +1,6 @@ +kubernetes>=25.0.0 +apache-airflow==2.6.2 +prometheus-api-client==0.5.2 +elasticsearch==7.13.4 +apache-airflow-providers-slack==7.3.0 +pydantic>=1.10.0,<2.0.0 # https://github.com/apache/airflow/issues/32311 diff --git a/dags/setup.cfg b/dags/setup.cfg index f4b42bd6b..0c07ceaf7 100644 --- a/dags/setup.cfg +++ b/dags/setup.cfg @@ -20,15 +20,6 @@ home-page = https://github.com/cloud-bulldozer/benchmark-operator/cli zip_safe = False packages = find: include_package_data = True -# Add here dependencies of your 
project (semicolon/line-separated), e.g. -install_requires = - kubernetes>=25.0.0 - apache-airflow==2.3.2 - prometheus-api-client==0.5.2 - elasticsearch==7.13.4 - apache-airflow-providers-slack==7.1.0 - markupsafe==2.0.1 - python_requires = >=3.8 [options.extras_require] diff --git a/dags/tox.ini b/dags/tox.ini index e7b1fee95..cf7dc147d 100644 --- a/dags/tox.ini +++ b/dags/tox.ini @@ -8,13 +8,7 @@ extras = tests setenv = py{38,39}-unit: COVERAGE_FILE = .coverage.{envname} -deps = - kubernetes>=25.0.0 - apache-airflow==2.3.2 - prometheus-api-client==0.5.2 - elasticsearch==7.13.4 - apache-airflow-providers-slack==7.1.0 - markupsafe==2.0.1 +deps = -r{toxinidir}/requirements.txt python_requires = >=3.8 diff --git a/images/airflow-ansible/Dockerfile b/images/airflow-ansible/Dockerfile index 860271d70..e5b02608f 100644 --- a/images/airflow-ansible/Dockerfile +++ b/images/airflow-ansible/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=quay.io/cloud-bulldozer/airflow:2.3.2 +ARG BASE_IMAGE FROM ${BASE_IMAGE} USER root RUN apt install bc awscli -y diff --git a/images/airflow-managed-services/Dockerfile b/images/airflow-managed-services/Dockerfile index 8e0d2b186..5f306bc9a 100644 --- a/images/airflow-managed-services/Dockerfile +++ b/images/airflow-managed-services/Dockerfile @@ -1,7 +1,7 @@ -ARG BASE_IMAGE=quay.io/cloud-bulldozer/airflow:2.3.2 +ARG BASE_IMAGE=quay.io/cloud-bulldozer/airflow:latest # Hypershift Compilation -FROM golang:1.18 AS hypershift -RUN git clone --branch main https://github.com/openshift/hypershift +FROM golang:1.19 AS hypershift +RUN git clone --single-branch --branch main https://github.com/openshift/hypershift --depth=1 WORKDIR hypershift RUN make build # Runtime image diff --git a/images/airflow/Dockerfile b/images/airflow/Dockerfile index 166050650..c15ed949a 100644 --- a/images/airflow/Dockerfile +++ b/images/airflow/Dockerfile @@ -14,7 +14,6 @@ RUN curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | b ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 -RUN curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.7.1/kube-burner-1.7.1-Linux-x86_64.tar.gz | tar xz -C /usr/bin kube-burner +RUN curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.7.2/kube-burner-V1.7.2-linux-x86_64.tar.gz | tar xz -C /usr/bin kube-burner RUN curl -L https://github.com/cloud-bulldozer/k8s-netperf/releases/download/v0.1.11/k8s-netperf_Linux_v0.1.11_x86_64.tar.gz | tar xz -C /usr/bin k8s-netperf USER airflow -RUN pip install prometheus-api-client elasticsearch apache-airflow-providers-elasticsearch apache-airflow-providers-cncf-kubernetes --upgrade diff --git a/scripts/common.sh b/scripts/common.sh index b8cd6e20b..7d0228667 100644 --- a/scripts/common.sh +++ b/scripts/common.sh @@ -33,14 +33,14 @@ output_info() { _argo_password=$(kubectl get secret/argocd-initial-admin-secret -n argocd -o jsonpath='{.data.password}' | base64 --decode) printf "\n\n ArgoCD Configs" - printf "\n Host: $_argo_url \n User: $_argo_user \n Password: $_argo_password" + printf "\n Host: https://$_argo_url \n User: $_argo_user \n Password: $_argo_password" _airflow_url=$(oc get route/airflow -o jsonpath='{.spec.host}' -n $_airflow_ns) _airflow_user="admin" _airflow_password="REDACTED" printf "\n\n Airflow Configs (Password was user defined so this script doesn't know it!)" - printf "\n Host: $_airflow_url \n User: $_airflow_user \n Password: $_airflow_password\n\n" + printf "\n Host: https://$_airflow_url \n User: $_airflow_user \n Password: 
$_airflow_password\n\n" _results_dashboard_url=$(oc get route/perf-dashboard -o jsonpath='{.spec.host}' -n dashboard) if [ -z "$_results_dashboard_url" ]; then diff --git a/scripts/playground/templates/airflow.yaml b/scripts/playground/templates/airflow.yaml index 1adbaaa76..a06b4d8c7 100644 --- a/scripts/playground/templates/airflow.yaml +++ b/scripts/playground/templates/airflow.yaml @@ -22,8 +22,8 @@ spec: releaseName: airflow values: |- defaultAirflowRepository: quay.io/cloud-bulldozer/airflow - defaultAirflowTag: 2.3.2 - airflowVersion: 2.3.2 + defaultAirflowTag: 2.6.2 + airflowVersion: 2.6.2 executor: KubernetesExecutor createUserJob: useHelmHooks: false @@ -107,6 +107,8 @@ spec: name: airflow-webserver weight: 100 wildcardPolicy: None + tls: + termination: edge --- apiVersion: rbac.authorization.k8s.io/v1 kind: Role diff --git a/scripts/tenant/templates/airflow.yaml b/scripts/tenant/templates/airflow.yaml index f9e5a201f..fa717bf8b 100644 --- a/scripts/tenant/templates/airflow.yaml +++ b/scripts/tenant/templates/airflow.yaml @@ -23,8 +23,8 @@ spec: releaseName: airflow values: |- defaultAirflowRepository: quay.io/cloud-bulldozer/airflow - defaultAirflowTag: 2.3.2 - airflowVersion: 2.3.2 + defaultAirflowTag: 2.6.2 + airflowVersion: 2.6.2 executor: KubernetesExecutor images: airflow: From 12b5201bcc4092559d2f73e0c9838534cd5d5785 Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy <70236227+mukrishn@users.noreply.github.com> Date: Thu, 6 Jul 2023 12:03:35 -0400 Subject: [PATCH 09/26] updated workflow (#311) --- .github/workflows/release.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a34add4a2..1cc691064 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,10 +1,12 @@ name: Release Airflow Images on: + schedule: # triggers every midnight + - cron: '0 0 * * *' push: branches: - master tags: - - "*" # triggers only if push new tag version + - "*" # triggers on a push event jobs: containers: From 626d70e532e05efd14c26a1f13b88549f9fb2a81 Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy <70236227+mukrishn@users.noreply.github.com> Date: Thu, 6 Jul 2023 15:07:17 -0400 Subject: [PATCH 10/26] Hypershift: rosa cli to create multi az hosted cluster (#316) * rebased and squashed all commits * clean up & retry errored cluster during install * include multi-az for classic builds * revalidate oc login for 10 times * add extra machinepools and migrate infra pods (#15) * Revert "add extra machinepools and migrate infra pods (#15)" (#16) This reverts commit 2edac30d3fa098dcabdebbed661786044b23a406. 
* Update manifest.yaml --- .../benchmarks/hosted-control-plane-p75.json | 17 +- .../config/install/rosa/rosa-hcp-ovn.json | 5 +- .../scripts/install/rosa.sh | 313 ++++++++++++------ .../scripts/run_benchmark.sh | 2 +- .../tasks/benchmarks/e2e.py | 5 +- .../tasks/install/rosa/defaults.json | 4 +- 6 files changed, 222 insertions(+), 124 deletions(-) diff --git a/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p75.json b/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p75.json index f1ab97671..d64034267 100644 --- a/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p75.json +++ b/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p75.json @@ -2,21 +2,14 @@ "benchmarks": [ { "name": "cluster-density-ms-p75", - "workload": "kube-burner", + "workload": "kube-burner-ocp-wrapper", + "trigger_rule": "all_done", "command": "./run.sh", "env": { "WORKLOAD": "cluster-density-ms", - "JOB_ITERATIONS": "75", - "JOB_TIMEOUT": "18000", - "STEP_SIZE": "2m", - "HYPERSHIFT": "true", - "METRICS_PROFILE": "metrics-profiles/hypershift-metrics.yaml", - "QPS": "20", - "BURST": "20", - "LOG_LEVEL": "info", - "PLATFORM_ALERTS": "false", - "CLEANUP_WHEN_FINISH": "true", - "CLEANUP": "true" + "ITERATIONS": "75", + "LOG_LEVEL": "debug", + "EXTRA_FLAGS": "--churn-duration=1h --churn-percent=10 --churn-delay=30s" } } ] diff --git a/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json b/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json index 2802d6aca..c144f6820 100644 --- a/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json +++ b/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json @@ -7,16 +7,15 @@ "aws_region": "us-east-2", "rosa_environment": "staging", "rosa_cli_version": "master", + "ocm_cli_version": "v0.1.67", "ocm_environment": "stage", "managed_channel_group": "nightly", "managed_ocp_version": "latest", - "openshift_worker_count": 7, + "openshift_worker_count": 9, "openshift_network_type": "OVNKubernetes", "openshift_worker_instance_type": "m5.2xlarge", "machineset_metadata_label_prefix": "machine.openshift.io", - "staging_mgmt_cluster_name": "hs-mc-0vfs0e6gg", "staging_mgmt_provisioner_shards": "b4bb294b-a76c-11ed-91b2-0a580a831ba1", - "staging_svc_cluster_name": "hs-sc-0vfs0cl5g", "number_of_hostedcluster": 2, "hcp_install_interval": 60 } diff --git a/dags/openshift_nightlies/scripts/install/rosa.sh b/dags/openshift_nightlies/scripts/install/rosa.sh index 3676b2455..01a2f3317 100755 --- a/dags/openshift_nightlies/scripts/install/rosa.sh +++ b/dags/openshift_nightlies/scripts/install/rosa.sh @@ -41,19 +41,19 @@ _wait_for_nodes_ready(){ ITERATIONS=0 if [ $HCP == "true" ]; then NODES_COUNT=$2 - ALL_READY_ITERATIONS=5 + ALL_READY_ITERATIONS=4 #reduced extra buffers for hosted cp clusters else # Node count is number of workers + 3 masters + 3 infra NODES_COUNT=$(($2+6)) fi - # 180 seconds per node, waiting 5 times 60 seconds (5*60 = 5 minutes) with all nodes ready to finalize - while [ ${ITERATIONS} -le $((${NODES_COUNT}+2)) ] ; do + # 30 seconds per node, waiting for all nodes ready to finalize + while [ ${ITERATIONS} -le $((${NODES_COUNT}*5)) ] ; do NODES_READY_COUNT=$(oc get nodes | grep " Ready " | wc -l) if [ ${NODES_READY_COUNT} -ne ${NODES_COUNT} ] ; then - echo "WARNING: ${ITERATIONS}/${NODES_COUNT} iterations. ${NODES_READY_COUNT}/${NODES_COUNT} nodes ready. Waiting 180 seconds for next check" + echo "WARNING: ${ITERATIONS}/${NODES_COUNT} iterations. ${NODES_READY_COUNT}/${NODES_COUNT} nodes ready. 
Waiting 30 seconds for next check" # ALL_READY_ITERATIONS=0 ITERATIONS=$((${ITERATIONS}+1)) - sleep 180 + sleep 30 else if [ ${ALL_READY_ITERATIONS} -eq 5 ] ; then echo "INFO: ${ALL_READY_ITERATIONS}/5. All nodes ready, continuing process" @@ -73,7 +73,7 @@ _wait_for_nodes_ready(){ _aws_cmd(){ ITR=0 - while [ $ITR -le 20 ]; do + while [ $ITR -le 30 ]; do if [[ "$(aws ec2 $1 2>&1)" == *"error"* ]]; then echo "Failed to $1, retrying after 30 seconds" ITR=$(($ITR+1)) @@ -82,6 +82,8 @@ _aws_cmd(){ return 0 fi done + echo "Failed to $1 after 10 minutes of multiple retries" + exit 1 } _login_check(){ @@ -93,16 +95,46 @@ _login_check(){ echo "Attempt $ITR: Failed to login $1, retrying after 5 seconds" ITR=$(($ITR+1)) sleep 5 + RECHECK=1 + else + if [[ $RECHECK -eq 10 ]]; then + CURRENT_TIMER=$(date +%s) + # Time since rosa cluster is ready until all nodes are ready + DURATION=$(($CURRENT_TIMER - $START_TIMER)) + INDEXDATA+=("cluster_admin_login-${DURATION}") + _adm_logic_check $1 $2 + return 0 + else + echo "Rechecking login for $((10-$RECHECK)) more times" + RECHECK=$(($RECHECK+1)) + sleep 1 + fi + fi + done + END_CLUSTER_STATUS="Ready. Not Access" + echo "Failed to login after 100 attempts with 5 sec interval" +} + +_adm_logic_check(){ + ITR=1 + START_TIMER=$(date +%s) + while [ $ITR -le 100 ]; do + oc login $1 --username cluster-admin --password $2 --insecure-skip-tls-verify=true --request-timeout=30s + CHECK=$(oc adm top images 2>&1 > /dev/null) + if [[ $? != 0 ]]; then + echo "Attempt $ITR: Failed to login $1, retrying after 5 seconds" + ITR=$(($ITR+1)) + sleep 5 else CURRENT_TIMER=$(date +%s) # Time since rosa cluster is ready until all nodes are ready DURATION=$(($CURRENT_TIMER - $START_TIMER)) - INDEXDATA+=("login-${DURATION}") + INDEXDATA+=("cluster_oc_adm-${DURATION}") return 0 fi - done + done END_CLUSTER_STATUS="Ready. 
Not Access" - echo "Failed to login after 100 attempts with 5 sec interval" + echo "Failed to execute oc adm commands after 100 attempts with 5 sec interval" } _wait_for_cluster_ready(){ @@ -173,10 +205,6 @@ _wait_for_cluster_ready(){ _create_aws_vpc(){ - echo "Allocate Elastic IP" - aws ec2 allocate-address --tag-specifications ResourceType=elastic-ip,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=eip-$CLUSTER_NAME}]" --output json - export E_IP=$(aws ec2 describe-addresses --filters "Name=tag:Name,Values=eip-$CLUSTER_NAME" --output json | jq -r ".Addresses[0].AllocationId") - echo "Create Internet Gateway" aws ec2 create-internet-gateway --tag-specifications ResourceType=internet-gateway,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=igw-$CLUSTER_NAME}]" --output json export IGW=$(aws ec2 describe-internet-gateways --filters "Name=tag:Name,Values=igw-$CLUSTER_NAME" --output json | jq -r ".InternetGateways[0].InternetGatewayId") @@ -189,25 +217,39 @@ _create_aws_vpc(){ aws ec2 modify-vpc-attribute --vpc-id $VPC --enable-dns-hostnames "{\"Value\":true}" aws ec2 attach-internet-gateway --vpc-id $VPC --internet-gateway-id $IGW - echo "Create Subnets and Route tables" - aws ec2 create-subnet --vpc-id $VPC --cidr-block 10.0.1.0/24 --tag-specifications ResourceType=subnet,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=public-subnet-$CLUSTER_NAME}]" --output json - export PUB_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=public-subnet-$CLUSTER_NAME" --output json | jq -r ".Subnets[0].SubnetId") - aws ec2 create-nat-gateway --subnet-id $PUB_SUB --allocation-id $E_IP --tag-specifications ResourceType=natgateway,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=ngw-$CLUSTER_NAME}]" --output json - export NGW=$(aws ec2 describe-nat-gateways --filter "Name=tag:Name,Values=ngw-$CLUSTER_NAME" --output json | jq -r ".NatGateways[]" | jq -r 'select(.State == "available" or .State == "pending")' | jq -r ".NatGatewayId") aws ec2 create-route-table --vpc-id $VPC --tag-specifications ResourceType=route-table,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=public-rt-table-$CLUSTER_NAME}]" --output json export PUB_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME" --output json | jq -r '.RouteTables[0].RouteTableId') - aws ec2 associate-route-table --route-table-id $PUB_RT_TB --subnet-id $PUB_SUB aws ec2 create-route --route-table-id $PUB_RT_TB --destination-cidr-block 0.0.0.0/0 --gateway-id $IGW - aws ec2 create-subnet --vpc-id $VPC --cidr-block 10.0.2.0/24 --tag-specifications ResourceType=subnet,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=private-subnet-$CLUSTER_NAME}]" --output json - export PRI_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=private-subnet-$CLUSTER_NAME" --output json | jq -r ".Subnets[0].SubnetId") - aws ec2 create-route-table --vpc-id $VPC --tag-specifications ResourceType=route-table,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=private-rt-table-$CLUSTER_NAME}]" --output json - export PRI_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME" --output json | jq -r '.RouteTables[0].RouteTableId') - aws ec2 associate-route-table --route-table-id $PRI_RT_TB --subnet-id $PRI_SUB - aws ec2 create-route --route-table-id $PRI_RT_TB --destination-cidr-block 0.0.0.0/0 --gateway-id $NGW + ITR=0 + export ALL_PRI_RT_TB="" + for ZONE 
in a b c; + do + ITR=$((ITR+1)) + echo "Allocate Elastic IP" + aws ec2 allocate-address --tag-specifications ResourceType=elastic-ip,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=eip-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export E_IP=$(aws ec2 describe-addresses --filters "Name=tag:Name,Values=eip-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Addresses[0].AllocationId") + + echo "Create Subnets and Route tables" + aws ec2 create-subnet --vpc-id $VPC --cidr-block 10.0.$ITR.0/24 --availability-zone $AWS_REGION$ZONE --tag-specifications ResourceType=subnet,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export PUB_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") + aws ec2 create-nat-gateway --subnet-id $PUB_SUB --allocation-id $E_IP --tag-specifications ResourceType=natgateway,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export NGW=$(aws ec2 describe-nat-gateways --filter "Name=tag:Name,Values=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".NatGateways[]" | jq -r 'select(.State == "available" or .State == "pending")' | jq -r ".NatGatewayId") + echo "Wait until NatGateway $NGW is available" + aws ec2 wait nat-gateway-available --nat-gateway-ids $NGW + aws ec2 associate-route-table --route-table-id $PUB_RT_TB --subnet-id $PUB_SUB + + aws ec2 create-subnet --vpc-id $VPC --cidr-block 10.0.$((ITR+10)).0/24 --availability-zone $AWS_REGION$ZONE --tag-specifications ResourceType=subnet,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export PRI_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") + aws ec2 create-route-table --vpc-id $VPC --tag-specifications ResourceType=route-table,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export PRI_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].RouteTableId') + export ALL_PRI_RT_TB="${ALL_PRI_RT_TB} ${PRI_RT_TB}" + aws ec2 associate-route-table --route-table-id $PRI_RT_TB --subnet-id $PRI_SUB + aws ec2 create-route --route-table-id $PRI_RT_TB --destination-cidr-block 0.0.0.0/0 --gateway-id $NGW + done echo "Create private VPC endpoint to S3" - aws ec2 create-vpc-endpoint --vpc-id $VPC --service-name com.amazonaws.$AWS_REGION.s3 --route-table-ids $PRI_RT_TB --tag-specifications ResourceType=vpc-endpoint,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=vpce-$CLUSTER_NAME}]" + aws ec2 create-vpc-endpoint --vpc-id $VPC --service-name com.amazonaws.$AWS_REGION.s3 --route-table-ids $ALL_PRI_RT_TB --tag-specifications ResourceType=vpc-endpoint,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=vpce-$CLUSTER_NAME}]" } _delete_aws_vpc(){ @@ -218,29 +260,37 @@ _delete_aws_vpc(){ export VPCE=$(aws ec2 describe-vpc-endpoints --filters "Name=tag:Name,Values=vpce-$CLUSTER_NAME" --output json | jq -r '.VpcEndpoints[0].VpcEndpointId') if [ $VPCE != null ]; then _aws_cmd "delete-vpc-endpoints --vpc-endpoint-ids $VPCE"; fi - echo "Delete Subnets and 
Route tables" - export PRI_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME" --output json | jq -r '.RouteTables[0].RouteTableId') - export RT_TB_ASSO_ID=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME" --output json | jq -r '.RouteTables[0].Associations[0].RouteTableAssociationId') - export PRI_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=private-subnet-$CLUSTER_NAME" --output json | jq -r ".Subnets[0].SubnetId") - - if [ $PRI_RT_TB != null ]; then _aws_cmd "delete-route --route-table-id $PRI_RT_TB --destination-cidr-block 0.0.0.0/0"; fi - if [ $RT_TB_ASSO_ID != null ]; then _aws_cmd "disassociate-route-table --association-id $RT_TB_ASSO_ID"; fi - if [ $PRI_RT_TB != null ]; then _aws_cmd "delete-route-table --route-table-id $PRI_RT_TB"; fi - if [ $PRI_SUB != null ]; then _aws_cmd "delete-subnet --subnet-id $PRI_SUB"; fi + export ELB=$(aws elb describe-load-balancers --output json | jq -r '.LoadBalancerDescriptions[]'| jq -r 'select(.VPCId == '\"${VPC}\"')' | jq -r '.LoadBalancerName') + if [ $ELB != "" ]; then aws elb delete-load-balancer --load-balancer-name $ELB; fi + + for ZONE in a b c; + do + echo "Delete Subnets and Route tables" + export PRI_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].RouteTableId') + export RT_TB_ASSO_ID=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].Associations[0].RouteTableAssociationId') + export PRI_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") + + if [ $PRI_RT_TB != null ]; then _aws_cmd "delete-route --route-table-id $PRI_RT_TB --destination-cidr-block 0.0.0.0/0"; fi + if [ $RT_TB_ASSO_ID != null ]; then _aws_cmd "disassociate-route-table --association-id $RT_TB_ASSO_ID"; fi + if [ $PRI_RT_TB != null ]; then _aws_cmd "delete-route-table --route-table-id $PRI_RT_TB"; fi + if [ $PRI_SUB != null ]; then _aws_cmd "delete-subnet --subnet-id $PRI_SUB"; fi + + export RT_TB_ASSO_ID=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].Associations[].RouteTableAssociationId') + export NGW=$(aws ec2 describe-nat-gateways --filter "Name=tag:Name,Values=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".NatGateways[]" | jq -r 'select(.State == "available")' | jq -r ".NatGatewayId") + export PUB_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") + export E_IP=$(aws ec2 describe-addresses --filters "Name=tag:Name,Values=eip-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Addresses[0].AllocationId") + + if [ $RT_TB_ASSO_ID != null ]; then for _id in $RT_TB_ASSO_ID; do _aws_cmd "disassociate-route-table --association-id $_id"; done; fi + if [ $NGW != null ]; then _aws_cmd "delete-nat-gateway --nat-gateway-id $NGW"; fi + if [ $PUB_SUB != null ]; then _aws_cmd "delete-subnet --subnet-id $PUB_SUB"; fi + if [ $E_IP != null ]; then _aws_cmd "release-address --allocation-id $E_IP"; fi + done export PUB_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME" --output json | jq -r 
'.RouteTables[0].RouteTableId') - export RT_TB_ASSO_ID=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME" --output json | jq -r '.RouteTables[0].Associations[0].RouteTableAssociationId') - export NGW=$(aws ec2 describe-nat-gateways --filter "Name=tag:Name,Values=ngw-$CLUSTER_NAME" --output json | jq -r ".NatGateways[]" | jq -r 'select(.State == "available")' | jq -r ".NatGatewayId") - export PUB_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=public-subnet-$CLUSTER_NAME" --output json | jq -r ".Subnets[0].SubnetId") - export E_IP=$(aws ec2 describe-addresses --filters "Name=tag:Name,Values=eip-$CLUSTER_NAME" --output json | jq -r ".Addresses[0].AllocationId") - + if [ $PUB_RT_TB != null ]; then _aws_cmd "delete-route --route-table-id $PUB_RT_TB --destination-cidr-block 0.0.0.0/0"; fi - if [ $RT_TB_ASSO_ID != null ]; then _aws_cmd "disassociate-route-table --association-id $RT_TB_ASSO_ID"; fi if [ $PUB_RT_TB != null ]; then _aws_cmd "delete-route-table --route-table-id $PUB_RT_TB"; fi - if [ $NGW != null ]; then _aws_cmd "delete-nat-gateway --nat-gateway-id $NGW"; fi - if [ $PUB_SUB != null ]; then _aws_cmd "delete-subnet --subnet-id $PUB_SUB"; fi - if [ $E_IP != null ]; then _aws_cmd "release-address --allocation-id $E_IP"; fi - + export IGW=$(aws ec2 describe-internet-gateways --filters "Name=tag:Name,Values=igw-$CLUSTER_NAME" --output json | jq -r ".InternetGateways[0].InternetGatewayId") if [ $IGW != null ]; then _aws_cmd "detach-internet-gateway --internet-gateway-id $IGW --vpc-id $VPC"; fi if [ $IGW != null ]; then _aws_cmd "delete-internet-gateway --internet-gateway-id $IGW"; fi @@ -248,15 +298,15 @@ _delete_aws_vpc(){ echo "Delete Security Group Rules" for g in $(aws ec2 describe-security-groups --filters "Name=vpc-id,Values=$VPC" --output json | jq -r ".SecurityGroups[].GroupId"); do - for r in $(aws ec2 describe-security-group-rules --filters "Name=group-id,Values=$g" --output json | jq -r ".SecurityGroupRules[]" | jq -r "select(.IsEgress == false)" | jq -r ".SecurityGroupRuleId"); - do - aws ec2 revoke-security-group-ingress --security-group-rule-ids $r --group-id $g - done - - for r in $(aws ec2 describe-security-group-rules --filters "Name=group-id,Values=$g" --output json | jq -r ".SecurityGroupRules[]" | jq -r "select(.IsEgress == true)" | jq -r ".SecurityGroupRuleId"); - do - aws ec2 revoke-security-group-egress --security-group-rule-ids $r --group-id $g - done + for r in $(aws ec2 describe-security-group-rules --filters "Name=group-id,Values=$g" --output json | jq -r ".SecurityGroupRules[]" | jq -r "select(.IsEgress == false)" | jq -r ".SecurityGroupRuleId"); + do + aws ec2 revoke-security-group-ingress --security-group-rule-ids $r --group-id $g + done + + for r in $(aws ec2 describe-security-group-rules --filters "Name=group-id,Values=$g" --output json | jq -r ".SecurityGroupRules[]" | jq -r "select(.IsEgress == true)" | jq -r ".SecurityGroupRuleId"); + do + aws ec2 revoke-security-group-egress --security-group-rule-ids $r --group-id $g + done done for g in $(aws ec2 describe-security-groups --filters "Name=vpc-id,Values=$VPC" --output json | jq -r ".SecurityGroups[]" | jq -r 'select(.GroupName != "default")' | jq -r ".GroupId"); @@ -270,6 +320,31 @@ _delete_aws_vpc(){ fi } +_oidc_config(){ + echo "${1} OIDC config, with prefix ${2}" + if [[ $1 == "create" ]]; then + echo "${1} OIDC config" + rosa create oidc-config --mode=auto --managed=false --prefix ${2} -y + export OIDC_CONFIG=$(rosa list oidc-config | grep 
${2} | awk '{print$1}') + else + export OIDC_CONFIG=$(rosa list oidc-config | grep ${2} | awk '{print$1}') + if [ ! -z $OIDC_CONFIG ]; then rosa delete oidc-config --mode=auto --oidc-config-id ${OIDC_CONFIG} -y || true; fi # forcing exit 0, as this command may file if it is a shared oidc config + fi +} + +_get_sc_mc_details(){ + if [ -z $SVC_CLUSTER_NAME ]; then + echo "Find Service Cluster" + export SVC_CLUSTER_NAME=$(ocm describe cluster ${CLUSTER_NAME} | grep "Service Cluster" | awk '{print$3}') + fi + if [ -z $MGMT_CLUSTER_NAME ]; then + export MGMT_CLUSTER_NAME=$(ocm describe cluster ${CLUSTER_NAME} | grep "Management Cluster" | awk '{print$3}') + fi + echo "Read Management cluster details" + export MGMT_CLUSTER_DETAILS=$(ocm get /api/clusters_mgmt/v1/clusters | jq -r ".items[]" | jq -r 'select(.name == '\"$MGMT_CLUSTER_NAME\"')') + export NUMBER_OF_HC=$(cat ${json_file} | jq -r .number_of_hostedcluster) +} + setup(){ mkdir /home/airflow/workspace cd /home/airflow/workspace @@ -294,22 +369,24 @@ setup(){ export STAGE_CONFIG="" export MGMT_CLUSTER_NAME=$(cat ${json_file} | jq -r .staging_mgmt_cluster_name) export SVC_CLUSTER_NAME=$(cat ${json_file} | jq -r .staging_svc_cluster_name) + export STAGE_PROV_SHARD=$(cat ${json_file} | jq -r .staging_mgmt_provisioner_shards) + export OIDC_PREFIX=$(cat ${json_file} | jq -r .openshift_cluster_name) export CLUSTER_NAME="${CLUSTER_NAME}-${HOSTED_ID}" # perf-as3-hcp-1, perf-as3-hcp-2.. export KUBECONFIG_NAME=$(echo $KUBECONFIG_NAME | awk -F-kubeconfig '{print$1}')-$HOSTED_ID-kubeconfig export KUBEADMIN_NAME=$(echo $KUBEADMIN_NAME | awk -F-kubeadmin '{print$1}')-$HOSTED_ID-kubeadmin UUID=$(echo $AIRFLOW_CTX_DAG_RUN_ID | base64 | cut -c 1-32 ) - export UUID=${UUID,,} + export UUID=${UUID} fi + export OCM_CLI_VERSION=$(cat ${json_file} | jq -r .ocm_cli_version) + if [[ ${OCM_CLI_VERSION} != "container" ]]; then + OCM_CLI_FORK=$(cat ${json_file} | jq -r .ocm_cli_fork) + git clone -q --depth=1 --single-branch --branch ${OCM_CLI_VERSION} ${OCM_CLI_FORK} + pushd ocm-cli + sudo PATH=$PATH:/usr/bin:/usr/local/go/bin make + sudo mv ocm /usr/local/bin/ + popd + fi if [[ $INSTALL_METHOD == "osd" ]]; then - export OCM_CLI_VERSION=$(cat ${json_file} | jq -r .ocm_cli_version) - if [[ ${OCM_CLI_VERSION} != "container" ]]; then - OCM_CLI_FORK=$(cat ${json_file} | jq -r .ocm_cli_fork) - git clone -q --depth=1 --single-branch --branch ${OCM_CLI_VERSION} ${OCM_CLI_FORK} - pushd ocm-cli - sudo PATH=$PATH:/usr/bin:/usr/local/go/bin make - sudo mv ocm /usr/local/bin/ - popd - fi echo "Clean-up existing OSD access keys.." 
AWS_KEY=$(aws iam list-access-keys --user-name OsdCcsAdmin --output text --query 'AccessKeyMetadata[*].AccessKeyId') LEN_AWS_KEY=`echo $AWS_KEY | wc -w` @@ -356,6 +433,7 @@ setup(){ install(){ export COMPUTE_WORKERS_TYPE=$(cat ${json_file} | jq -r .openshift_worker_instance_type) export CLUSTER_AUTOSCALE=$(cat ${json_file} | jq -r .cluster_autoscale) + export OIDC_CONFIG=$(cat ${json_file} | jq -r .oidc_config) if [[ $INSTALL_METHOD == "osd" ]]; then if [ "${MANAGED_OCP_VERSION}" == "latest" ] ; then export OCM_VERSION=$(ocm list versions --channel-group ${MANAGED_CHANNEL_GROUP} | grep ^${version} | sort -rV | head -1) @@ -379,21 +457,24 @@ install(){ INSTALLATION_PARAMS="${INSTALLATION_PARAMS} --sts -m auto --yes" fi if [ $HCP == "true" ]; then - export STAGE_PROV_SHARD=$(cat ${json_file} | jq -r .staging_mgmt_provisioner_shards) - echo "Read Management cluster details" - export MGMT_CLUSTER_DETAILS=$(ocm get /api/clusters_mgmt/v1/clusters | jq -r ".items[]" | jq -r 'select(.name == '\"$MGMT_CLUSTER_NAME\"')') - export NUMBER_OF_HC=$(cat ${json_file} | jq -r .number_of_hostedcluster) - echo "Index Managment cluster info" - index_metadata "management" _create_aws_vpc echo "Set start time of prom scrape" export START_TIME=$(date +"%s") if [ $STAGE_PROV_SHARD != "" ]; then STAGE_CONFIG="--properties provision_shard_id:${STAGE_PROV_SHARD}" fi - ROSA_HCP_PARAMS="--hosted-cp ${STAGE_CONFIG} --subnet-ids $PRI_SUB,$PUB_SUB --machine-cidr 10.0.0.0/16" + ALL_SUBNETS=$(aws ec2 describe-subnets --filters "Name=tag:HostedClusterName,Values=$CLUSTER_NAME" --output json | jq -r ".Subnets[].SubnetId") + SUBNETS_IDS="" + for _ID in ${ALL_SUBNETS}; + do + if [[ ${SUBNETS_IDS} == "" ]]; then SUBNETS_IDS=${_ID}; else SUBNETS_IDS=${SUBNETS_IDS}","${_ID}; fi + done + ROSA_HCP_PARAMS="--hosted-cp ${STAGE_CONFIG} --subnet-ids ${SUBNETS_IDS} --machine-cidr 10.0.0.0/16" + export OIDC_CONFIG=$(rosa list oidc-config | grep $OIDC_PREFIX | awk '{print$1}') + if [ -z $OIDC_CONFIG ]; then _oidc_config create $OIDC_PREFIX; fi + ROSA_HCP_PARAMS="${ROSA_HCP_PARAMS} --oidc-config-id ${OIDC_CONFIG}" else - INSTALLATION_PARAMS="${INSTALLATION_PARAMS} --multi-az" # Multi AZ is not supported on hosted-cp cluster + INSTALLATION_PARAMS="${INSTALLATION_PARAMS} --multi-az" # Multi AZ is default on hosted-cp cluster fi rosa create cluster --tags=User:${GITHUB_USERNAME} --cluster-name ${CLUSTER_NAME} --version "${ROSA_VERSION}" --channel-group=${MANAGED_CHANNEL_GROUP} --compute-machine-type ${COMPUTE_WORKERS_TYPE} --replicas ${COMPUTE_WORKERS_NUMBER} --network-type ${NETWORK_TYPE} ${INSTALLATION_PARAMS} ${ROSA_HCP_PARAMS} fi @@ -407,6 +488,14 @@ postinstall(){ sleep 120 export EXPIRATION_TIME=$(cat ${json_file} | jq -r .rosa_expiration_time) _download_kubeconfig "$(_get_cluster_id ${CLUSTER_NAME})" ./kubeconfig + if [ $HCP == "true" ]; then + _get_sc_mc_details + echo "Index Managment cluster info" + index_metadata "management" + _download_kubeconfig "$(ocm list clusters --no-headers --columns id ${MGMT_CLUSTER_NAME})" ./mgmt_kubeconfig + kubectl delete secret staging-mgmt-cluster-kubeconfig || true + kubectl create secret generic staging-mgmt-cluster-kubeconfig --from-file=config=./mgmt_kubeconfig + fi unset KUBECONFIG kubectl delete secret ${KUBECONFIG_NAME} || true kubectl create secret generic ${KUBECONFIG_NAME} --from-file=config=./kubeconfig @@ -421,7 +510,11 @@ postinstall(){ aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id $AWS_ACCESS_KEY_ID || true else URL=$(rosa describe cluster -c $CLUSTER_NAME --output 
json | jq -r ".api.url") + START_TIMER=$(date +%s) PASSWORD=$(rosa create admin -c "$(_get_cluster_id ${CLUSTER_NAME})" -y 2>/dev/null | grep "oc login" | awk '{print $7}') + CURRENT_TIMER=$(date +%s) + DURATION=$(($CURRENT_TIMER - $START_TIMER)) + INDEXDATA+=("cluster_admin_create-${DURATION}") if [ $HCP == "true" ]; then _login_check $URL $PASSWORD; fi # set expiration to 24h rosa edit cluster -c "$(_get_cluster_id ${CLUSTER_NAME})" --expiration=${EXPIRATION_TIME}m @@ -484,22 +577,20 @@ EOF ) INSTALL_TIME=0 TOTAL_TIME=0 + WORKER_READY_TIME=0 for i in "${INDEXDATA[@]}" ; do IFS="-" ; set -- $i METADATA="${METADATA}, \"$1\":\"$2\"" - if [ $1 != "day2operations" ] && [ $1 != "login" ] ; then - INSTALL_TIME=$((${INSTALL_TIME} + $2)) - elif [ $1 == "day2operations" ]; then - WORKER_READY_TIME=$2 - elif [ $1 == "login" ]; then - LOGIN_TIME=$2 - else - TOTAL_TIME=$2 - fi + if [ $1 != "day2operations" ] && [ $1 != "login" ] ; then + INSTALL_TIME=$((${INSTALL_TIME} + $2)) + elif [ $1 == "day2operations" ]; then + WORKER_READY_TIME=$2 + else + TOTAL_TIME=$2 + fi done IFS=" " METADATA="${METADATA}, \"duration\":\"${INSTALL_TIME}\"" METADATA="${METADATA}, \"workers_ready\":\"$(($INSTALL_TIME + $WORKER_READY_TIME))\"" - METADATA="${METADATA}, \"cluster-admin-login\":\"${LOGIN_TIME}\"" METADATA="${METADATA} }" else METADATA=$(cat << EOF @@ -533,7 +624,6 @@ EOF "master_count": "$(oc get node -l node-role.kubernetes.io/master= --no-headers --ignore-not-found 2>/dev/null | wc -l)", "worker_count": "${COMPUTE_WORKERS_NUMBER}", "infra_count": "$(oc get node -l node-role.kubernetes.io/infra= --no-headers --ignore-not-found 2>/dev/null | wc -l)", -"workload_count": "$(oc get node -l node-role.kubernetes.io/workload= --no-headers --ignore-not-found 2>/dev/null | wc -l)", "total_node_count": "$(oc get nodes 2>/dev/null | wc -l)", "ocp_cluster_name": "$(oc get infrastructure.config.openshift.io cluster -o json 2>/dev/null | jq -r .status.infrastructureName)", "cluster_name": "${CLUSTER_NAME}", @@ -545,12 +635,12 @@ EOF TOTAL_TIME=0 for i in "${INDEXDATA[@]}" ; do IFS="-" ; set -- $i METADATA="${METADATA}, \"$1\":\"$2\"" - if [ $1 != "day2operations" ] && [ $1 != "cleanup" ] ; then - INSTALL_TIME=$((${INSTALL_TIME} + $2)) - TOTAL_TIME=$((${TOTAL_TIME} + $2)) - else - TOTAL_TIME=$((${TOTAL_TIME} + $2)) - fi + if [ $1 != "day2operations" ] && [ $1 != "cleanup" ] ; then + INSTALL_TIME=$((${INSTALL_TIME} + $2)) + TOTAL_TIME=$((${TOTAL_TIME} + $2)) + else + TOTAL_TIME=$((${TOTAL_TIME} + $2)) + fi done IFS=" " METADATA="${METADATA}, \"install_time\":\"${INSTALL_TIME}\"" @@ -567,27 +657,40 @@ index_mgmt_cluster_stat(){ echo "Indexing Management cluster stat..." 
cd /home/airflow/workspace echo "Installing kube-burner" - export KUBE_BURNER_RELEASE=${KUBE_BURNER_RELEASE:-1.3} + _download_kubeconfig "$(ocm list clusters --no-headers --columns id ${MGMT_CLUSTER_NAME})" ./mgmt_kubeconfig + export KUBE_BURNER_RELEASE=${KUBE_BURNER_RELEASE:-1.5} curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v${KUBE_BURNER_RELEASE}/kube-burner-${KUBE_BURNER_RELEASE}-Linux-x86_64.tar.gz -o kube-burner.tar.gz sudo tar -xvzf kube-burner.tar.gz -C /usr/local/bin/ - echo "Cloning ${E2E_BENCHMARKING_REPO} from branch ${E2E_BENCHMARKING_BRANCH}" git clone -q -b ${E2E_BENCHMARKING_BRANCH} ${E2E_BENCHMARKING_REPO} --depth=1 --single-branch - export MGMT_CLUSTER_NAME="$MGMT_CLUSTER_NAME.*" - export SVC_CLUSTER_NAME="$SVC_CLUSTER_NAME.*" - export HOSTED_CLUSTER_NS=".*$CLUSTER_NAME" - export HOSTED_CLUSTER_NAME="$1-$CLUSTER_NAME" - export Q_TIME=$(date +"%s") - envsubst < /home/airflow/workspace/e2e-benchmarking/workloads/kube-burner/metrics-profiles/hypershift-metrics.yaml > hypershift-metrics.yaml + METRIC_PROFILE=/home/airflow/workspace/e2e-benchmarking/workloads/kube-burner-ocp-wrapper/metrics-profiles/mc-metrics.yml envsubst < /home/airflow/workspace/e2e-benchmarking/workloads/kube-burner/workloads/managed-services/baseconfig.yml > baseconfig.yml + cat baseconfig.yml + HCP_NAMESPACE="$(_get_cluster_id ${CLUSTER_NAME})-$CLUSTER_NAME" + MC_PROMETHEUS=https://$(oc --kubeconfig=./mgmt_kubeconfig get route -n openshift-monitoring prometheus-k8s -o jsonpath="{.spec.host}") + MC_PROMETHEUS_TOKEN=$(oc --kubeconfig=./mgmt_kubeconfig sa new-token -n openshift-monitoring prometheus-k8s) + Q_NODES="" + for n in $(curl -H "Authorization: Bearer ${MC_PROMETHEUS_TOKEN}" -k --silent --globoff ${MC_PROMETHEUS}/api/v1/query?query='sum(kube_node_role{role!~"master|infra|workload|obo"})by(node)&time='$(date +"%s")'' | jq -r '.data.result[].metric.node'); + do + if [[ ${Q_NODES} == "" ]]; then Q_NODES=${n}; else Q_NODES=${Q_NODES}"|"${n}; fi + done + MGMT_WORKER_NODES=${Q_NODES} + echo "Exporting required vars" + cat << EOF +MC_PROMETHEUS: ${MC_PROMETHEUS} +MC_PROMETHEUS_TOKEN: +HCP_NAMESPACE: ${HCP_NAMESPACE} +MGMT_WORKER_NODES: ${MGMT_WORKER_NODES} +elapsed: "20m:" + +EOF + export MC_PROMETHEUS MC_PROMETHEUS_TOKEN HCP_NAMESPACE MGMT_WORKER_NODES elapsed METADATA=$(cat << EOF { "uuid":"${UUID}", -"platform":"${PLATFORM}", -"sdn_type":"${NETWORK_TYPE}", "timestamp": "$(date +%s%3N)", -"cluster_name": "${HOSTED_CLUSTER_NAME}", -"mgmt_cluster_name": "${MGMT_CLUSTER_NAME}", -"svc_cluster_name": "${SVC_CLUSTER_NAME}" +"hostedClusterName": "${HC_INFRASTRUCTURE_NAME}", +"clusterName": "${HC_INFRASTRUCTURE_NAME}", +"mgmtClusterName": "${MGMT_CLUSTER_NAME}" } EOF ) @@ -595,7 +698,7 @@ EOF curl -k -sS -X POST -H "Content-type: application/json" ${ES_SERVER}/${ES_INDEX}/_doc -d "${METADATA}" -o /dev/null echo "Running kube-burner index.." 
- kube-burner index --uuid=${UUID} --prometheus-url=${PROM_URL} --start=$START_TIME --end=$END_TIME --step 2m --metrics-profile hypershift-metrics.yaml --config baseconfig.yml + kube-burner index --uuid=${UUID} --prometheus-url=${MC_PROMETHEUS} --token ${MC_PROMETHEUS_TOKEN} --start=$START_TIME --end=$END_TIME --step 2m --metrics-profile ${METRIC_PROFILE} --config ./baseconfig.yml --log-level debug echo "Finished indexing results" } @@ -606,6 +709,7 @@ cleanup(){ aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id $AWS_ACCESS_KEY_ID || true else export ROSA_CLUSTER_ID=$(_get_cluster_id ${CLUSTER_NAME}) + export HC_INFRASTRUCTURE_NAME=${ROSA_CLUSTER_ID} CLEANUP_START_TIMING=$(date +%s) export START_TIME=$CLEANUP_START_TIMING rosa delete cluster -c ${ROSA_CLUSTER_ID} -y @@ -619,6 +723,7 @@ cleanup(){ export END_TIME=$(date +"%s") if [ $HCP == "true" ]; then _delete_aws_vpc + if [ -z $OIDC_CONFIG ]; then _oidc_config delete $OIDC_PREFIX; fi fi fi return 0 @@ -639,6 +744,7 @@ if [[ "$operation" == "install" ]]; then echo "pre-clean AWS resources" _delete_aws_vpc install + export HC_INFRASTRUCTURE_NAME=$(_get_cluster_id ${CLUSTER_NAME}) index_mgmt_cluster_stat "install-metrics" else install @@ -659,6 +765,7 @@ if [[ "$operation" == "install" ]]; then elif [[ "$operation" == "cleanup" ]]; then printf "Running Cleanup Steps" + if [ $HCP == "true" ]; then _get_sc_mc_details; fi cleanup index_metadata if [ $HCP == "true" ]; then index_mgmt_cluster_stat "destroy-metrics"; fi diff --git a/dags/openshift_nightlies/scripts/run_benchmark.sh b/dags/openshift_nightlies/scripts/run_benchmark.sh index 705175d7e..50a310778 100755 --- a/dags/openshift_nightlies/scripts/run_benchmark.sh +++ b/dags/openshift_nightlies/scripts/run_benchmark.sh @@ -34,7 +34,7 @@ setup(){ if [[ ! 
-z $MGMT_KUBECONFIG_SECRET ]]; then unset KUBECONFIG # Unsetting Hostedcluster kubeconfig, will fall back to Airflow cluster kubeconfig kubectl get secret $MGMT_KUBECONFIG_SECRET -o json | jq -r '.data.config' | base64 -d > /home/airflow/workspace/mgmt_kubeconfig - export HYPERSHIFT_MANAGEMENT_KUBECONFIG="/home/airflow/workspace/mgmt_kubeconfig" + export MC_KUBECONFIG="/home/airflow/workspace/mgmt_kubeconfig" export KUBECONFIG=/home/airflow/workspace/config fi } diff --git a/dags/openshift_nightlies/tasks/benchmarks/e2e.py b/dags/openshift_nightlies/tasks/benchmarks/e2e.py index c731c5343..757c4af7c 100644 --- a/dags/openshift_nightlies/tasks/benchmarks/e2e.py +++ b/dags/openshift_nightlies/tasks/benchmarks/e2e.py @@ -88,12 +88,9 @@ def __init__(self, dag, config: DagConfig, release: OpenshiftRelease, task_group cluster_name = release._generate_cluster_name() self.env = { **self.env, - "THANOS_RECEIVER_URL": var_loader.get_secret("thanos_receiver_url"), - "PROM_URL": var_loader.get_secret("thanos_querier_url"), "MGMT_CLUSTER_NAME": f"{self.install_vars['staging_mgmt_cluster_name']}.*", "SVC_CLUSTER_NAME": f"{self.install_vars['staging_svc_cluster_name']}.*", - "HOSTED_CLUSTER_NS": f".*-{cluster_name}-{self.task_group}", - "MGMT_KUBECONFIG_SECRET": f"{release.get_release_name()}-kubeconfig", + "MGMT_KUBECONFIG_SECRET": "staging-mgmt-cluster-kubeconfig", **self._insert_kube_env() } diff --git a/dags/openshift_nightlies/tasks/install/rosa/defaults.json b/dags/openshift_nightlies/tasks/install/rosa/defaults.json index de912e8ee..45da90e38 100644 --- a/dags/openshift_nightlies/tasks/install/rosa/defaults.json +++ b/dags/openshift_nightlies/tasks/install/rosa/defaults.json @@ -37,6 +37,8 @@ "ocm_cli_version": "container", "rosa_hcp": "false", "staging_mgmt_cluster_name": "", + "staging_svc_cluster_name": "", "staging_mgmt_provisioner_shards": "", - "aws_region": "us-west-2" + "aws_region": "us-west-2", + "oidc_config": "" } From f984ea73ea84bfb2e8bed91f84a14f4ae2f2ba07 Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy <70236227+mukrishn@users.noreply.github.com> Date: Wed, 12 Jul 2023 14:00:13 -0400 Subject: [PATCH 11/26] Hypershift postinstall to add firewall rules (#338) * dynamic post install tasks * removed k8s config library and other unused code --- dags/openshift_nightlies/dag.py | 4 +-- .../scripts/utils/rosa_post_install.py | 36 +++++++------------ .../tasks/install/rosa/rosa.py | 4 ++- .../tasks/utils/rosa_post_install.py | 8 ++--- 4 files changed, 21 insertions(+), 31 deletions(-) diff --git a/dags/openshift_nightlies/dag.py b/dags/openshift_nightlies/dag.py index d073db172..33033e470 100644 --- a/dags/openshift_nightlies/dag.py +++ b/dags/openshift_nightlies/dag.py @@ -160,9 +160,9 @@ def build(self): hosted_installer = self._get_hypershift_openshift_installer() wait_task = hosted_installer.wait_task() wait_before_cleanup = hosted_installer.wait_task(id="wait_before_cleanup") - for c_id, install_hc, cleanup_hc in install_cluster: + for c_id, install_hc, postinstall_hc, cleanup_hc in install_cluster: benchmark = self._add_benchmarks(task_group=c_id) - install_hc >> wait_task >> benchmark >> wait_before_cleanup >> cleanup_hc + install_hc >> postinstall_hc >> wait_task >> benchmark >> wait_before_cleanup >> cleanup_hc else: install_cluster = installer.get_install_task() final_status = final_dag_status.get_task(self.dag) diff --git a/dags/openshift_nightlies/scripts/utils/rosa_post_install.py b/dags/openshift_nightlies/scripts/utils/rosa_post_install.py index bb44f4b44..5cd1e75ab 100755 
--- a/dags/openshift_nightlies/scripts/utils/rosa_post_install.py +++ b/dags/openshift_nightlies/scripts/utils/rosa_post_install.py @@ -11,8 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from kubernetes import client, config -from openshift.dynamic import DynamicClient import sys import argparse import subprocess @@ -20,7 +18,7 @@ import json # Make aws related config changes such as security group rules etc -def _aws_config(nodes,clustername,jsonfile): +def _aws_config(clustername,jsonfile,kubeconfig): try: json_file = json.load(open(jsonfile)) except Exception as err: @@ -30,7 +28,13 @@ def _aws_config(nodes,clustername,jsonfile): my_env = os.environ.copy() my_env['AWS_ACCESS_KEY_ID'] = json_file['aws_access_key_id'] my_env['AWS_SECRET_ACCESS_KEY'] = json_file['aws_secret_access_key'] - my_env['AWS_DEFAULT_REGION'] = json_file['aws_region_for_openshift'] + my_env['AWS_DEFAULT_REGION'] = json_file['aws_region'] + if "rosa_hcp" in json_file and json_file["rosa_hcp"] == "true": + clustername_check_cmd = ["oc get infrastructures.config.openshift.io cluster -o json --kubeconfig " + kubeconfig + " | jq -r '.status.platformStatus.aws.resourceTags[] | select( .key == \"api.openshift.com/name\" ).value'"] + print(clustername_check_cmd) + process = subprocess.Popen(clustername_check_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, env=my_env) + stdout,stderr = process.communicate() + clustername = stdout.decode("utf-8").replace('\n','').replace(' ','') vpc_cmd = ["aws ec2 describe-instances --query 'Reservations[*].Instances[*].[InstanceId,Tags[?Key==`Name`].Value|[0],State.Name,PrivateIpAddress,PublicIpAddress, PrivateDnsName, VpcId]' --output text | column -t | grep " + clustername + "| awk '{print $7}' | grep -v '^$' | sort -u"] print(vpc_cmd) process = subprocess.Popen(vpc_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, env=my_env) @@ -38,9 +42,7 @@ def _aws_config(nodes,clustername,jsonfile): print("VPC:") print(stdout) print(stderr) - cluster_vpc = stdout.decode("utf-8") - cluster_vpc = cluster_vpc.replace('\n','') - cluster_vpc = cluster_vpc.replace(' ','') + cluster_vpc = stdout.decode("utf-8").replace('\n','').replace(' ','') sec_grp_cmd = ["aws ec2 describe-security-groups --filters \"Name=vpc-id,Values=" + cluster_vpc + "\" --output json | jq .SecurityGroups[].GroupId"] print(sec_grp_cmd) process = subprocess.Popen(sec_grp_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, env=my_env) @@ -48,11 +50,8 @@ def _aws_config(nodes,clustername,jsonfile): print("Security Groups:") print(stdout) print(stderr) - sec_group = stdout.decode("utf-8") + sec_group = stdout.decode("utf-8").replace(' ','').replace('"','').replace('\n',' ') - sec_group = sec_group.replace(' ','') - sec_group = sec_group.replace('"','') - sec_group = sec_group.replace('\n',' ') sec_group_list = list(sec_group.split(" ")) print(sec_group_list) @@ -94,28 +93,17 @@ def main(): help='Optional configuration file including all the dag vars') args = parser.parse_args() - if args.incluster.lower() == "true": - config.load_incluster_config() - k8s_config = client.Configuration() - k8s_client = client.api_client.ApiClient(configuration=k8s_config) - elif args.kubeconfig: - k8s_client = config.new_client_from_config(args.kubeconfig) - else: - k8s_client = config.new_client_from_config() - - dyn_client = DynamicClient(k8s_client) - nodes = dyn_client.resources.get(api_version='v1', kind='Node') - if args.kubeconfig: cmd = 
["oc get infrastructures.config.openshift.io cluster -o jsonpath={.status.infrastructureName} --kubeconfig " + args.kubeconfig] else: cmd = ["oc get infrastructures.config.openshift.io cluster -o jsonpath={.status.infrastructureName}"] + process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) stdout,stderr = process.communicate() clustername = stdout.decode("utf-8") # AWS configuration - _aws_config(nodes,clustername,args.jsonfile) + _aws_config(clustername,args.jsonfile,args.kubeconfig) if __name__ == '__main__': diff --git a/dags/openshift_nightlies/tasks/install/rosa/rosa.py b/dags/openshift_nightlies/tasks/install/rosa/rosa.py index 2e4bfe380..9f277ae60 100644 --- a/dags/openshift_nightlies/tasks/install/rosa/rosa.py +++ b/dags/openshift_nightlies/tasks/install/rosa/rosa.py @@ -4,6 +4,7 @@ from openshift_nightlies.util import var_loader, kubeconfig, constants, executor from openshift_nightlies.tasks.install.openshift import AbstractOpenshiftInstaller +from openshift_nightlies.tasks.utils import rosa_post_install from common.models.dag_config import DagConfig from openshift_nightlies.models.release import OpenshiftRelease @@ -22,6 +23,7 @@ class RosaInstaller(AbstractOpenshiftInstaller): def __init__(self, dag, config: DagConfig, release: OpenshiftRelease): super().__init__(dag, config, release) self.exec_config = executor.get_default_executor_config(self.dag_config, executor_image="airflow-managed-services") + self.rosa_postinstall_setup = rosa_post_install.Diagnosis(dag, config, release) def get_type(self): if self.config['rosa_hcp'] == "true": @@ -32,7 +34,7 @@ def get_type(self): def get_install_hcp_task(self): for iteration in range(self.config['number_of_hostedcluster']): c_id = f"{'hcp-'+str(iteration+1)}" # adding 1 to name the cluster hcp-1, hcp-2.. 
- yield c_id, self._get_task(operation="install", id=c_id), self._get_task(operation="cleanup", id=c_id) + yield c_id, self._get_task(operation="install", id=c_id), self.rosa_postinstall_setup._get_rosa_postinstallation(id=c_id), self._get_task(operation="cleanup", id=c_id) # Create Airflow Task for Install/Cleanup steps def _get_task(self, operation="install", id="", trigger_rule="all_success"): diff --git a/dags/openshift_nightlies/tasks/utils/rosa_post_install.py b/dags/openshift_nightlies/tasks/utils/rosa_post_install.py index 017bd79b4..6c581e99e 100755 --- a/dags/openshift_nightlies/tasks/utils/rosa_post_install.py +++ b/dags/openshift_nightlies/tasks/utils/rosa_post_install.py @@ -41,11 +41,11 @@ def __init__(self, dag, config: DagConfig, release: OpenshiftRelease): super().__init__() - self.exec_config = executor.get_executor_config_with_cluster_access(self.config, self.release, executor_image="airflow-managed-services") - - def _get_rosa_postinstallation(self, operation="postinstall", trigger_rule="all_success"): + def _get_rosa_postinstallation(self, operation="postinstall", id="", trigger_rule="all_success"): + self.exec_config = executor.get_executor_config_with_cluster_access(self.config, self.release, executor_image="airflow-managed-services", task_group=id) + task_prefix=f"{id}-" return BashOperator( - task_id=f"{operation}_rosa", + task_id=f"{task_prefix if id != '' else ''}{operation}-rosa", depends_on_past=False, bash_command=f"python {constants.root_dag_dir}/scripts/utils/rosa_post_install.py --jsonfile /tmp/{self.release_name}-postinstall-task.json --kubeconfig /home/airflow/auth/config", retries=3, From 7b57a4bfff2f5eac582456efc71814a2c3084b90 Mon Sep 17 00:00:00 2001 From: "Joe Talerico (rook)" Date: Thu, 13 Jul 2023 07:25:09 -0400 Subject: [PATCH 12/26] Add report index back in (#339) Mistakenly removed... 
Signed-off-by: Joe Talerico Co-authored-by: Joe Talerico --- dags/openshift_nightlies/scripts/index.sh | 125 ++++++++++++++++++ .../tasks/benchmarks/e2e.py | 14 +- .../tasks/index/__init__.py | 0 .../tasks/index/defaults.json | 1 + .../openshift_nightlies/tasks/index/status.py | 64 +++++++++ .../tasks/install/openshift.py | 8 +- 6 files changed, 205 insertions(+), 7 deletions(-) create mode 100755 dags/openshift_nightlies/scripts/index.sh create mode 100644 dags/openshift_nightlies/tasks/index/__init__.py create mode 100644 dags/openshift_nightlies/tasks/index/defaults.json create mode 100644 dags/openshift_nightlies/tasks/index/status.py diff --git a/dags/openshift_nightlies/scripts/index.sh b/dags/openshift_nightlies/scripts/index.sh new file mode 100755 index 000000000..317da2ca9 --- /dev/null +++ b/dags/openshift_nightlies/scripts/index.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +set -exo pipefail + +export dag_id=${AIRFLOW_CTX_DAG_ID} +export execution_date=${AIRFLOW_CTX_EXECUTION_DATE} +export dag_run_id=${AIRFLOW_CTX_DAG_RUN_ID} +export ci="AIRFLOW" +printenv + +# Get Airflow URL +export airflow_base_url="http://$(kubectl get route/airflow -n airflow -o jsonpath='{.spec.host}')" + +setup(){ + # Generate a uuid + export UUID=${UUID:-$(uuidgen)} + + # Elasticsearch Config + export ES_SERVER=$ES_SERVER + export ES_INDEX=$ES_INDEX + + # Timestamp + timestamp=`date +"%Y-%m-%dT%T.%3N"` + + # Setup Kubeconfig + export KUBECONFIG=/home/airflow/auth/config + curl -sS https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-client-linux.tar.gz | tar xz oc + export PATH=$PATH:/home/airflow/.local/bin:$(pwd) + # Get OpenShift cluster details + cluster_name=$(oc get infrastructure cluster -o jsonpath='{.status.infrastructureName}') || echo "Cluster Install Failed" + cluster_version=$(oc version -o json | jq -r '.openshiftVersion') || echo "Cluster Install Failed" + network_type=$(oc get network.config/cluster -o jsonpath='{.status.networkType}') || echo "Cluster Install Failed" + platform=$(oc get infrastructure cluster -o jsonpath='{.status.platformStatus.type}') || echo "Cluster Install Failed" + masters=$(oc get nodes -l node-role.kubernetes.io/master --no-headers=true | wc -l) || true + workers=$(oc get nodes -l node-role.kubernetes.io/worker --no-headers=true | wc -l) || true + infra=$(oc get nodes -l node-role.kubernetes.io/infra --no-headers=true | wc -l) || true + worker_type=$(oc get nodes -l node-role.kubernetes.io/worker -o jsonpath='{.items[].metadata.labels.beta\.kubernetes\.io/instance-type}') || true + infra_type=$(oc get nodes -l node-role.kubernetes.io/infra -o jsonpath='{.items[].metadata.labels.beta\.kubernetes\.io/instance-type}') || true + master_type=$(oc get nodes -l node-role.kubernetes.io/master -o jsonpath='{.items[].metadata.labels.beta\.kubernetes\.io/instance-type}') || true + all=$(oc get nodes --no-headers=true | wc -l) || true + # ReleaseStream is piped in via environment variables + release_stream=${RELEASE_STREAM} +} + +index_task(){ + task_json=$1 + + state=$(echo $task_json | jq -r '.state') + task_id=$(echo $task_json | jq -r '.task_id') + + if [[ $task_id == "$AIRFLOW_CTX_TASK_ID" || $task_id == "cleanup" ]]; then + echo "Index Task doesn't index itself or cleanup step, skipping." 
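The else branch that follows converts Airflow's ISO-8601 start/end timestamps into a duration in seconds. Condensed to its essentials as a minimal sketch (sample timestamps; GNU date assumed), separate from the script itself:

start_date="2023-07-13T07:25:09+00:00"   # sample value, not real task output
end_date="2023-07-13T07:40:09+00:00"
[[ -z ${start_date} ]] && start_date=${end_date}   # a missing start falls back to end (duration 0)
duration=$(( $(date -d "${end_date}" +%s) - $(date -d "${start_date}" +%s) ))
echo "${duration}"                        # 900 seconds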
+ else + start_date=$(echo $task_json | jq -r '.start_date') + end_date=$(echo $task_json | jq -r '.end_date') + + if [[ -z $start_date ]]; then + start_date=$end_date + fi + + if [[ -z $start_date || -z $end_date ]]; then + duration=0 + else + end_ts=$(date -d $end_date +%s) + start_ts=$(date -d $start_date +%s) + duration=$(( $end_ts - $start_ts )) + fi + + encoded_execution_date=$(python3 -c "import urllib.parse; print(urllib.parse.quote(input()))" <<< "$execution_date") + build_url="${airflow_base_url}/task?dag_id=${dag_id}&task_id=${task_id}&execution_date=${encoded_execution_date}" + + curl --insecure -X POST -H "Content-Type: application/json" -H "Cache-Control: no-cache" -d '{ + "ci_system" : "'$ci'", + "uuid" : "'$UUID'", + "release_stream": "'$RELEASE_STREAM'", + "platform": "'$platform'", + "master_count": '$masters', + "worker_count": '$workers', + "infra_count": '$infra', + "master_type": "'$master_type'", + "worker_type": "'$worker_type'", + "infra_type": "'$infra_type'", + "total_count": '$all', + "cluster_name": "'$cluster_name'", + "cluster_version": "'$cluster_version'", + "network_type": "'$network_type'", + "build_tag": "'$task_id'", + "node_name": "'$HOSTNAME'", + "job_status": "'$state'", + "build_url": "'$build_url'", + "upstream_job": "'$dag_id'", + "upstream_job_build": "'$dag_run_id'", + "execution_date": "'$execution_date'", + "job_duration": "'$duration'", + "start_date": "'$start_date'", + "end_date": "'$end_date'", + "timestamp": "'$start_date'" + }' $ES_SERVER/$ES_INDEX/_doc/$dag_id%2F$dag_run_id%2F$task_id + + fi + +} + + +index_tasks(){ + + task_states=$(AIRFLOW__LOGGING__LOGGING_LEVEL=ERROR airflow tasks states-for-dag-run $dag_id $execution_date -o json) + task_json=$( echo $task_states | jq -c ".[] | select( .task_id == \"$TASK\")") + index_task $task_json + +} + +# Defaults +if [[ -z $ES_SERVER ]]; then + echo "Elastic server is not defined, please check" + help + exit 1 +fi + +if [[ -z $ES_INDEX ]]; then + export ES_INDEX=perf_scale_ci +fi + +setup +index_tasks diff --git a/dags/openshift_nightlies/tasks/benchmarks/e2e.py b/dags/openshift_nightlies/tasks/benchmarks/e2e.py index 757c4af7c..b3b86e3c0 100644 --- a/dags/openshift_nightlies/tasks/benchmarks/e2e.py +++ b/dags/openshift_nightlies/tasks/benchmarks/e2e.py @@ -1,6 +1,7 @@ from os import environ from openshift_nightlies.util import var_loader, executor, constants +from openshift_nightlies.tasks.index.status import StatusIndexer from openshift_nightlies.models.release import OpenshiftRelease from common.models.dag_config import DagConfig @@ -65,7 +66,7 @@ def __init__(self, dag, config: DagConfig, release: OpenshiftRelease, task_group "ORCHESTRATION_USER": self.config['provisioner_user'], "ORCHESTRATION_HOST": self.config['provisioner_hostname'] } - + if self.release.platform == "rosa": self.rosa_creds = var_loader.get_secret("rosa_creds", deserialize_json=True) self.aws_creds = var_loader.get_secret("aws_creds", deserialize_json=True) @@ -83,9 +84,9 @@ def __init__(self, dag, config: DagConfig, release: OpenshiftRelease, task_group "OCM_TOKEN": self.ocm_creds['ocm_token'] } self.install_vars = var_loader.build_task_vars( - release, task="install") + release, task="install") if self.install_vars['rosa_hcp'] == "true": - cluster_name = release._generate_cluster_name() + cluster_name = release._generate_cluster_name() self.env = { **self.env, "MGMT_CLUSTER_NAME": f"{self.install_vars['staging_mgmt_cluster_name']}.*", @@ -108,7 +109,7 @@ def __init__(self, dag, config: DagConfig, release: 
OpenshiftRelease, task_group def get_benchmarks(self): benchmarks = self._get_benchmarks(self.vars["benchmarks"]) - return benchmarks + return benchmarks def _git_name(self): git_username = var_loader.get_git_user() @@ -128,6 +129,9 @@ def _get_benchmarks(self, benchmarks): benchmarks[index] = self._get_benchmarks(benchmark['benchmarks']) return benchmarks + def _add_indexer(self, benchmark): + indexer = StatusIndexer(self.dag, self.dag_config, self.release, benchmark.task_id, task_group=self.task_group).get_index_task() + benchmark >> indexer def _get_benchmark(self, benchmark): env = {**self.env, **benchmark.get('env', {}), **{"ES_SERVER": var_loader.get_secret('elasticsearch'), "KUBEADMIN_PASSWORD": environ.get("KUBEADMIN_PASSWORD", "")}} @@ -156,7 +160,7 @@ def _get_benchmark(self, benchmark): execution_timeout=timedelta(seconds=21600), executor_config=self.exec_config ) - + self._add_indexer(task) return task # This Helper Injects Airflow environment variables into the task execution runtime diff --git a/dags/openshift_nightlies/tasks/index/__init__.py b/dags/openshift_nightlies/tasks/index/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dags/openshift_nightlies/tasks/index/defaults.json b/dags/openshift_nightlies/tasks/index/defaults.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/dags/openshift_nightlies/tasks/index/defaults.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/dags/openshift_nightlies/tasks/index/status.py b/dags/openshift_nightlies/tasks/index/status.py new file mode 100644 index 000000000..2bf9229df --- /dev/null +++ b/dags/openshift_nightlies/tasks/index/status.py @@ -0,0 +1,64 @@ +from os import environ + + +from openshift_nightlies.util import var_loader, executor, constants +from openshift_nightlies.models.release import OpenshiftRelease +from common.models.dag_config import DagConfig + +import json +import requests + +from airflow.operators.bash import BashOperator +from airflow.models import Variable +from kubernetes.client import models as k8s + +# Defines Task for Indexing Task Status in ElasticSearch +class StatusIndexer(): + def __init__(self, dag, config: DagConfig, release: OpenshiftRelease, task, task_group=""): + + # General DAG Configuration + self.dag = dag + self.release = release + self.config = config + self.task_group = task_group + self.exec_config = executor.get_executor_config_with_cluster_access(self.config, self.release, task_group=self.task_group) + + # Specific Task Configuration + self.vars = var_loader.build_task_vars(release, task="index") + + # Upstream task this is to index + self.task = task + self.env = { + "RELEASE_STREAM": self.release.release_stream, + "TASK": self.task + } + + self.git_user = var_loader.get_git_user() + if self.git_user == 'cloud-bulldozer': + self.env["ES_INDEX"] = "perf_scale_ci" + else: + self.env["ES_INDEX"] = f"{self.git_user}_playground" + + + # Create Airflow Task for Indexing Results into ElasticSearch + def get_index_task(self): + env = { + **self.env, + **{"ES_SERVER": var_loader.get_secret('elasticsearch')}, + **environ + } + if self.task != "install": + command = f'UUID={{{{ ti.xcom_pull("{self.task}") }}}} {constants.root_dag_dir}/scripts/index.sh ' + else: + command = f'{constants.root_dag_dir}/scripts/index.sh ' + + return BashOperator( + task_id=f"index-{self.task}", + depends_on_past=False, + bash_command=command, + retries=3, + dag=self.dag, + trigger_rule="all_done", + executor_config=self.exec_config, + env=env + ) \ No newline at 
end of file diff --git a/dags/openshift_nightlies/tasks/install/openshift.py b/dags/openshift_nightlies/tasks/install/openshift.py index d582a4e24..30dc13d71 100644 --- a/dags/openshift_nightlies/tasks/install/openshift.py +++ b/dags/openshift_nightlies/tasks/install/openshift.py @@ -1,6 +1,7 @@ import abc from openshift_nightlies.util import var_loader, executor, constants from openshift_nightlies.models.release import OpenshiftRelease +from openshift_nightlies.tasks.index.status import StatusIndexer from common.models.dag_config import DagConfig from os import environ import json @@ -44,7 +45,7 @@ def __init__(self, dag, config: DagConfig, release: OpenshiftRelease): self.exec_config = executor.get_default_executor_config(self.dag_config) # Merge all variables, prioritizing Airflow Secrets over git based vars - self.config = { + self.config = { **self.vars, **self.ansible_orchestrator, **self.install_secrets, @@ -69,7 +70,10 @@ def _get_task(self, operation="install", trigger_rule="all_success"): raise NotImplementedError() def get_install_task(self): - return self._get_task(operation="install") + indexer = StatusIndexer(self.dag, self.dag_config, self.release, "install").get_index_task() + install_task = self._get_task(operation="install") + install_task >> indexer + return install_task def get_cleanup_task(self): # trigger_rule = "all_done" means this task will run when every other task has finished, whether it fails or succeededs From 92390a5a6f51b15750b2879b798af875675c2993 Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy <70236227+mukrishn@users.noreply.github.com> Date: Thu, 13 Jul 2023 12:11:53 -0400 Subject: [PATCH 13/26] add new machinepool and balance infra component (#337) * add new machinepool and balance infra component migrate prometheus-k8s pods to new infra nodepools migrate router-ingress pods to new infra nodepools validate after migration * setting kubeconfig back to sailplane cluster * hcp data-plane-v2 dag * new install config for dataplane * 2 infra nodes only for hosted-cp * move kubeadmin secret creation around * updated infra labels and taints * reattempting migration incase of failure --- .../install/rosa/rosa-hcp-ovn-data-plane.json | 28 +++++ .../config/install/rosa/rosa-hcp-ovn.json | 9 +- dags/openshift_nightlies/manifest.yaml | 5 + .../scripts/install/rosa.sh | 101 ++++++++++++++++-- .../tasks/install/rosa/defaults.json | 3 +- 5 files changed, 135 insertions(+), 11 deletions(-) create mode 100644 dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn-data-plane.json diff --git a/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn-data-plane.json b/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn-data-plane.json new file mode 100644 index 000000000..380b11b35 --- /dev/null +++ b/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn-data-plane.json @@ -0,0 +1,28 @@ +{ + "rosa_hcp": "true", + "aws_profile": "", + "aws_access_key_id": "", + "aws_secret_access_key": "", + "aws_authentication_method": "sts", + "aws_region": "us-east-2", + "rosa_environment": "staging", + "rosa_cli_version": "master", + "ocm_cli_version": "v0.1.67", + "ocm_environment": "stage", + "managed_channel_group": "nightly", + "managed_ocp_version": "latest", + "openshift_worker_count": 9, + "openshift_network_type": "OVNKubernetes", + "openshift_worker_instance_type": "m5.2xlarge", + "machineset_metadata_label_prefix": "machine.openshift.io", + "staging_mgmt_provisioner_shards": "b4bb294b-a76c-11ed-91b2-0a580a831ba1", + "number_of_hostedcluster": 2, + 
"hcp_install_interval": 60, + "extra_machinepool": [{ + "name": "infra", + "replica": "1", + "instance_type": "c5.4xlarge", + "labels": "node-role.kubernetes.io/infra=", + "taints": "node-role.kubernetes.io/infra=:NoSchedule" + }] +} diff --git a/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json b/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json index c144f6820..9d07be452 100644 --- a/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json +++ b/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json @@ -17,5 +17,12 @@ "machineset_metadata_label_prefix": "machine.openshift.io", "staging_mgmt_provisioner_shards": "b4bb294b-a76c-11ed-91b2-0a580a831ba1", "number_of_hostedcluster": 2, - "hcp_install_interval": 60 + "hcp_install_interval": 60, + "extra_machinepool": [{ + "name": "infra", + "replica": "1", + "instance_type": "r5.xlarge", + "labels": "node-role.kubernetes.io/infra=", + "taints": "node-role.kubernetes.io/infra=:NoSchedule" + }] } diff --git a/dags/openshift_nightlies/manifest.yaml b/dags/openshift_nightlies/manifest.yaml index 0f836d8ce..e96123e2b 100644 --- a/dags/openshift_nightlies/manifest.yaml +++ b/dags/openshift_nightlies/manifest.yaml @@ -142,6 +142,11 @@ platforms: config: install: rosa/rosa-hcp-ovn.json benchmarks: hosted-control-plane-p75.json + - name: rosa-hcp-data-plane + schedule: "1 12 * * 3" + config: + install: rosa/rosa-hcp-ovn-data-plane.json + benchmarks: data-plane-v2.json rogcp: versions: ["4.12", "4.13"] diff --git a/dags/openshift_nightlies/scripts/install/rosa.sh b/dags/openshift_nightlies/scripts/install/rosa.sh index 01a2f3317..8e53dea91 100755 --- a/dags/openshift_nightlies/scripts/install/rosa.sh +++ b/dags/openshift_nightlies/scripts/install/rosa.sh @@ -43,14 +43,14 @@ _wait_for_nodes_ready(){ NODES_COUNT=$2 ALL_READY_ITERATIONS=4 #reduced extra buffers for hosted cp clusters else - # Node count is number of workers + 3 masters + 3 infra - NODES_COUNT=$(($2+6)) + # Node count is number of workers + 3 infra + NODES_COUNT=$(($2+3)) fi # 30 seconds per node, waiting for all nodes ready to finalize while [ ${ITERATIONS} -le $((${NODES_COUNT}*5)) ] ; do - NODES_READY_COUNT=$(oc get nodes | grep " Ready " | wc -l) + NODES_READY_COUNT=$(oc get nodes -l $3 | grep " Ready " | wc -l) if [ ${NODES_READY_COUNT} -ne ${NODES_COUNT} ] ; then - echo "WARNING: ${ITERATIONS}/${NODES_COUNT} iterations. ${NODES_READY_COUNT}/${NODES_COUNT} nodes ready. Waiting 30 seconds for next check" + echo "WARNING: ${ITERATIONS}/${NODES_COUNT} iterations. ${NODES_READY_COUNT}/${NODES_COUNT} $3 nodes ready. Waiting 30 seconds for next check" # ALL_READY_ITERATIONS=0 ITERATIONS=$((${ITERATIONS}+1)) sleep 30 @@ -66,7 +66,7 @@ _wait_for_nodes_ready(){ fi done END_CLUSTER_STATUS="Ready. No Workers" - echo "ERROR: Not all nodes (${NODES_READY_COUNT}/${NODES_COUNT}) are ready after about $((${NODES_COUNT}*3)) minutes, dumping oc get nodes..." + echo "ERROR: Not all $3 nodes (${NODES_READY_COUNT}/${NODES_COUNT}) are ready after about $((${NODES_COUNT}*3)) minutes, dumping oc get nodes..." 
oc get nodes exit 1 } @@ -137,6 +137,86 @@ _adm_logic_check(){ echo "Failed to execute oc adm commands after 100 attempts with 5 sec interval" } +_balance_infra(){ + if [[ $1 == "prometheus-k8s" ]] ; then + echo "Initiate migration of prometheus componenets to infra nodepools" + oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s + oc get sts prometheus-k8s -n openshift-monitoring + echo "Restart stateful set pods" + oc rollout restart -n openshift-monitoring statefulset/prometheus-k8s + echo "Wait till they are completely restarted" + oc rollout status -n openshift-monitoring statefulset/prometheus-k8s + echo "Check pods status again and the hosting nodes" + oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s + else + echo "Initiate migration of ingress router-default pods to infra nodepools" + echo "Add toleration to use infra nodes" + oc patch ingresscontroller -n openshift-ingress-operator default --type merge --patch '{"spec":{"nodePlacement":{"nodeSelector":{"matchLabels":{"node-role.kubernetes.io/infra":""}},"tolerations":[{"effect":"NoSchedule","key":"node-role.kubernetes.io/infra","operator":"Exists"}]}}}' + echo "Wait till it gets rolled out" + sleep 60 + oc get pods -n openshift-ingress -o wide + fi +} + +_check_infra(){ + TRY=0 + while [ $TRY -le 3 ]; do # Attempts three times to migrate pods + FLAG_ERROR="" + _balance_infra $1 + for node in $(oc get pods -n $2 -o wide | grep -i $1 | grep -i running | awk '{print$7}'); + do + if [[ $(oc get nodes | grep infra | awk '{print$1}' | grep $node) != "" ]]; then + echo "$node is an infra node" + else + echo "$1 pod on $node is not an infra node, retrying" + FLAG_ERROR=true + fi + done + if [[ $FLAG_ERROR == "" ]]; then return 0; else TRY=$((TRY+1)); fi + done + echo "Failed to move $1 pods in $2 namespace" + exit 1 +} + +_wait_for_extra_nodes_ready(){ + export NODE_LABLES=$(cat ${json_file} | jq -r .extra_machinepool[].labels) + for label in $NODE_LABLES; + do + REPLICA=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.labels == '\"$label\"')'.replica) + NODES_COUNT=$((REPLICA*3)) + if [[ $label == *"infra"* ]] ; then NODES_COUNT=$((REPLICA*2)); fi + _wait_for_nodes_ready $CLUSTER_NAME $NODES_COUNT $label + if [[ $label == *"infra"* ]] ; then + _check_infra prometheus-k8s openshift-monitoring + _check_infra router openshift-ingress + fi + done + return 0 +} + +_add_machinepool(){ + export MACHINEPOOLS=$(cat ${json_file} | jq -r .extra_machinepool[].name) + for mcp in $MACHINEPOOLS; + do + echo "Add an extra machinepool - $mcp to cluster" + ZONES="a b c" + MC_NAME=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.name) + REPLICA=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.replica) + INS_TYPE=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.instance_type) + LABELS=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.labels) + TAINTS=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.taints) + if [[ $MC_NAME == *"infra"* ]]; then ZONES="a b"; fi + for ZONE in $ZONES; + do + if [[ $(rosa list machinepool --cluster "$(_get_cluster_id ${CLUSTER_NAME})" | grep $MC_NAME-$ZONE) == "" ]]; then + rosa create machinepool --cluster "$(_get_cluster_id ${CLUSTER_NAME})" --name $MC_NAME-$ZONE --instance-type ${INS_TYPE} --replicas $REPLICA --availability-zone $AWS_REGION$ZONE --labels $LABELS --taints $TAINTS + fi + done + done 
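To make the machinepool expansion concrete: with the infra entry added to the p75 install config earlier in this patch (1 replica, r5.xlarge, infra limited to zones a and b), the loop above reduces to two CLI calls of this shape. A hypothetical expansion, with CLUSTER_ID standing in for $(_get_cluster_id ${CLUSTER_NAME}); the real function also skips pools that already exist:

for ZONE in a b; do
  rosa create machinepool --cluster "${CLUSTER_ID}" \
    --name "infra-${ZONE}" \
    --instance-type r5.xlarge \
    --replicas 1 \
    --availability-zone "${AWS_REGION}${ZONE}" \
    --labels node-role.kubernetes.io/infra= \
    --taints node-role.kubernetes.io/infra=:NoSchedule
done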
+ _wait_for_extra_nodes_ready + return 0 +} + _wait_for_cluster_ready(){ START_TIMER=$(date +%s) echo "INFO: Installation starts at $(date -d @${START_TIMER})" @@ -169,11 +249,12 @@ _wait_for_cluster_ready(){ echo "Set end time of prom scrape" export END_TIME=$(date +"%s") START_TIMER=$(date +%s) - _wait_for_nodes_ready $1 ${COMPUTE_WORKERS_NUMBER} + _wait_for_nodes_ready $1 ${COMPUTE_WORKERS_NUMBER} "node-role.kubernetes.io/worker" CURRENT_TIMER=$(date +%s) # Time since rosa cluster is ready until all nodes are ready DURATION=$(($CURRENT_TIMER - $START_TIMER)) INDEXDATA+=("day2operations-${DURATION}") + if [ $HCP == "true" ]; then _add_machinepool $URL $PASSWORD; fi if [[ $INSTALL_METHOD == "osd" ]]; then echo "INFO: Cluster and nodes on ready status.." else @@ -478,12 +559,12 @@ install(){ fi rosa create cluster --tags=User:${GITHUB_USERNAME} --cluster-name ${CLUSTER_NAME} --version "${ROSA_VERSION}" --channel-group=${MANAGED_CHANNEL_GROUP} --compute-machine-type ${COMPUTE_WORKERS_TYPE} --replicas ${COMPUTE_WORKERS_NUMBER} --network-type ${NETWORK_TYPE} ${INSTALLATION_PARAMS} ${ROSA_HCP_PARAMS} fi - _wait_for_cluster_ready ${CLUSTER_NAME} postinstall return 0 } postinstall(){ + _wait_for_cluster_ready ${CLUSTER_NAME} # sleeping to address issue #324 sleep 120 export EXPIRATION_TIME=$(cat ${json_file} | jq -r .rosa_expiration_time) @@ -508,6 +589,8 @@ postinstall(){ ocm patch /api/clusters_mgmt/v1/clusters/"$(_get_cluster_id ${CLUSTER_NAME})" <<< ${EXPIRATION_STRING} echo "Cluster is ready, deleting OSD access keys now.." aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id $AWS_ACCESS_KEY_ID || true + kubectl delete secret ${KUBEADMIN_NAME} || true + kubectl create secret generic ${KUBEADMIN_NAME} --from-literal=KUBEADMIN_PASSWORD=${PASSWORD} else URL=$(rosa describe cluster -c $CLUSTER_NAME --output json | jq -r ".api.url") START_TIMER=$(date +%s) @@ -515,12 +598,12 @@ postinstall(){ CURRENT_TIMER=$(date +%s) DURATION=$(($CURRENT_TIMER - $START_TIMER)) INDEXDATA+=("cluster_admin_create-${DURATION}") + kubectl delete secret ${KUBEADMIN_NAME} || true + kubectl create secret generic ${KUBEADMIN_NAME} --from-literal=KUBEADMIN_PASSWORD=${PASSWORD} if [ $HCP == "true" ]; then _login_check $URL $PASSWORD; fi # set expiration to 24h rosa edit cluster -c "$(_get_cluster_id ${CLUSTER_NAME})" --expiration=${EXPIRATION_TIME}m fi - kubectl delete secret ${KUBEADMIN_NAME} || true - kubectl create secret generic ${KUBEADMIN_NAME} --from-literal=KUBEADMIN_PASSWORD=${PASSWORD} if [ $HCP == "true" ]; then index_metadata "cluster-install"; fi return 0 } diff --git a/dags/openshift_nightlies/tasks/install/rosa/defaults.json b/dags/openshift_nightlies/tasks/install/rosa/defaults.json index 45da90e38..c3ee1decc 100644 --- a/dags/openshift_nightlies/tasks/install/rosa/defaults.json +++ b/dags/openshift_nightlies/tasks/install/rosa/defaults.json @@ -40,5 +40,6 @@ "staging_svc_cluster_name": "", "staging_mgmt_provisioner_shards": "", "aws_region": "us-west-2", - "oidc_config": "" + "oidc_config": "", + "extra_machinepool": [] } From d563189b519a899233fefe4e974d4be624c642d6 Mon Sep 17 00:00:00 2001 From: "Joe Talerico (rook)" Date: Thu, 13 Jul 2023 16:11:11 -0400 Subject: [PATCH 14/26] Updating Manifest file (#340) Removing OSP and commenting out some things that we do not run in the main airflow. 
Signed-off-by: Joe Talerico Co-authored-by: Joe Talerico --- dags/openshift_nightlies/manifest.yaml | 56 +++++++------------ .../openshift_nightlies/util/test_manifest.py | 14 +---- 2 files changed, 23 insertions(+), 47 deletions(-) diff --git a/dags/openshift_nightlies/manifest.yaml b/dags/openshift_nightlies/manifest.yaml index e96123e2b..0fda36bc6 100644 --- a/dags/openshift_nightlies/manifest.yaml +++ b/dags/openshift_nightlies/manifest.yaml @@ -1,8 +1,4 @@ versions: - - version: "4.11" - alias: old - releaseStream: 4.11.0-0.nightly - baremetalReleaseStream: latest-4.11 - version: "4.12" alias: stable releaseStream: 4.12.0-0.nightly @@ -34,8 +30,8 @@ dagConfig: platforms: cloud: versions: ["4.12", "4.13", "4.14"] - providers: ["aws", "aws-arm", "azure","gcp"] - # providers: ["aws", "gcp", "azure", "alibaba"] + providers: ["aws"] + #providers: ["aws", "aws-arm", "azure","gcp"] variants: - name: ovn-small-cp schedule: "0 12 * * 3" @@ -57,36 +53,26 @@ platforms: config: install: ovn-dp-v2.json benchmarks: data-plane-v2.json - - name: ovn-data-plane - config: - install: ovn-dp.json - benchmarks: data-plane.json - - name: acs - config: - install: acs.json - benchmarks: acs.json - baremetal: - build: dev - versions: ["4.11", "4.12", "4.13"] - variants: - - name: jetski - config: - install: baremetal/jetski.json - benchmarks: baremetal-benchmarks - openstack: - versions: ["4.11", "4.12", "4.13"] - variants: - - name: ovnk-control-plane - schedule: "0 0 * * 3" - config: - install: openstack/ovnk.json - benchmarks: osp-large-control-plane.json - - name: ovnk-data-plane - schedule: "0 0 * * 4" - config: - install: openstack/ovnk.json - benchmarks: osp-data-plane.json + # Removing old dp tests and ACS as we don't use them in the main dag + # - name: ovn-data-plane + # config: + # install: ovn-dp.json + # benchmarks: data-plane.json + # - name: acs + # config: + # install: acs.json + # benchmarks: acs.json + + #Disabling Baremetal as we don't use it in the main dag. 
+ # baremetal: + # build: dev + # versions: ["4.11", "4.12", "4.13"] + # variants: + # - name: jetski + # config: + # install: baremetal/jetski.json + # benchmarks: baremetal-benchmarks # Do not program concurrent builds of ROSA/ROGCP/ARO rosa: diff --git a/dags/tests/openshift_nightlies/util/test_manifest.py b/dags/tests/openshift_nightlies/util/test_manifest.py index 350ec1bb5..61445e3ab 100644 --- a/dags/tests/openshift_nightlies/util/test_manifest.py +++ b/dags/tests/openshift_nightlies/util/test_manifest.py @@ -26,7 +26,7 @@ def mocked_releases(self,mocked_manifest): return mocked_manifest.get_releases() def test_manifest_patched_secret_populates(self,mocked_manifest): - assert len( mocked_manifest.latest_releases ) == 8 + assert len( mocked_manifest.latest_releases ) == 6 def assert_amd_installer(self,stream): assert "arm64" not in stream @@ -67,17 +67,7 @@ def test_cloudreleases_amd(self,mocked_releases): self.assert_amd_installer(release["release"].get_latest_release()[self.INSTALL_BINARY]) self.assert_amd_client(release["release"].get_latest_release()[self.CLIENT_BINARY]) hits += 1 - assert hits == 6 - - def test_cloudreleases_arm(self,mocked_releases): - releases = mocked_releases - hits = 0 - for release in releases: - if "4.12-aws-arm" in release["release"].get_release_name(): - self.assert_arm_installer(release["release"].get_latest_release()[self.INSTALL_BINARY]) - self.assert_arm_client(release["release"].get_latest_release()[self.CLIENT_BINARY]) - hits += 1 - assert hits == 6 + assert hits == 4 def test_endwith(self): assert "aws-arm".endswith("arm") From 81bf4ccf8578433c9a0ed30e2c714ea56dd27cd8 Mon Sep 17 00:00:00 2001 From: Krishna Harsha Voora <14876995+krishvoor@users.noreply.github.com> Date: Fri, 14 Jul 2023 13:14:09 +0530 Subject: [PATCH 15/26] [data-plane-v2]: Rename default configuration file (#341) Signed-off-by: Krishna Harsha Voora --- dags/openshift_nightlies/config/benchmarks/data-plane-v2.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/openshift_nightlies/config/benchmarks/data-plane-v2.json b/dags/openshift_nightlies/config/benchmarks/data-plane-v2.json index f1a5fd8cd..fa796d4d7 100644 --- a/dags/openshift_nightlies/config/benchmarks/data-plane-v2.json +++ b/dags/openshift_nightlies/config/benchmarks/data-plane-v2.json @@ -21,7 +21,7 @@ "command": "./run.sh", "env": { "BASELINE_UUID": "rosa-4.12-9w-2r-c5.4xlarge", - "CONFIG": "config/aws-standard.yml", + "CONFIG": "config/standard.yml", "ES_INDEX": "ingress-performance", "BASELINE_INDEX": "ingress-performance-baseline", "TOLERANCY": "20" From 426a7481df1269f9a997ad48f7d99dc8ee4bc472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Sevilla?= Date: Tue, 18 Jul 2023 22:18:13 +0200 Subject: [PATCH 16/26] kube-burner v1.7.3 (#342) Signed-off-by: Raul Sevilla --- images/airflow/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/airflow/Dockerfile b/images/airflow/Dockerfile index c15ed949a..3e5c60d5e 100644 --- a/images/airflow/Dockerfile +++ b/images/airflow/Dockerfile @@ -14,6 +14,6 @@ RUN curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | b ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 -RUN curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.7.2/kube-burner-V1.7.2-linux-x86_64.tar.gz | tar xz -C /usr/bin kube-burner +RUN curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.7.3/kube-burner-V1.7.3-linux-x86_64.tar.gz | tar xz -C /usr/bin kube-burner RUN curl -L 
https://github.com/cloud-bulldozer/k8s-netperf/releases/download/v0.1.11/k8s-netperf_Linux_v0.1.11_x86_64.tar.gz | tar xz -C /usr/bin k8s-netperf USER airflow From 9ec40467968d6518eb5eab3c9ec25f5b02221265 Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy <70236227+mukrishn@users.noreply.github.com> Date: Tue, 1 Aug 2023 15:03:11 -0400 Subject: [PATCH 17/26] code clean up: rosa classic scripts (#345) * added rosahcp script * trimmed down extra char * updated workload HCP light-weight c-d-v2 iteration will be 7 iteration/node * removed extra env vars * increasing node ready timeout * mgmt cluster env var * scrapping metric for longer duration * updated query logic to index MC stat --- .../benchmarks/hcp-small-control-plane.json | 37 + .../benchmarks/hosted-control-plane-p75.json | 6 +- .../benchmarks/hosted-control-plane-p90.json | 21 +- .../data-plane.json} | 1 - .../p75-control-plane.json} | 3 +- .../install/rosa-hcp/small-control-plane.json | 27 + .../config/install/rosa/ovn-osd.json | 16 - dags/openshift_nightlies/dag.py | 61 +- dags/openshift_nightlies/manifest.yaml | 54 +- dags/openshift_nightlies/models/release.py | 2 +- .../scripts/install/rosa-hcp.sh | 803 ++++++++++++++++++ .../scripts/install/rosa.sh | 698 ++------------- .../tasks/benchmarks/e2e.py | 34 +- .../tasks/install/rosa/defaults.json | 7 +- .../tasks/install/rosa/rosa.py | 13 +- .../tasks/install/rosahcp/__init__.py | 0 .../tasks/install/rosahcp/defaults.json | 45 + .../tasks/install/rosahcp/rosahcp.py | 63 ++ dags/openshift_nightlies/util/manifest.py | 29 + dags/openshift_nightlies/util/var_loader.py | 2 +- 20 files changed, 1160 insertions(+), 762 deletions(-) create mode 100644 dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json rename dags/openshift_nightlies/config/install/{rosa/rosa-hcp-ovn-data-plane.json => rosa-hcp/data-plane.json} (97%) rename dags/openshift_nightlies/config/install/{rosa/rosa-hcp-ovn.json => rosa-hcp/p75-control-plane.json} (94%) create mode 100644 dags/openshift_nightlies/config/install/rosa-hcp/small-control-plane.json delete mode 100644 dags/openshift_nightlies/config/install/rosa/ovn-osd.json create mode 100755 dags/openshift_nightlies/scripts/install/rosa-hcp.sh create mode 100644 dags/openshift_nightlies/tasks/install/rosahcp/__init__.py create mode 100644 dags/openshift_nightlies/tasks/install/rosahcp/defaults.json create mode 100644 dags/openshift_nightlies/tasks/install/rosahcp/rosahcp.py diff --git a/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json b/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json new file mode 100644 index 000000000..d6eaa6bb7 --- /dev/null +++ b/dags/openshift_nightlies/config/benchmarks/hcp-small-control-plane.json @@ -0,0 +1,37 @@ +{ + "benchmarks": [ + { + "name": "node-density", + "workload": "kube-burner-ocp-wrapper", + "trigger_rule": "all_done", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density", + "LOG_LEVEL": "debug" + } + }, + { + "name": "node-desnity-cni", + "workload": "kube-burner-ocp-wrapper", + "trigger_rule": "all_done", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density-cni", + "LOG_LEVEL": "debug" + } + }, + { + "name": "cluster-density-v2", + "workload": "kube-burner-ocp-wrapper", + "trigger_rule": "all_done", + "command": "./run.sh", + "env": { + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "500", + "LOG_LEVEL": "debug", + "CHURN": "true", + "EXTRA_FLAGS": "--churn-duration=1h --churn-percent=10 --churn-delay=30s" + } + } + ] +} diff --git 
a/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p75.json b/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p75.json index d64034267..47e7ea0dc 100644 --- a/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p75.json +++ b/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p75.json @@ -1,13 +1,13 @@ { "benchmarks": [ { - "name": "cluster-density-ms-p75", + "name": "cluster-density-v2-p75", "workload": "kube-burner-ocp-wrapper", "trigger_rule": "all_done", "command": "./run.sh", "env": { - "WORKLOAD": "cluster-density-ms", - "ITERATIONS": "75", + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "63", "LOG_LEVEL": "debug", "EXTRA_FLAGS": "--churn-duration=1h --churn-percent=10 --churn-delay=30s" } diff --git a/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p90.json b/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p90.json index 769a6699e..af6671cb3 100644 --- a/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p90.json +++ b/dags/openshift_nightlies/config/benchmarks/hosted-control-plane-p90.json @@ -1,22 +1,15 @@ { "benchmarks": [ { - "name": "cluster-density-ms-p90", - "workload": "kube-burner", + "name": "cluster-density-v2-p90", + "workload": "kube-burner-ocp-wrapper", + "trigger_rule": "all_done", "command": "./run.sh", "env": { - "WORKLOAD": "cluster-density-ms", - "JOB_ITERATIONS": "100", - "JOB_TIMEOUT": "18000", - "STEP_SIZE": "2m", - "HYPERSHIFT": "true", - "METRICS_PROFILE": "metrics-profiles/hypershift-metrics.yaml", - "QPS": "20", - "BURST": "20", - "LOG_LEVEL": "info", - "PLATFORM_ALERTS": "false", - "CLEANUP_WHEN_FINISH": "true", - "CLEANUP": "true" + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "84", + "LOG_LEVEL": "debug", + "EXTRA_FLAGS": "--churn-duration=1h --churn-percent=10 --churn-delay=30s" } } ] diff --git a/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn-data-plane.json b/dags/openshift_nightlies/config/install/rosa-hcp/data-plane.json similarity index 97% rename from dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn-data-plane.json rename to dags/openshift_nightlies/config/install/rosa-hcp/data-plane.json index 380b11b35..9bfef13dc 100644 --- a/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn-data-plane.json +++ b/dags/openshift_nightlies/config/install/rosa-hcp/data-plane.json @@ -1,5 +1,4 @@ { - "rosa_hcp": "true", "aws_profile": "", "aws_access_key_id": "", "aws_secret_access_key": "", diff --git a/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json b/dags/openshift_nightlies/config/install/rosa-hcp/p75-control-plane.json similarity index 94% rename from dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json rename to dags/openshift_nightlies/config/install/rosa-hcp/p75-control-plane.json index 9d07be452..11110dd1f 100644 --- a/dags/openshift_nightlies/config/install/rosa/rosa-hcp-ovn.json +++ b/dags/openshift_nightlies/config/install/rosa-hcp/p75-control-plane.json @@ -1,5 +1,4 @@ { - "rosa_hcp": "true", "aws_profile": "", "aws_access_key_id": "", "aws_secret_access_key": "", @@ -16,7 +15,7 @@ "openshift_worker_instance_type": "m5.2xlarge", "machineset_metadata_label_prefix": "machine.openshift.io", "staging_mgmt_provisioner_shards": "b4bb294b-a76c-11ed-91b2-0a580a831ba1", - "number_of_hostedcluster": 2, + "number_of_hostedcluster": 10, "hcp_install_interval": 60, "extra_machinepool": [{ "name": "infra", diff --git a/dags/openshift_nightlies/config/install/rosa-hcp/small-control-plane.json 
b/dags/openshift_nightlies/config/install/rosa-hcp/small-control-plane.json new file mode 100644 index 000000000..c107fca5c --- /dev/null +++ b/dags/openshift_nightlies/config/install/rosa-hcp/small-control-plane.json @@ -0,0 +1,27 @@ +{ + "aws_profile": "", + "aws_access_key_id": "", + "aws_secret_access_key": "", + "aws_authentication_method": "sts", + "aws_region": "us-east-2", + "rosa_environment": "staging", + "rosa_cli_version": "container", + "ocm_cli_version": "container", + "ocm_environment": "stage", + "managed_channel_group": "nightly", + "managed_ocp_version": "latest", + "openshift_worker_count": 24, + "openshift_network_type": "OVNKubernetes", + "openshift_worker_instance_type": "m5.2xlarge", + "machineset_metadata_label_prefix": "machine.openshift.io", + "staging_mgmt_provisioner_shards": "b4bb294b-a76c-11ed-91b2-0a580a831ba1", + "number_of_hostedcluster": 1, + "hcp_install_interval": 10, + "extra_machinepool": [{ + "name": "infra", + "replica": "1", + "instance_type": "r5.xlarge", + "labels": "node-role.kubernetes.io/infra=", + "taints": "node-role.kubernetes.io/infra=:NoSchedule" + }] +} diff --git a/dags/openshift_nightlies/config/install/rosa/ovn-osd.json b/dags/openshift_nightlies/config/install/rosa/ovn-osd.json deleted file mode 100644 index 68d1c4ba3..000000000 --- a/dags/openshift_nightlies/config/install/rosa/ovn-osd.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "cluster_install_method": "osd", - "aws_profile": "", - "aws_access_key_id": "", - "aws_secret_access_key": "", - "rosa_environment": "staging", - "rosa_cli_version": "container", - "ocm_environment": "stage", - "managed_channel_group": "nightly", - "managed_ocp_version": "latest", - "openshift_worker_count": 27, - "openshift_network_type": "OVNKubernetes", - "openshift_worker_instance_type": "m5.2xlarge", - "machineset_metadata_label_prefix": "machine.openshift.io", - "openshift_workload_node_instance_type": "m5.2xlarge" - } diff --git a/dags/openshift_nightlies/dag.py b/dags/openshift_nightlies/dag.py index 33033e470..9e6f5dd71 100644 --- a/dags/openshift_nightlies/dag.py +++ b/dags/openshift_nightlies/dag.py @@ -14,6 +14,7 @@ from openshift_nightlies.tasks.install.openstack import jetpack from openshift_nightlies.tasks.install.baremetal import jetski, webfuse from openshift_nightlies.tasks.install.rosa import rosa +from openshift_nightlies.tasks.install.rosahcp import rosahcp from openshift_nightlies.tasks.install.rogcp import rogcp from openshift_nightlies.tasks.install.hypershift import hypershift from openshift_nightlies.tasks.install.prebuilt import initialize_cluster @@ -155,30 +156,21 @@ def _get_openshift_installer(self): class RosaNightlyDAG(AbstractOpenshiftNightlyDAG): def build(self): installer = self._get_openshift_installer() - if installer.get_type() == "rosa_hcp": - install_cluster = installer.get_install_hcp_task() - hosted_installer = self._get_hypershift_openshift_installer() - wait_task = hosted_installer.wait_task() - wait_before_cleanup = hosted_installer.wait_task(id="wait_before_cleanup") - for c_id, install_hc, postinstall_hc, cleanup_hc in install_cluster: - benchmark = self._add_benchmarks(task_group=c_id) - install_hc >> postinstall_hc >> wait_task >> benchmark >> wait_before_cleanup >> cleanup_hc + install_cluster = installer.get_install_task() + final_status = final_dag_status.get_task(self.dag) + with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks: + must_gather = self._get_scale_ci_diagnosis().get_must_gather("must-gather") + benchmark_tasks = 
self._get_e2e_benchmarks().get_benchmarks() + chain(*benchmark_tasks) + # Configure must_gather as downstream of all benchmark tasks + for benchmark in benchmark_tasks: + benchmark >> must_gather + rosa_post_installation = self._get_rosa_postinstall_setup()._get_rosa_postinstallation() + if self.config.cleanup_on_success: + cleanup_cluster = installer.get_cleanup_task() + install_cluster >> rosa_post_installation >> benchmarks >> cleanup_cluster >> final_status else: - install_cluster = installer.get_install_task() - final_status = final_dag_status.get_task(self.dag) - with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks: - must_gather = self._get_scale_ci_diagnosis().get_must_gather("must-gather") - benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks() - chain(*benchmark_tasks) - # Configure must_gather as downstream of all benchmark tasks - for benchmark in benchmark_tasks: - benchmark >> must_gather - rosa_post_installation = self._get_rosa_postinstall_setup()._get_rosa_postinstallation() - if self.config.cleanup_on_success: - cleanup_cluster = installer.get_cleanup_task() - install_cluster >> rosa_post_installation >> benchmarks >> cleanup_cluster >> final_status - else: - install_cluster >> rosa_post_installation >> benchmarks >> final_status + install_cluster >> rosa_post_installation >> benchmarks >> final_status def _get_openshift_installer(self): return rosa.RosaInstaller(self.dag, self.config, self.release) @@ -186,15 +178,28 @@ def _get_openshift_installer(self): def _get_e2e_benchmarks(self, task_group="benchmarks"): return e2e.E2EBenchmarks(self.dag, self.config, self.release, task_group) +class RosaHCPNightlyDAG(AbstractOpenshiftNightlyDAG): + def build(self): + installer = self._get_openshift_installer() + install_cluster = installer.get_install_hcp_task() + wait_task = installer.wait_task() + wait_before_cleanup = installer.wait_task(id="wait_before_cleanup") + for c_id, install_hc, postinstall_hc, cleanup_hc in install_cluster: + benchmark = self._add_benchmarks(task_group=c_id) + install_hc >> postinstall_hc >> wait_task >> benchmark >> wait_before_cleanup >> cleanup_hc + + def _get_openshift_installer(self): + return rosahcp.RosaHCPInstaller(self.dag, self.config, self.release) + + def _get_e2e_benchmarks(self, task_group="benchmarks"): + return e2e.E2EBenchmarks(self.dag, self.config, self.release, task_group) + def _add_benchmarks(self, task_group): with TaskGroup(task_group, prefix_group_id=False, dag=self.dag) as benchmarks: benchmark_tasks = self._get_e2e_benchmarks(task_group).get_benchmarks() chain(*benchmark_tasks) return benchmarks - def _get_hypershift_openshift_installer(self): - return hypershift.HypershiftInstaller(self.dag, self.config, self.release) - class RoGCPNightlyDAG(AbstractOpenshiftNightlyDAG): def build(self): @@ -242,7 +247,7 @@ def build(self): install_mgmt_cluster >> rosa_post_installation >> install_hc >> wait_task >> benchmark def _get_openshift_installer(self): - return rosa.RosaInstaller(self.dag, self.config, self.release) + return rosahcp.RosaHCPInstaller(self.dag, self.config, self.release) def _get_hypershift_openshift_installer(self): return hypershift.HypershiftInstaller(self.dag, self.config, self.release) @@ -303,6 +308,8 @@ def build_releases(): nightly = OpenstackNightlyDAG(openshift_release, dag_config) elif openshift_release.platform == "rosa": nightly = RosaNightlyDAG(openshift_release, dag_config) + elif openshift_release.platform == "rosahcp": + nightly = RosaHCPNightlyDAG(openshift_release, 
dag_config) elif openshift_release.platform == "rogcp": nightly = RoGCPNightlyDAG(openshift_release, dag_config) elif openshift_release.platform == "hypershift": diff --git a/dags/openshift_nightlies/manifest.yaml b/dags/openshift_nightlies/manifest.yaml index 0fda36bc6..6dfc0d24c 100644 --- a/dags/openshift_nightlies/manifest.yaml +++ b/dags/openshift_nightlies/manifest.yaml @@ -103,16 +103,6 @@ platforms: config: install: rosa/iam-ovn.json benchmarks: control-plane.json - - name: osd-ovn-control-plane - schedule: "30 12 * * 1,3,5" - config: - install: rosa/ovn-osd.json - benchmarks: control-plane.json - - name: osd-ovn-data-plane - schedule: "30 1 * * 1,3,5" # an hour gap for OSD to avoid OsdCcsAdmin key limit - config: - install: rosa/ovn-osd.json - benchmarks: data-plane-mgs.json - name: ocm-api-load schedule: "None" config: @@ -123,16 +113,25 @@ platforms: config: install: rosa/upgrade.json benchmarks: upgrade.json - - name: rosa-hcp-control-plane + + rosahcp: + versions: ["4.12", "4.13"] + variants: + - name: p75-control-plane schedule: "0 12 * * 3" config: - install: rosa/rosa-hcp-ovn.json + install: rosa-hcp/p75-control-plane.json benchmarks: hosted-control-plane-p75.json - - name: rosa-hcp-data-plane + - name: data-plane-v2 schedule: "1 12 * * 3" config: - install: rosa/rosa-hcp-ovn-data-plane.json + install: rosa-hcp/data-plane.json benchmarks: data-plane-v2.json + - name: small-control-plane + schedule: "2 12 * * 3" + config: + install: rosa-hcp/small-control-plane.json + benchmarks: hcp-small-control-plane.json rogcp: versions: ["4.12", "4.13"] @@ -148,33 +147,6 @@ platforms: install: rogcp/ovn.json benchmarks: data-plane-mgs.json - hypershift: - versions: ["4.12", "4.13"] - variants: - - name: management-control-plane - schedule: "30 3 * * 1,3,5" # an hour gap for OSD to avoid OsdCcsAdmin key limit - config: - install: hypershift/none-type.json - benchmarks: management-control-plane.json - - name: ovn-control-plane-p75 - schedule: "30 4 * * 1,3,5" # an hour gap for OSD to avoid OsdCcsAdmin key limit - config: - install: hypershift/ovn-p75.json - benchmarks: hosted-control-plane-p75.json - - name: ovn-control-plane-p90 - schedule: "30 5 * * 1,3,5" # an hour gap for OSD to avoid OsdCcsAdmin key limit - config: - install: hypershift/ovn-p90.json - benchmarks: hosted-control-plane-p90.json - - name: chaos-ovn-control-plane-p75 - config: - install: hypershift/ovn-p75.json - benchmarks: hosted-control-plane-chaos-p75.json - - name: chaos-ovn-control-plane-p90 - config: - install: hypershift/ovn-p90.json - benchmarks: hosted-control-plane-chaos-p90.json - prebuilt: versions: ["4.x"] variants: diff --git a/dags/openshift_nightlies/models/release.py b/dags/openshift_nightlies/models/release.py index 67f6c250e..831deaf5f 100644 --- a/dags/openshift_nightlies/models/release.py +++ b/dags/openshift_nightlies/models/release.py @@ -41,7 +41,7 @@ def _generate_cluster_name(self): else: cluster_name = f"{git_user}-{git_branch}-{release_name}" - if self.platform == 'rosa' or self.platform == 'rogcp' or self.platform == 'hypershift': + if self.platform == 'rosa' or self.platform == 'rogcp' or self.platform == 'hypershift' or self.platform == 'rosahcp': #Only 15 chars are allowed cluster_version = str(self.version).replace(".","") return "perf-"+md5(cluster_name.encode("ascii")).hexdigest()[:3] diff --git a/dags/openshift_nightlies/scripts/install/rosa-hcp.sh b/dags/openshift_nightlies/scripts/install/rosa-hcp.sh new file mode 100755 index 000000000..4b5a27b87 --- /dev/null +++ 
b/dags/openshift_nightlies/scripts/install/rosa-hcp.sh @@ -0,0 +1,803 @@ +#!/bin/bash +# shellcheck disable=SC2155 +set -ex + +export INDEXDATA=() + +while getopts v:a:j:o: flag +do + case "${flag}" in + v) version=${OPTARG};; + j) json_file=${OPTARG};; + o) operation=${OPTARG};; + *) echo "ERROR: invalid parameter ${flag}" ;; + esac +done + +_get_cluster_id(){ + if [[ $INSTALL_METHOD == "osd" ]]; then + echo "$(ocm list clusters --no-headers --columns id $1)" + else + echo "$(rosa list clusters -o json | jq -r '.[] | select(.name == '\"$1\"') | .id')" + fi +} + +_download_kubeconfig(){ + ocm get /api/clusters_mgmt/v1/clusters/$1/credentials | jq -r .kubeconfig > $2 +} + +_get_cluster_status(){ + if [[ $INSTALL_METHOD == "osd" ]]; then + echo "$(ocm list clusters --no-headers --columns state $1 | xargs)" + else + echo "$(rosa list clusters -o json | jq -r '.[] | select(.name == '\"$1\"') | .status.state')" + fi +} + +_wait_for_nodes_ready(){ + _download_kubeconfig "$(_get_cluster_id $1)" ./kubeconfig + export KUBECONFIG=./kubeconfig + ALL_READY_ITERATIONS=0 + ITERATIONS=0 + NODES_COUNT=$2 + # 30 seconds per node, waiting for all nodes ready to finalize + while [ ${ITERATIONS} -le $((NODES_COUNT*10)) ] ; do + NODES_READY_COUNT=$(oc get nodes -l $3 | grep " Ready " | wc -l) + if [ ${NODES_READY_COUNT} -ne ${NODES_COUNT} ] ; then + echo "WARNING: ${ITERATIONS}/${NODES_COUNT} iterations. ${NODES_READY_COUNT}/${NODES_COUNT} $3 nodes ready. Waiting 30 seconds for next check" + # ALL_READY_ITERATIONS=0 + ITERATIONS=$((${ITERATIONS}+1)) + sleep 30 + else + if [ ${ALL_READY_ITERATIONS} -eq 2 ] ; then + echo "INFO: ${ALL_READY_ITERATIONS}/5. All nodes ready, continuing process" + return 0 + else + echo "INFO: ${ALL_READY_ITERATIONS}/5. All nodes ready. Waiting 60 seconds for next check" + ALL_READY_ITERATIONS=$((${ALL_READY_ITERATIONS}+1)) + sleep 60 + fi + fi + done + END_CLUSTER_STATUS="Ready. No Workers" + echo "ERROR: Not all $3 nodes (${NODES_READY_COUNT}/${NODES_COUNT}) are ready after about $((${NODES_COUNT}*3)) minutes, dumping oc get nodes..." + oc get nodes + exit 1 +} + +_aws_cmd(){ + ITR=0 + while [ $ITR -le 30 ]; do + if [[ "$(aws ec2 $1 2>&1)" == *"error"* ]]; then + echo "Failed to $1, retrying after 30 seconds" + ITR=$(($ITR+1)) + sleep 10 + else + return 0 + fi + done + echo "Failed to $1 after 10 minutes of multiple retries" + exit 1 +} + +_login_check(){ + echo "Trying to oc login with password" + ITR=1 + START_TIMER=$(date +%s) + while [ $ITR -le 100 ]; do + if [[ "$(oc login $1 --username cluster-admin --password $2 --insecure-skip-tls-verify=true --request-timeout=30s 2>&1)" == *"failed"* ]]; then + echo "Attempt $ITR: Failed to login $1, retrying after 5 seconds" + ITR=$(($ITR+1)) + sleep 5 + RECHECK=1 + else + if [[ $RECHECK -eq 10 ]]; then + CURRENT_TIMER=$(date +%s) + # Time since rosa cluster is ready until all nodes are ready + DURATION=$(($CURRENT_TIMER - $START_TIMER)) + INDEXDATA+=("cluster_admin_login-${DURATION}") + _adm_logic_check $1 $2 + return 0 + else + echo "Rechecking login for $((10-$RECHECK)) more times" + RECHECK=$(($RECHECK+1)) + sleep 1 + fi + fi + done + END_CLUSTER_STATUS="Ready. Not Access" + echo "Failed to login after 100 attempts with 5 sec interval" +} + +_adm_logic_check(){ + ITR=1 + START_TIMER=$(date +%s) + while [ $ITR -le 100 ]; do + oc login $1 --username cluster-admin --password $2 --insecure-skip-tls-verify=true --request-timeout=30s + CHECK=$(oc adm top images 2>&1 > /dev/null) + if [[ $? 
!= 0 ]]; then
+      echo "Attempt $ITR: Failed to login $1, retrying after 5 seconds"
+      ITR=$(($ITR+1))
+      sleep 5
+    else
+      CURRENT_TIMER=$(date +%s)
+      # Time since rosa cluster is ready until all nodes are ready
+      DURATION=$(($CURRENT_TIMER - $START_TIMER))
+      INDEXDATA+=("cluster_oc_adm-${DURATION}")
+      return 0
+    fi
+  done
+  END_CLUSTER_STATUS="Ready. Not Access"
+  echo "Failed to execute oc adm commands after 100 attempts with 5 sec interval"
+}
+
+_balance_infra(){
+  if [[ $1 == "prometheus-k8s" ]] ; then
+    echo "Initiate migration of prometheus components to infra nodepools"
+    oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s
+    oc get sts prometheus-k8s -n openshift-monitoring
+    echo "Restart stateful set pods"
+    oc rollout restart -n openshift-monitoring statefulset/prometheus-k8s
+    echo "Wait till they are completely restarted"
+    oc rollout status -n openshift-monitoring statefulset/prometheus-k8s
+    echo "Check pods status again and the hosting nodes"
+    oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s
+  else
+    echo "Initiate migration of ingress router-default pods to infra nodepools"
+    echo "Add toleration to use infra nodes"
+    oc patch ingresscontroller -n openshift-ingress-operator default --type merge --patch '{"spec":{"nodePlacement":{"nodeSelector":{"matchLabels":{"node-role.kubernetes.io/infra":""}},"tolerations":[{"effect":"NoSchedule","key":"node-role.kubernetes.io/infra","operator":"Exists"}]}}}'
+    echo "Wait till it gets rolled out"
+    sleep 60
+    oc get pods -n openshift-ingress -o wide
+  fi
+}
+
+_check_infra(){
+  TRY=0
+  while [ $TRY -le 3 ]; do # Attempts three times to migrate pods
+    FLAG_ERROR=""
+    _balance_infra $1
+    for node in $(oc get pods -n $2 -o wide | grep -i $1 | grep -i running | awk '{print$7}');
+    do
+      if [[ $(oc get nodes | grep infra | awk '{print$1}' | grep $node) != "" ]]; then
+        echo "$node is an infra node"
+      else
+        echo "$1 pod on $node is not an infra node, retrying"
+        FLAG_ERROR=true
+      fi
+    done
+    if [[ $FLAG_ERROR == "" ]]; then return 0; else TRY=$((TRY+1)); fi
+  done
+  echo "Failed to move $1 pods in $2 namespace"
+  exit 1
+}
+
+_wait_for_extra_nodes_ready(){
+  export NODE_LABELS=$(cat ${json_file} | jq -r .extra_machinepool[].labels)
+  for label in $NODE_LABELS;
+  do
+    REPLICA=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.labels == '\"$label\"')'.replica)
+    NODES_COUNT=$((REPLICA*3))
+    if [[ $label == *"infra"* ]] ; then NODES_COUNT=$((REPLICA*2)); fi
+    _wait_for_nodes_ready $CLUSTER_NAME $NODES_COUNT $label
+    if [[ $label == *"infra"* ]] ; then
+      _check_infra prometheus-k8s openshift-monitoring
+      _check_infra router openshift-ingress
+    fi
+  done
+  return 0
+}
+
+_add_machinepool(){
+  export MACHINEPOOLS=$(cat ${json_file} | jq -r .extra_machinepool[].name)
+  for mcp in $MACHINEPOOLS;
+  do
+    echo "Add an extra machinepool - $mcp to cluster"
+    ZONES="a b c"
+    MC_NAME=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.name)
+    REPLICA=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.replica)
+    INS_TYPE=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.instance_type)
+    LABELS=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.labels)
+    TAINTS=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.taints)
+    if [[ $MC_NAME == *"infra"* ]]; then ZONES="a b"; fi
+    for ZONE in $ZONES;
+    do
+      if [[ $(rosa list machinepool --cluster "$(_get_cluster_id ${CLUSTER_NAME})" | grep $MC_NAME-$ZONE) == "" ]]; then
+        rosa create machinepool --cluster "$(_get_cluster_id ${CLUSTER_NAME})" --name $MC_NAME-$ZONE --instance-type ${INS_TYPE} --replicas $REPLICA --availability-zone $AWS_REGION$ZONE --labels $LABELS --taints $TAINTS
+      fi
+    done
+  done
+  _wait_for_extra_nodes_ready
+  return 0
+}
+
+_wait_for_cluster_ready(){
+  START_TIMER=$(date +%s)
+  echo "INFO: Installation starts at $(date -d @${START_TIMER})"
+  echo "INFO: Waiting up to 90 iterations, counting only while the cluster is in installing status"
+  ITERATIONS=0
+  PREVIOUS_STATUS=""
+  # 90 iterations, sleeping 60 seconds, 1.5 hours of wait
+  # Only increasing iterations on installing status
+  while [ ${ITERATIONS} -le 90 ] ; do
+    CLUSTER_STATUS=$(_get_cluster_status $1)
+    CURRENT_TIMER=$(date +%s)
+    if [ "${CLUSTER_STATUS}" != "${PREVIOUS_STATUS}" ] && [ "${PREVIOUS_STATUS}" != "" ]; then
+      # When a status change is detected, index the elapsed time and reset the timer for the next status change
+      DURATION=$(($CURRENT_TIMER - $START_TIMER))
+      INDEXDATA+=("${PREVIOUS_STATUS}"-"${DURATION}")
+      START_TIMER=${CURRENT_TIMER}
+      echo "INFO: Cluster status changed to ${CLUSTER_STATUS}"
+      if [ ${CLUSTER_STATUS} == "error" ] ; then
+        if [[ $INSTALL_METHOD == "osd" ]]; then
+          echo "ERROR: Cluster $1 not installed after 1.5 hours.."
+        else
+          rosa logs install -c $1
+          rosa describe cluster -c $1
+        fi
+        return 1
+      fi
+    fi
+    if [ ${CLUSTER_STATUS} == "ready" ] ; then
+      END_CLUSTER_STATUS="Ready"
+      echo "Set end time of prom scrape"
+      export END_TIME=$(date +"%s")
+      START_TIMER=$(date +%s)
+      _wait_for_nodes_ready $1 ${COMPUTE_WORKERS_NUMBER} "node-role.kubernetes.io/worker"
+      CURRENT_TIMER=$(date +%s)
+      # Time since rosa cluster is ready until all nodes are ready
+      DURATION=$(($CURRENT_TIMER - $START_TIMER))
+      INDEXDATA+=("day2operations-${DURATION}")
+      _add_machinepool $URL $PASSWORD
+      if [[ $INSTALL_METHOD == "osd" ]]; then
+        echo "INFO: Cluster and nodes on ready status.."
+      else
+        echo "INFO: Cluster and nodes on ready status at ${CURRENT_TIMER}, dumping installation logs..."
+        rosa logs install -c $1
+        rosa describe cluster -c $1
+      fi
+      return 0
+    elif [ ${CLUSTER_STATUS} == "installing" ] ; then
+      echo "INFO: ${ITERATIONS}/90. Cluster on ${CLUSTER_STATUS} status, waiting 60 seconds for next check"
+      ITERATIONS=$((${ITERATIONS}+1))
+      sleep 60
+    else
+      # Sleep 1 to try to capture as many states as possible before installing
+      sleep 1
+    fi
+    PREVIOUS_STATUS=${CLUSTER_STATUS}
+  done
+  if [[ $INSTALL_METHOD == "osd" ]]; then
+    echo "ERROR: Cluster $1 not installed after 3 hours.."
+  else
+    END_CLUSTER_STATUS="Not Ready"
+    echo "ERROR: Cluster $1 not installed after 90 iterations, dumping installation logs..."
+ rosa logs install -c $1 + rosa describe cluster -c $1 + fi + exit 1 +} + +_create_aws_vpc(){ + + echo "Create Internet Gateway" + aws ec2 create-internet-gateway --tag-specifications ResourceType=internet-gateway,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=igw-$CLUSTER_NAME}]" --output json + export IGW=$(aws ec2 describe-internet-gateways --filters "Name=tag:Name,Values=igw-$CLUSTER_NAME" --output json | jq -r ".InternetGateways[0].InternetGatewayId") + + echo "Create VPC and attach internet gateway" + aws ec2 create-vpc --cidr-block 10.0.0.0/16 --tag-specifications ResourceType=vpc,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=vpc-$CLUSTER_NAME}]" --output json + export VPC=$(aws ec2 describe-vpcs --filters "Name=tag:HostedClusterName,Values=$CLUSTER_NAME" --output json | jq -r '.Vpcs[0].VpcId') + + aws ec2 modify-vpc-attribute --vpc-id $VPC --enable-dns-support "{\"Value\":true}" + aws ec2 modify-vpc-attribute --vpc-id $VPC --enable-dns-hostnames "{\"Value\":true}" + aws ec2 attach-internet-gateway --vpc-id $VPC --internet-gateway-id $IGW + + aws ec2 create-route-table --vpc-id $VPC --tag-specifications ResourceType=route-table,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=public-rt-table-$CLUSTER_NAME}]" --output json + export PUB_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME" --output json | jq -r '.RouteTables[0].RouteTableId') + aws ec2 create-route --route-table-id $PUB_RT_TB --destination-cidr-block 0.0.0.0/0 --gateway-id $IGW + + ITR=0 + export ALL_PRI_RT_TB="" + for ZONE in a b c; + do + ITR=$((ITR+1)) + echo "Allocate Elastic IP" + aws ec2 allocate-address --tag-specifications ResourceType=elastic-ip,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=eip-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export E_IP=$(aws ec2 describe-addresses --filters "Name=tag:Name,Values=eip-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Addresses[0].AllocationId") + + echo "Create Subnets and Route tables" + aws ec2 create-subnet --vpc-id $VPC --cidr-block 10.0.$ITR.0/24 --availability-zone $AWS_REGION$ZONE --tag-specifications ResourceType=subnet,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export PUB_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") + aws ec2 create-nat-gateway --subnet-id $PUB_SUB --allocation-id $E_IP --tag-specifications ResourceType=natgateway,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export NGW=$(aws ec2 describe-nat-gateways --filter "Name=tag:Name,Values=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".NatGateways[]" | jq -r 'select(.State == "available" or .State == "pending")' | jq -r ".NatGatewayId") + echo "Wait until NatGateway $NGW is available" + aws ec2 wait nat-gateway-available --nat-gateway-ids $NGW + aws ec2 associate-route-table --route-table-id $PUB_RT_TB --subnet-id $PUB_SUB + + aws ec2 create-subnet --vpc-id $VPC --cidr-block 10.0.$((ITR+10)).0/24 --availability-zone $AWS_REGION$ZONE --tag-specifications ResourceType=subnet,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export PRI_SUB=$(aws ec2 describe-subnets --filters 
"Name=tag:Name,Values=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") + aws ec2 create-route-table --vpc-id $VPC --tag-specifications ResourceType=route-table,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json + export PRI_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].RouteTableId') + export ALL_PRI_RT_TB="${ALL_PRI_RT_TB} ${PRI_RT_TB}" + aws ec2 associate-route-table --route-table-id $PRI_RT_TB --subnet-id $PRI_SUB + aws ec2 create-route --route-table-id $PRI_RT_TB --destination-cidr-block 0.0.0.0/0 --gateway-id $NGW + done + + echo "Create private VPC endpoint to S3" + aws ec2 create-vpc-endpoint --vpc-id $VPC --service-name com.amazonaws.$AWS_REGION.s3 --route-table-ids $ALL_PRI_RT_TB --tag-specifications ResourceType=vpc-endpoint,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=vpce-$CLUSTER_NAME}]" +} + +_delete_aws_vpc(){ + echo "Delete Subnets, Routes, Gateways, VPC if exists" + export VPC=$(aws ec2 describe-vpcs --filters "Name=tag:HostedClusterName,Values=$CLUSTER_NAME" --output json | jq -r '.Vpcs[0].VpcId') + if [ $VPC != null ]; then + echo "Delete VPC Endpoint" + export VPCE=$(aws ec2 describe-vpc-endpoints --filters "Name=tag:Name,Values=vpce-$CLUSTER_NAME" --output json | jq -r '.VpcEndpoints[0].VpcEndpointId') + if [ $VPCE != null ]; then _aws_cmd "delete-vpc-endpoints --vpc-endpoint-ids $VPCE"; fi + + export ELB=$(aws elb describe-load-balancers --output json | jq -r '.LoadBalancerDescriptions[]'| jq -r 'select(.VPCId == '\"${VPC}\"')' | jq -r '.LoadBalancerName') + if [ $ELB != "" ]; then aws elb delete-load-balancer --load-balancer-name $ELB; fi + + for ZONE in a b c; + do + echo "Delete Subnets and Route tables" + export PRI_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].RouteTableId') + export RT_TB_ASSO_ID=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].Associations[0].RouteTableAssociationId') + export PRI_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") + + if [ $PRI_RT_TB != null ]; then _aws_cmd "delete-route --route-table-id $PRI_RT_TB --destination-cidr-block 0.0.0.0/0"; fi + if [ $RT_TB_ASSO_ID != null ]; then _aws_cmd "disassociate-route-table --association-id $RT_TB_ASSO_ID"; fi + if [ $PRI_RT_TB != null ]; then _aws_cmd "delete-route-table --route-table-id $PRI_RT_TB"; fi + if [ $PRI_SUB != null ]; then _aws_cmd "delete-subnet --subnet-id $PRI_SUB"; fi + + export RT_TB_ASSO_ID=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].Associations[].RouteTableAssociationId') + export NGW=$(aws ec2 describe-nat-gateways --filter "Name=tag:Name,Values=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".NatGateways[]" | jq -r 'select(.State == "available")' | jq -r ".NatGatewayId") + export PUB_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") + export E_IP=$(aws ec2 describe-addresses 
--filters "Name=tag:Name,Values=eip-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Addresses[0].AllocationId") + + if [ $RT_TB_ASSO_ID != null ]; then for _id in $RT_TB_ASSO_ID; do _aws_cmd "disassociate-route-table --association-id $_id"; done; fi + if [ $NGW != null ]; then _aws_cmd "delete-nat-gateway --nat-gateway-id $NGW"; fi + if [ $PUB_SUB != null ]; then _aws_cmd "delete-subnet --subnet-id $PUB_SUB"; fi + if [ $E_IP != null ]; then _aws_cmd "release-address --allocation-id $E_IP"; fi + done + + export PUB_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME" --output json | jq -r '.RouteTables[0].RouteTableId') + + if [ $PUB_RT_TB != null ]; then _aws_cmd "delete-route --route-table-id $PUB_RT_TB --destination-cidr-block 0.0.0.0/0"; fi + if [ $PUB_RT_TB != null ]; then _aws_cmd "delete-route-table --route-table-id $PUB_RT_TB"; fi + + export IGW=$(aws ec2 describe-internet-gateways --filters "Name=tag:Name,Values=igw-$CLUSTER_NAME" --output json | jq -r ".InternetGateways[0].InternetGatewayId") + if [ $IGW != null ]; then _aws_cmd "detach-internet-gateway --internet-gateway-id $IGW --vpc-id $VPC"; fi + if [ $IGW != null ]; then _aws_cmd "delete-internet-gateway --internet-gateway-id $IGW"; fi + + echo "Delete Security Group Rules" + for g in $(aws ec2 describe-security-groups --filters "Name=vpc-id,Values=$VPC" --output json | jq -r ".SecurityGroups[].GroupId"); + do + for r in $(aws ec2 describe-security-group-rules --filters "Name=group-id,Values=$g" --output json | jq -r ".SecurityGroupRules[]" | jq -r "select(.IsEgress == false)" | jq -r ".SecurityGroupRuleId"); + do + aws ec2 revoke-security-group-ingress --security-group-rule-ids $r --group-id $g + done + + for r in $(aws ec2 describe-security-group-rules --filters "Name=group-id,Values=$g" --output json | jq -r ".SecurityGroupRules[]" | jq -r "select(.IsEgress == true)" | jq -r ".SecurityGroupRuleId"); + do + aws ec2 revoke-security-group-egress --security-group-rule-ids $r --group-id $g + done + done + + for g in $(aws ec2 describe-security-groups --filters "Name=vpc-id,Values=$VPC" --output json | jq -r ".SecurityGroups[]" | jq -r 'select(.GroupName != "default")' | jq -r ".GroupId"); + do + echo "Delete Security Groups $g" + _aws_cmd "delete-security-group --group-id $g" + done + + echo "Delete VPC $VPC" + _aws_cmd "delete-vpc --vpc-id $VPC" + fi +} + +_oidc_config(){ + echo "${1} OIDC config, with prefix ${2}" + if [[ $1 == "create" ]]; then + echo "${1} OIDC config" + rosa create oidc-config --mode=auto --managed=false --prefix ${2} -y + export OIDC_CONFIG=$(rosa list oidc-config | grep ${2} | awk '{print$1}') + else + export OIDC_CONFIG=$(rosa list oidc-config | grep ${2} | awk '{print$1}') + if [ ! 
-z $OIDC_CONFIG ]; then rosa delete oidc-config --mode=auto --oidc-config-id ${OIDC_CONFIG} -y || true; fi # forcing exit 0, as this command may fail if it is a shared oidc config
+  fi
+}
+
+_get_sc_mc_details(){
+  if [ -z $SVC_CLUSTER_NAME ]; then
+    echo "Find Service Cluster"
+    export SVC_CLUSTER_NAME=$(ocm describe cluster ${CLUSTER_NAME} | grep "Service Cluster" | awk '{print$3}')
+  fi
+  if [ -z $MGMT_CLUSTER_NAME ]; then
+    export MGMT_CLUSTER_NAME=$(ocm describe cluster ${CLUSTER_NAME} | grep "Management Cluster" | awk '{print$3}')
+  fi
+  echo "Read Management cluster details"
+  export MGMT_CLUSTER_DETAILS=$(ocm get /api/clusters_mgmt/v1/clusters | jq -r ".items[]" | jq -r 'select(.name == '\"$MGMT_CLUSTER_NAME\"')')
+  export NUMBER_OF_HC=$(cat ${json_file} | jq -r .number_of_hostedcluster)
+}
+
+setup(){
+  mkdir /home/airflow/workspace
+  cd /home/airflow/workspace
+  export PATH=$PATH:/usr/bin:/usr/local/go/bin
+  export HOME=/home/airflow
+  export AWS_REGION=$(cat ${json_file} | jq -r .aws_region)
+  export AWS_ACCOUNT_ID=$(cat ${json_file} | jq -r .aws_account_id)
+  export AWS_ACCESS_KEY_ID=$(cat ${json_file} | jq -r .aws_access_key_id)
+  export AWS_SECRET_ACCESS_KEY=$(cat ${json_file} | jq -r .aws_secret_access_key)
+  export AWS_AUTHENTICATION_METHOD=$(cat ${json_file} | jq -r .aws_authentication_method)
+  export ROSA_ENVIRONMENT=$(cat ${json_file} | jq -r .rosa_environment)
+  export ROSA_TOKEN=$(cat ${json_file} | jq -r .rosa_token_${ROSA_ENVIRONMENT})
+  export MANAGED_OCP_VERSION=$(cat ${json_file} | jq -r .managed_ocp_version)
+  export MANAGED_CHANNEL_GROUP=$(cat ${json_file} | jq -r .managed_channel_group)
+  export CLUSTER_NAME=$(cat ${json_file} | jq -r .openshift_cluster_name)
+  export COMPUTE_WORKERS_NUMBER=$(cat ${json_file} | jq -r .openshift_worker_count)
+  export NETWORK_TYPE=$(cat ${json_file} | jq -r .openshift_network_type)
+  export ES_SERVER=$(cat ${json_file} | jq -r .es_server)
+  export STAGE_CONFIG=""
+  export MGMT_CLUSTER_NAME=$(cat ${json_file} | jq -r .staging_mgmt_cluster_name)
+  export SVC_CLUSTER_NAME=$(cat ${json_file} | jq -r .staging_svc_cluster_name)
+  export STAGE_PROV_SHARD=$(cat ${json_file} | jq -r .staging_mgmt_provisioner_shards)
+  export OIDC_PREFIX=$(cat ${json_file} | jq -r .openshift_cluster_name)
+  export CLUSTER_NAME="${CLUSTER_NAME}-${HOSTED_ID}" # perf-as3-hcp-1, perf-as3-hcp-2..
+  export KUBECONFIG_NAME=$(echo $KUBECONFIG_NAME | awk -F-kubeconfig '{print$1}')-$HOSTED_ID-kubeconfig
+  export KUBEADMIN_NAME=$(echo $KUBEADMIN_NAME | awk -F-kubeadmin '{print$1}')-$HOSTED_ID-kubeadmin
+  UUID=$(echo $AIRFLOW_CTX_DAG_RUN_ID | base64 | cut -c 1-32 )
+  export UUID=${UUID}
+  export OCM_CLI_VERSION=$(cat ${json_file} | jq -r .ocm_cli_version)
+  if [[ ${OCM_CLI_VERSION} != "container" ]]; then
+    OCM_CLI_FORK=$(cat ${json_file} | jq -r .ocm_cli_fork)
+    git clone -q --depth=1 --single-branch --branch ${OCM_CLI_VERSION} ${OCM_CLI_FORK}
+    pushd ocm-cli
+    sudo PATH=$PATH:/usr/bin:/usr/local/go/bin make
+    sudo mv ocm /usr/local/bin/
+    popd
+  fi
+  if [[ $INSTALL_METHOD == "osd" ]]; then
+    echo "Clean-up existing OSD access keys.."
+    AWS_KEY=$(aws iam list-access-keys --user-name OsdCcsAdmin --output text --query 'AccessKeyMetadata[*].AccessKeyId')
+    LEN_AWS_KEY=`echo $AWS_KEY | wc -w`
+    if [[ ${LEN_AWS_KEY} -eq 2 ]]; then
+      aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id `printf ${AWS_KEY[0]}`
+    fi
+    echo "Create new OSD access key.."
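+    # IAM allows at most two access keys per user, so the stale OsdCcsAdmin key is removed above before a fresh one is created below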
+ export ADMIN_KEY=$(aws iam create-access-key --user-name OsdCcsAdmin) + export AWS_ACCESS_KEY_ID=$(echo $ADMIN_KEY | jq -r '.AccessKey.AccessKeyId') + export AWS_SECRET_ACCESS_KEY=$(echo $ADMIN_KEY | jq -r '.AccessKey.SecretAccessKey') + ocm login --url=https://api.stage.openshift.com --token="${ROSA_TOKEN}" + ocm whoami + sleep 60 # it takes a few sec for new access key + echo "Check AWS Username..." + aws iam get-user | jq -r .User.UserName + else + export ROSA_CLI_VERSION=$(cat ${json_file} | jq -r .rosa_cli_version) + if [[ ${ROSA_CLI_VERSION} != "container" ]]; then + ROSA_CLI_FORK=$(cat ${json_file} | jq -r .rosa_cli_fork) + git clone -q --depth=1 --single-branch --branch ${ROSA_CLI_VERSION} ${ROSA_CLI_FORK} + pushd rosa + make + sudo mv rosa /usr/local/bin/ + popd + fi + ocm login --url=https://api.stage.openshift.com --token="${ROSA_TOKEN}" + ocm whoami + rosa login --env=${ROSA_ENVIRONMENT} + rosa whoami + rosa verify quota + rosa verify permissions + if [ "${MANAGED_OCP_VERSION}" == "latest" ] ; then + export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | sort -rV | head -1) + elif [ "${MANAGED_OCP_VERSION}" == "prelatest" ] ; then + export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | sort -rV | head -2 | tail -1) + else + export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | grep ^${MANAGED_OCP_VERSION}$) + fi + [ -z "${ROSA_VERSION}" ] && echo "ERROR: Image not found for version (${version}) on ROSA ${MANAGED_CHANNEL_GROUP} channel group" && exit 1 + return 0 + fi +} + +install(){ + export COMPUTE_WORKERS_TYPE=$(cat ${json_file} | jq -r .openshift_worker_instance_type) + export CLUSTER_AUTOSCALE=$(cat ${json_file} | jq -r .cluster_autoscale) + export OIDC_CONFIG=$(cat ${json_file} | jq -r .oidc_config) + if [[ $INSTALL_METHOD == "osd" ]]; then + if [ "${MANAGED_OCP_VERSION}" == "latest" ] ; then + export OCM_VERSION=$(ocm list versions --channel-group ${MANAGED_CHANNEL_GROUP} | grep ^${version} | sort -rV | head -1) + elif [ "${MANAGED_OCP_VERSION}" == "prelatest" ] ; then + export OCM_VERSION=$(ocm list versions --channel-group ${MANAGED_CHANNEL_GROUP} | grep ^${version} | sort -rV | head -2 | tail -1) + else + export OCM_VERSION=$(ocm list versions --channel-group ${MANAGED_CHANNEL_GROUP} | grep ^${MANAGED_OCP_VERSION}) + fi + [ -z ${OCM_VERSION} ] && echo "ERROR: Image not found for version (${version}) on OCM ${MANAGED_CHANNEL_GROUP} channel group" && exit 1 + if [[ $CLUSTER_AUTOSCALE == "true" ]]; then + export MIN_COMPUTE_WORKERS_NUMBER=$(cat ${json_file} | jq -r .min_openshift_worker_count) + export CLUSTER_SIZE="--enable-autoscaling --min-replicas ${MIN_COMPUTE_WORKERS_NUMBER} --max-replicas ${COMPUTE_WORKERS_NUMBER}" + else + export CLUSTER_SIZE="--compute-nodes ${COMPUTE_WORKERS_NUMBER}" + fi + ocm create cluster --ccs --provider aws --region ${AWS_REGION} --aws-account-id ${AWS_ACCOUNT_ID} --aws-access-key-id ${AWS_ACCESS_KEY_ID} --aws-secret-access-key ${AWS_SECRET_ACCESS_KEY} --channel-group ${MANAGED_CHANNEL_GROUP} --version ${OCM_VERSION} --multi-az --compute-machine-type ${COMPUTE_WORKERS_TYPE} --network-type ${NETWORK_TYPE} ${CLUSTER_NAME} ${CLUSTER_SIZE} + else + export INSTALLATION_PARAMS="" + export ROSA_HCP_PARAMS="" + if [ 
$AWS_AUTHENTICATION_METHOD == "sts" ] ; then
+      INSTALLATION_PARAMS="${INSTALLATION_PARAMS} --sts -m auto --yes"
+    fi
+    _create_aws_vpc
+    echo "Set start time of prom scrape"
+    export START_TIME=$(date +"%s")
+    if [ $STAGE_PROV_SHARD != "" ]; then
+      STAGE_CONFIG="--properties provision_shard_id:${STAGE_PROV_SHARD}"
+    fi
+    ALL_SUBNETS=$(aws ec2 describe-subnets --filters "Name=tag:HostedClusterName,Values=$CLUSTER_NAME" --output json | jq -r ".Subnets[].SubnetId")
+    SUBNETS_IDS=""
+    for _ID in ${ALL_SUBNETS};
+    do
+      if [[ ${SUBNETS_IDS} == "" ]]; then SUBNETS_IDS=${_ID}; else SUBNETS_IDS=${SUBNETS_IDS}","${_ID}; fi
+    done
+    ROSA_HCP_PARAMS="--hosted-cp ${STAGE_CONFIG} --subnet-ids ${SUBNETS_IDS} --machine-cidr 10.0.0.0/16"
+    export OIDC_CONFIG=$(rosa list oidc-config | grep $OIDC_PREFIX | awk '{print$1}')
+    if [ -z $OIDC_CONFIG ]; then _oidc_config create $OIDC_PREFIX; fi
+    ROSA_HCP_PARAMS="${ROSA_HCP_PARAMS} --oidc-config-id ${OIDC_CONFIG}"
+    rosa create cluster --tags=User:${GITHUB_USERNAME} --cluster-name ${CLUSTER_NAME} --version "${ROSA_VERSION}" --channel-group=${MANAGED_CHANNEL_GROUP} --compute-machine-type ${COMPUTE_WORKERS_TYPE} --replicas ${COMPUTE_WORKERS_NUMBER} --network-type ${NETWORK_TYPE} ${INSTALLATION_PARAMS} ${ROSA_HCP_PARAMS}
+  fi
+  postinstall
+  return 0
+}
+
+postinstall(){
+  _wait_for_cluster_ready ${CLUSTER_NAME}
+  # sleeping to address issue #324
+  sleep 120
+  export EXPIRATION_TIME=$(cat ${json_file} | jq -r .rosa_expiration_time)
+  _download_kubeconfig "$(_get_cluster_id ${CLUSTER_NAME})" ./kubeconfig
+  _get_sc_mc_details
+  echo "Index Management cluster info"
+  index_metadata "management"
+  _download_kubeconfig "$(ocm list clusters --no-headers --columns id ${MGMT_CLUSTER_NAME})" ./mgmt_kubeconfig
+  kubectl delete secret staging-mgmt-cluster-kubeconfig || true
+  kubectl create secret generic staging-mgmt-cluster-kubeconfig --from-file=config=./mgmt_kubeconfig
+
+  unset KUBECONFIG
+  kubectl delete secret ${KUBECONFIG_NAME} || true
+  kubectl create secret generic ${KUBECONFIG_NAME} --from-file=config=./kubeconfig
+  if [[ $INSTALL_METHOD == "osd" ]]; then
+    export PASSWORD=$(echo ${CLUSTER_NAME} | md5sum | awk '{print $1}')
+    ocm create idp -n localauth -t htpasswd --username kubeadmin --password ${PASSWORD} -c ${CLUSTER_NAME}
+    ocm create user kubeadmin -c "$(_get_cluster_id ${CLUSTER_NAME})" --group=cluster-admins
+    # set expiration time
+    EXPIRATION_STRING=$(date -d "${EXPIRATION_TIME} minutes" '+{"expiration_timestamp": "%FT%TZ"}')
+    ocm patch /api/clusters_mgmt/v1/clusters/"$(_get_cluster_id ${CLUSTER_NAME})" <<< ${EXPIRATION_STRING}
+    echo "Cluster is ready, deleting OSD access keys now.."
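+    # The temporary OsdCcsAdmin access key created in setup() is no longer needed once the cluster is ready, so it is removed below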
+ aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id $AWS_ACCESS_KEY_ID || true + kubectl delete secret ${KUBEADMIN_NAME} || true + kubectl create secret generic ${KUBEADMIN_NAME} --from-literal=KUBEADMIN_PASSWORD=${PASSWORD} + else + URL=$(rosa describe cluster -c $CLUSTER_NAME --output json | jq -r ".api.url") + START_TIMER=$(date +%s) + PASSWORD=$(rosa create admin -c "$(_get_cluster_id ${CLUSTER_NAME})" -y 2>/dev/null | grep "oc login" | awk '{print $7}') + CURRENT_TIMER=$(date +%s) + DURATION=$(($CURRENT_TIMER - $START_TIMER)) + INDEXDATA+=("cluster_admin_create-${DURATION}") + kubectl delete secret ${KUBEADMIN_NAME} || true + kubectl create secret generic ${KUBEADMIN_NAME} --from-literal=KUBEADMIN_PASSWORD=${PASSWORD} + _login_check $URL $PASSWORD + # set expiration to 24h + rosa edit cluster -c "$(_get_cluster_id ${CLUSTER_NAME})" --expiration=${EXPIRATION_TIME}m + fi + index_metadata "cluster-install" + return 0 +} + +index_metadata(){ + if [[ ! "${INDEXDATA[*]}" =~ "cleanup" ]] ; then + _download_kubeconfig "$(_get_cluster_id ${CLUSTER_NAME})" ./kubeconfig + export KUBECONFIG=./kubeconfig + fi + if [[ $INSTALL_METHOD == "osd" ]]; then + export PLATFORM="AWS-MS" + export CLUSTER_VERSION="${OCM_VERSION}" + else + export PLATFORM="ROSA" + export CLUSTER_VERSION="${ROSA_VERSION}" + fi + if [ "$1" == "management" ]; then + METADATA=$(cat << EOF +{ +"uuid" : "${UUID}", +"aws_authentication_method": "${AWS_AUTHENTICATION_METHOD}", +"version": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".openshift_version")", +"infra_id": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".infra_id")", +"cluster_name": "$MGMT_CLUSTER_NAME", +"cluster_id": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".id")", +"base_domain": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".dns.base_domain")", +"aws_region": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".region.id")", +"workers": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".nodes.autoscale_compute.max_replicas")", +"workers_type": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".nodes.compute_machine_type.id")", +"network_type": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".network.type")", +"install_method": "rosa", +"provision_shard": "$STAGE_PROV_SHARD", +"hostedclusters": "$NUMBER_OF_HC" +} +EOF +) + elif [ "$1" == "cluster-install" ]; then + METADATA=$(cat << EOF +{ +"uuid" : "${UUID}", +"aws_authentication_method": "${AWS_AUTHENTICATION_METHOD}", +"mgmt_cluster_name": "$MGMT_CLUSTER_NAME", +"workers": "$COMPUTE_WORKERS_NUMBER", +"cluster_name": "${CLUSTER_NAME}", +"cluster_id": "$(_get_cluster_id ${CLUSTER_NAME})", +"network_type": "${NETWORK_TYPE}", +"version": "${CLUSTER_VERSION}", +"operation": "install", +"install_method": "rosa", +"status": "$END_CLUSTER_STATUS", +"timestamp": "$(date +%s%3N)" +EOF +) + INSTALL_TIME=0 + TOTAL_TIME=0 + WORKER_READY_TIME=0 + for i in "${INDEXDATA[@]}" ; do IFS="-" ; set -- $i + METADATA="${METADATA}, \"$1\":\"$2\"" + if [ $1 != "day2operations" ] && [ $1 != "login" ] ; then + INSTALL_TIME=$((${INSTALL_TIME} + $2)) + elif [ $1 == "day2operations" ]; then + WORKER_READY_TIME=$2 + else + TOTAL_TIME=$2 + fi + done + IFS=" " + METADATA="${METADATA}, \"duration\":\"${INSTALL_TIME}\"" + METADATA="${METADATA}, \"workers_ready\":\"$(($INSTALL_TIME + $WORKER_READY_TIME))\"" + METADATA="${METADATA} }" + else + METADATA=$(cat << EOF +{ +"uuid" : "${UUID}", +"mgmt_cluster_name": "$MGMT_CLUSTER_NAME", +"workers": "$COMPUTE_WORKERS_NUMBER", +"cluster_name": "${CLUSTER_NAME}", +"cluster_id": "$ROSA_CLUSTER_ID", +"network_type": "${NETWORK_TYPE}", +"version": 
"${CLUSTER_VERSION}", +"operation": "destroy", +"install_method": "rosa", +"duration": "$DURATION", +"timestamp": "$(date +%s%3N)" +} +EOF +) + fi + printf "Indexing installation timings to ES" + curl -k -sS -X POST -H "Content-type: application/json" ${ES_SERVER}/hypershift-wrapper-timers/_doc -d "${METADATA}" -o /dev/null + + unset KUBECONFIG + return 0 +} + +index_mgmt_cluster_stat(){ + echo "Indexing Management cluster stat..." + cd /home/airflow/workspace + echo "Installing kube-burner" + _download_kubeconfig "$(ocm list clusters --no-headers --columns id ${MGMT_CLUSTER_NAME})" ./mgmt_kubeconfig + export KUBE_BURNER_RELEASE=${KUBE_BURNER_RELEASE:-1.5} + curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v${KUBE_BURNER_RELEASE}/kube-burner-${KUBE_BURNER_RELEASE}-Linux-x86_64.tar.gz -o kube-burner.tar.gz + sudo tar -xvzf kube-burner.tar.gz -C /usr/local/bin/ + git clone -q -b ${E2E_BENCHMARKING_BRANCH} ${E2E_BENCHMARKING_REPO} --depth=1 --single-branch + METRIC_PROFILE=/home/airflow/workspace/e2e-benchmarking/workloads/kube-burner-ocp-wrapper/metrics-profiles/mc-metrics.yml + cat > baseconfig.yml << EOF +--- +global: + indexerConfig: + esServers: ["${ES_SERVER}"] + insecureSkipVerify: true + defaultIndex: ${ES_INDEX} + type: elastic +EOF + + HCP_NAMESPACE="$(_get_cluster_id ${CLUSTER_NAME})-$CLUSTER_NAME" + MC_PROMETHEUS=https://$(oc --kubeconfig=./mgmt_kubeconfig get route -n openshift-monitoring prometheus-k8s -o jsonpath="{.spec.host}") + MC_PROMETHEUS_TOKEN=$(oc --kubeconfig=./mgmt_kubeconfig sa new-token -n openshift-monitoring prometheus-k8s) + Q_NODES=$(curl -H "Authorization: Bearer ${MC_PROMETHEUS_TOKEN}" -k --silent --globoff ${MC_PROMETHEUS}/api/v1/query?query='sum(kube_node_role{role!~"master|infra|workload|obo"})by(node)&time='$(date +"%s")'' | jq -r '.data.result[].metric.node' | xargs) + MGMT_WORKER_NODES=${Q_NODES// /|} + echo "Exporting required vars" + cat << EOF +MC_PROMETHEUS: ${MC_PROMETHEUS} +MC_PROMETHEUS_TOKEN: +HCP_NAMESPACE: ${HCP_NAMESPACE} +MGMT_WORKER_NODES: ${MGMT_WORKER_NODES} +elapsed: "20m:" + +EOF + export MC_PROMETHEUS MC_PROMETHEUS_TOKEN HCP_NAMESPACE MGMT_WORKER_NODES elapsed + METADATA=$(cat << EOF +{ +"uuid":"${UUID}", +"timestamp": "$(date +%s%3N)", +"hostedClusterName": "${HC_INFRASTRUCTURE_NAME}", +"clusterName": "${HC_INFRASTRUCTURE_NAME}", +"mgmtClusterName": "${MGMT_CLUSTER_NAME}" +} +EOF +) + printf "Indexing metadata to ES" + curl -k -sS -X POST -H "Content-type: application/json" ${ES_SERVER}/${ES_INDEX}/_doc -d "${METADATA}" -o /dev/null + + echo "Running kube-burner index.." + kube-burner index --uuid=${UUID} --prometheus-url=${MC_PROMETHEUS} --token ${MC_PROMETHEUS_TOKEN} --start=$START_TIME --end=$((END_TIME+600)) --step 2m --metrics-profile ${METRIC_PROFILE} --config ./baseconfig.yml --log-level debug + echo "Finished indexing results" +} + +cleanup(){ + if [[ $INSTALL_METHOD == "osd" ]]; then + ocm delete cluster "$(_get_cluster_id ${CLUSTER_NAME})" + echo "Cluster is getting Uninstalled, deleting OSD access keys now.." 
+    aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id $AWS_ACCESS_KEY_ID || true
+  else
+    export ROSA_CLUSTER_ID=$(_get_cluster_id ${CLUSTER_NAME})
+    export HC_INFRASTRUCTURE_NAME=${ROSA_CLUSTER_ID}
+    CLEANUP_START_TIMING=$(date +%s)
+    export START_TIME=$CLEANUP_START_TIMING
+    rosa delete cluster -c ${ROSA_CLUSTER_ID} -y
+    rosa logs uninstall -c ${ROSA_CLUSTER_ID} --watch
+    if [ $AWS_AUTHENTICATION_METHOD == "sts" ] ; then
+      rosa delete operator-roles -c ${ROSA_CLUSTER_ID} -m auto --yes || true
+      rosa delete oidc-provider -c ${ROSA_CLUSTER_ID} -m auto --yes || true
+    fi
+    DURATION=$(($(date +%s) - $CLEANUP_START_TIMING))
+    INDEXDATA+=("cleanup-${DURATION}")
+    export END_TIME=$(date +"%s")
+    _delete_aws_vpc
+    if [ -z $OIDC_CONFIG ]; then _oidc_config delete $OIDC_PREFIX; fi
+  fi
+  return 0
+}
+
+export INSTALL_METHOD=$(cat ${json_file} | jq -r .cluster_install_method)
+export HC_INTERVAL=$(cat ${json_file} | jq -r .hcp_install_interval)
+SKEW_FACTOR=$(echo $HOSTED_ID|awk -F- '{print$2}')
+sleep $(($HC_INTERVAL*$SKEW_FACTOR)) # 60*1, 60*2..
+setup
+
+if [[ "$operation" == "install" ]]; then
+  printf "INFO: Checking if cluster is already installed"
+  CLUSTER_STATUS=$(_get_cluster_status ${CLUSTER_NAME})
+  if [ -z "${CLUSTER_STATUS}" ] ; then
+    printf "INFO: Cluster not found, installing..."
+    echo "pre-clean AWS resources"
+    _delete_aws_vpc
+    install
+    export HC_INFRASTRUCTURE_NAME=$(_get_cluster_id ${CLUSTER_NAME})
+    index_mgmt_cluster_stat "install-metrics"
+
+  elif [ "${CLUSTER_STATUS}" == "ready" ] ; then
+    printf "INFO: Cluster ${CLUSTER_NAME} already installed and ready, reusing..."
+    postinstall
+  elif [ "${CLUSTER_STATUS}" == "error" ] ; then
+    printf "INFO: Cluster ${CLUSTER_NAME} errored, cleaning it up now..."
+    cleanup
+    printf "INFO: Failing this install so that a fresh install is retried"
+    exit 1
+  else
+    printf "INFO: Cluster ${CLUSTER_NAME} already installed but not ready, exiting..."
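+    # Any other status (e.g. installing or pending) fails the task without cleanup so the existing cluster can be inspected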
+ exit 1 + fi + +elif [[ "$operation" == "cleanup" ]]; then + printf "Running Cleanup Steps" + _get_sc_mc_details + cleanup + index_metadata + index_mgmt_cluster_stat "destroy-metrics" + rosa logout + ocm logout +fi diff --git a/dags/openshift_nightlies/scripts/install/rosa.sh b/dags/openshift_nightlies/scripts/install/rosa.sh index 8e53dea91..998e8b0cb 100755 --- a/dags/openshift_nightlies/scripts/install/rosa.sh +++ b/dags/openshift_nightlies/scripts/install/rosa.sh @@ -15,11 +15,7 @@ do done _get_cluster_id(){ - if [[ $INSTALL_METHOD == "osd" ]]; then - echo "$(ocm list clusters --no-headers --columns id $1)" - else - echo "$(rosa list clusters -o json | jq -r '.[] | select(.name == '\"$1\"') | .id')" - fi + echo "$(rosa list clusters -o json | jq -r '.[] | select(.name == '\"$1\"') | .id')" } _download_kubeconfig(){ @@ -27,11 +23,7 @@ _download_kubeconfig(){ } _get_cluster_status(){ - if [[ $INSTALL_METHOD == "osd" ]]; then - echo "$(ocm list clusters --no-headers --columns state $1 | xargs)" - else - echo "$(rosa list clusters -o json | jq -r '.[] | select(.name == '\"$1\"') | .status.state')" - fi + echo "$(rosa list clusters -o json | jq -r '.[] | select(.name == '\"$1\"') | .status.state')" } _wait_for_nodes_ready(){ @@ -39,13 +31,8 @@ _wait_for_nodes_ready(){ export KUBECONFIG=./kubeconfig ALL_READY_ITERATIONS=0 ITERATIONS=0 - if [ $HCP == "true" ]; then - NODES_COUNT=$2 - ALL_READY_ITERATIONS=4 #reduced extra buffers for hosted cp clusters - else - # Node count is number of workers + 3 infra - NODES_COUNT=$(($2+3)) - fi + # Node count is number of workers + 3 infra + NODES_COUNT=$(($2+3)) # 30 seconds per node, waiting for all nodes ready to finalize while [ ${ITERATIONS} -le $((${NODES_COUNT}*5)) ] ; do NODES_READY_COUNT=$(oc get nodes -l $3 | grep " Ready " | wc -l) @@ -71,152 +58,6 @@ _wait_for_nodes_ready(){ exit 1 } -_aws_cmd(){ - ITR=0 - while [ $ITR -le 30 ]; do - if [[ "$(aws ec2 $1 2>&1)" == *"error"* ]]; then - echo "Failed to $1, retrying after 30 seconds" - ITR=$(($ITR+1)) - sleep 10 - else - return 0 - fi - done - echo "Failed to $1 after 10 minutes of multiple retries" - exit 1 -} - -_login_check(){ - echo "Trying to oc login with password" - ITR=1 - START_TIMER=$(date +%s) - while [ $ITR -le 100 ]; do - if [[ "$(oc login $1 --username cluster-admin --password $2 --insecure-skip-tls-verify=true --request-timeout=30s 2>&1)" == *"failed"* ]]; then - echo "Attempt $ITR: Failed to login $1, retrying after 5 seconds" - ITR=$(($ITR+1)) - sleep 5 - RECHECK=1 - else - if [[ $RECHECK -eq 10 ]]; then - CURRENT_TIMER=$(date +%s) - # Time since rosa cluster is ready until all nodes are ready - DURATION=$(($CURRENT_TIMER - $START_TIMER)) - INDEXDATA+=("cluster_admin_login-${DURATION}") - _adm_logic_check $1 $2 - return 0 - else - echo "Rechecking login for $((10-$RECHECK)) more times" - RECHECK=$(($RECHECK+1)) - sleep 1 - fi - fi - done - END_CLUSTER_STATUS="Ready. Not Access" - echo "Failed to login after 100 attempts with 5 sec interval" -} - -_adm_logic_check(){ - ITR=1 - START_TIMER=$(date +%s) - while [ $ITR -le 100 ]; do - oc login $1 --username cluster-admin --password $2 --insecure-skip-tls-verify=true --request-timeout=30s - CHECK=$(oc adm top images 2>&1 > /dev/null) - if [[ $? 
!= 0 ]]; then - echo "Attempt $ITR: Failed to login $1, retrying after 5 seconds" - ITR=$(($ITR+1)) - sleep 5 - else - CURRENT_TIMER=$(date +%s) - # Time since rosa cluster is ready until all nodes are ready - DURATION=$(($CURRENT_TIMER - $START_TIMER)) - INDEXDATA+=("cluster_oc_adm-${DURATION}") - return 0 - fi - done - END_CLUSTER_STATUS="Ready. Not Access" - echo "Failed to execute oc adm commands after 100 attempts with 5 sec interval" -} - -_balance_infra(){ - if [[ $1 == "prometheus-k8s" ]] ; then - echo "Initiate migration of prometheus componenets to infra nodepools" - oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s - oc get sts prometheus-k8s -n openshift-monitoring - echo "Restart stateful set pods" - oc rollout restart -n openshift-monitoring statefulset/prometheus-k8s - echo "Wait till they are completely restarted" - oc rollout status -n openshift-monitoring statefulset/prometheus-k8s - echo "Check pods status again and the hosting nodes" - oc get pods -n openshift-monitoring -o wide | grep prometheus-k8s - else - echo "Initiate migration of ingress router-default pods to infra nodepools" - echo "Add toleration to use infra nodes" - oc patch ingresscontroller -n openshift-ingress-operator default --type merge --patch '{"spec":{"nodePlacement":{"nodeSelector":{"matchLabels":{"node-role.kubernetes.io/infra":""}},"tolerations":[{"effect":"NoSchedule","key":"node-role.kubernetes.io/infra","operator":"Exists"}]}}}' - echo "Wait till it gets rolled out" - sleep 60 - oc get pods -n openshift-ingress -o wide - fi -} - -_check_infra(){ - TRY=0 - while [ $TRY -le 3 ]; do # Attempts three times to migrate pods - FLAG_ERROR="" - _balance_infra $1 - for node in $(oc get pods -n $2 -o wide | grep -i $1 | grep -i running | awk '{print$7}'); - do - if [[ $(oc get nodes | grep infra | awk '{print$1}' | grep $node) != "" ]]; then - echo "$node is an infra node" - else - echo "$1 pod on $node is not an infra node, retrying" - FLAG_ERROR=true - fi - done - if [[ $FLAG_ERROR == "" ]]; then return 0; else TRY=$((TRY+1)); fi - done - echo "Failed to move $1 pods in $2 namespace" - exit 1 -} - -_wait_for_extra_nodes_ready(){ - export NODE_LABLES=$(cat ${json_file} | jq -r .extra_machinepool[].labels) - for label in $NODE_LABLES; - do - REPLICA=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.labels == '\"$label\"')'.replica) - NODES_COUNT=$((REPLICA*3)) - if [[ $label == *"infra"* ]] ; then NODES_COUNT=$((REPLICA*2)); fi - _wait_for_nodes_ready $CLUSTER_NAME $NODES_COUNT $label - if [[ $label == *"infra"* ]] ; then - _check_infra prometheus-k8s openshift-monitoring - _check_infra router openshift-ingress - fi - done - return 0 -} - -_add_machinepool(){ - export MACHINEPOOLS=$(cat ${json_file} | jq -r .extra_machinepool[].name) - for mcp in $MACHINEPOOLS; - do - echo "Add an extra machinepool - $mcp to cluster" - ZONES="a b c" - MC_NAME=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.name) - REPLICA=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.replica) - INS_TYPE=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.instance_type) - LABELS=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.labels) - TAINTS=$(cat ${json_file} | jq -r .extra_machinepool[] | jq -r 'select(.name == '\"$mcp\"')'.taints) - if [[ $MC_NAME == *"infra"* ]]; then ZONES="a b"; fi - for ZONE in $ZONES; - do - if [[ $(rosa list machinepool --cluster 
"$(_get_cluster_id ${CLUSTER_NAME})" | grep $MC_NAME-$ZONE) == "" ]]; then - rosa create machinepool --cluster "$(_get_cluster_id ${CLUSTER_NAME})" --name $MC_NAME-$ZONE --instance-type ${INS_TYPE} --replicas $REPLICA --availability-zone $AWS_REGION$ZONE --labels $LABELS --taints $TAINTS - fi - done - done - _wait_for_extra_nodes_ready - return 0 -} - _wait_for_cluster_ready(){ START_TIMER=$(date +%s) echo "INFO: Installation starts at $(date -d @${START_TIMER})" @@ -235,12 +76,8 @@ _wait_for_cluster_ready(){ START_TIMER=${CURRENT_TIMER} echo "INFO: Cluster status changed to ${CLUSTER_STATUS}" if [ ${CLUSTER_STATUS} == "error" ] ; then - if [[ $INSTALL_METHOD == "osd" ]]; then - echo "ERROR: Cluster $1 not installed after 1.5 hours.." - else - rosa logs install -c $1 - rosa describe cluster -c $1 - fi + rosa logs install -c $1 + rosa describe cluster -c $1 return 1 fi fi @@ -254,14 +91,9 @@ _wait_for_cluster_ready(){ # Time since rosa cluster is ready until all nodes are ready DURATION=$(($CURRENT_TIMER - $START_TIMER)) INDEXDATA+=("day2operations-${DURATION}") - if [ $HCP == "true" ]; then _add_machinepool $URL $PASSWORD; fi - if [[ $INSTALL_METHOD == "osd" ]]; then - echo "INFO: Cluster and nodes on ready status.." - else - echo "INFO: Cluster and nodes on ready status at ${CURRENT_TIMER}, dumping installation logs..." - rosa logs install -c $1 - rosa describe cluster -c $1 - fi + echo "INFO: Cluster and nodes on ready status at ${CURRENT_TIMER}, dumping installation logs..." + rosa logs install -c $1 + rosa describe cluster -c $1 return 0 elif [ ${CLUSTER_STATUS} == "installing" ] ; then echo "INFO: ${ITERATIONS}/90. Cluster on ${CLUSTER_STATUS} status, waiting 60 seconds for next check" @@ -273,157 +105,12 @@ _wait_for_cluster_ready(){ fi PREVIOUS_STATUS=${CLUSTER_STATUS} done - if [[ $INSTALL_METHOD == "osd" ]]; then - echo "ERROR: Cluster $1 not installed after 3 hours.." - else - END_CLUSTER_STATUS="Not Ready" - echo "ERROR: Cluster $1 not installed after 90 iterations, dumping installation logs..." 
- rosa logs install -c $1 - rosa describe cluster -c $1 - fi - exit 1 -} - -_create_aws_vpc(){ - - echo "Create Internet Gateway" - aws ec2 create-internet-gateway --tag-specifications ResourceType=internet-gateway,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=igw-$CLUSTER_NAME}]" --output json - export IGW=$(aws ec2 describe-internet-gateways --filters "Name=tag:Name,Values=igw-$CLUSTER_NAME" --output json | jq -r ".InternetGateways[0].InternetGatewayId") - - echo "Create VPC and attach internet gateway" - aws ec2 create-vpc --cidr-block 10.0.0.0/16 --tag-specifications ResourceType=vpc,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=vpc-$CLUSTER_NAME}]" --output json - export VPC=$(aws ec2 describe-vpcs --filters "Name=tag:HostedClusterName,Values=$CLUSTER_NAME" --output json | jq -r '.Vpcs[0].VpcId') - - aws ec2 modify-vpc-attribute --vpc-id $VPC --enable-dns-support "{\"Value\":true}" - aws ec2 modify-vpc-attribute --vpc-id $VPC --enable-dns-hostnames "{\"Value\":true}" - aws ec2 attach-internet-gateway --vpc-id $VPC --internet-gateway-id $IGW - - aws ec2 create-route-table --vpc-id $VPC --tag-specifications ResourceType=route-table,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=public-rt-table-$CLUSTER_NAME}]" --output json - export PUB_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME" --output json | jq -r '.RouteTables[0].RouteTableId') - aws ec2 create-route --route-table-id $PUB_RT_TB --destination-cidr-block 0.0.0.0/0 --gateway-id $IGW - - ITR=0 - export ALL_PRI_RT_TB="" - for ZONE in a b c; - do - ITR=$((ITR+1)) - echo "Allocate Elastic IP" - aws ec2 allocate-address --tag-specifications ResourceType=elastic-ip,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=eip-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json - export E_IP=$(aws ec2 describe-addresses --filters "Name=tag:Name,Values=eip-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Addresses[0].AllocationId") - - echo "Create Subnets and Route tables" - aws ec2 create-subnet --vpc-id $VPC --cidr-block 10.0.$ITR.0/24 --availability-zone $AWS_REGION$ZONE --tag-specifications ResourceType=subnet,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json - export PUB_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") - aws ec2 create-nat-gateway --subnet-id $PUB_SUB --allocation-id $E_IP --tag-specifications ResourceType=natgateway,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json - export NGW=$(aws ec2 describe-nat-gateways --filter "Name=tag:Name,Values=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".NatGateways[]" | jq -r 'select(.State == "available" or .State == "pending")' | jq -r ".NatGatewayId") - echo "Wait until NatGateway $NGW is available" - aws ec2 wait nat-gateway-available --nat-gateway-ids $NGW - aws ec2 associate-route-table --route-table-id $PUB_RT_TB --subnet-id $PUB_SUB - - aws ec2 create-subnet --vpc-id $VPC --cidr-block 10.0.$((ITR+10)).0/24 --availability-zone $AWS_REGION$ZONE --tag-specifications ResourceType=subnet,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json - export PRI_SUB=$(aws ec2 describe-subnets --filters 
"Name=tag:Name,Values=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") - aws ec2 create-route-table --vpc-id $VPC --tag-specifications ResourceType=route-table,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE}]" --output json - export PRI_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].RouteTableId') - export ALL_PRI_RT_TB="${ALL_PRI_RT_TB} ${PRI_RT_TB}" - aws ec2 associate-route-table --route-table-id $PRI_RT_TB --subnet-id $PRI_SUB - aws ec2 create-route --route-table-id $PRI_RT_TB --destination-cidr-block 0.0.0.0/0 --gateway-id $NGW - done - - echo "Create private VPC endpoint to S3" - aws ec2 create-vpc-endpoint --vpc-id $VPC --service-name com.amazonaws.$AWS_REGION.s3 --route-table-ids $ALL_PRI_RT_TB --tag-specifications ResourceType=vpc-endpoint,Tags="[{Key=HostedClusterName,Value=$CLUSTER_NAME},{Key=Name,Value=vpce-$CLUSTER_NAME}]" -} - -_delete_aws_vpc(){ - echo "Delete Subnets, Routes, Gateways, VPC if exists" - export VPC=$(aws ec2 describe-vpcs --filters "Name=tag:HostedClusterName,Values=$CLUSTER_NAME" --output json | jq -r '.Vpcs[0].VpcId') - if [ $VPC != null ]; then - echo "Delete VPC Endpoint" - export VPCE=$(aws ec2 describe-vpc-endpoints --filters "Name=tag:Name,Values=vpce-$CLUSTER_NAME" --output json | jq -r '.VpcEndpoints[0].VpcEndpointId') - if [ $VPCE != null ]; then _aws_cmd "delete-vpc-endpoints --vpc-endpoint-ids $VPCE"; fi - - export ELB=$(aws elb describe-load-balancers --output json | jq -r '.LoadBalancerDescriptions[]'| jq -r 'select(.VPCId == '\"${VPC}\"')' | jq -r '.LoadBalancerName') - if [ $ELB != "" ]; then aws elb delete-load-balancer --load-balancer-name $ELB; fi - - for ZONE in a b c; - do - echo "Delete Subnets and Route tables" - export PRI_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].RouteTableId') - export RT_TB_ASSO_ID=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=private-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].Associations[0].RouteTableAssociationId') - export PRI_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=private-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") - - if [ $PRI_RT_TB != null ]; then _aws_cmd "delete-route --route-table-id $PRI_RT_TB --destination-cidr-block 0.0.0.0/0"; fi - if [ $RT_TB_ASSO_ID != null ]; then _aws_cmd "disassociate-route-table --association-id $RT_TB_ASSO_ID"; fi - if [ $PRI_RT_TB != null ]; then _aws_cmd "delete-route-table --route-table-id $PRI_RT_TB"; fi - if [ $PRI_SUB != null ]; then _aws_cmd "delete-subnet --subnet-id $PRI_SUB"; fi - - export RT_TB_ASSO_ID=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r '.RouteTables[0].Associations[].RouteTableAssociationId') - export NGW=$(aws ec2 describe-nat-gateways --filter "Name=tag:Name,Values=ngw-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".NatGateways[]" | jq -r 'select(.State == "available")' | jq -r ".NatGatewayId") - export PUB_SUB=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=public-subnet-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Subnets[0].SubnetId") - export E_IP=$(aws ec2 describe-addresses 
--filters "Name=tag:Name,Values=eip-$CLUSTER_NAME-$AWS_REGION$ZONE" --output json | jq -r ".Addresses[0].AllocationId") - - if [ $RT_TB_ASSO_ID != null ]; then for _id in $RT_TB_ASSO_ID; do _aws_cmd "disassociate-route-table --association-id $_id"; done; fi - if [ $NGW != null ]; then _aws_cmd "delete-nat-gateway --nat-gateway-id $NGW"; fi - if [ $PUB_SUB != null ]; then _aws_cmd "delete-subnet --subnet-id $PUB_SUB"; fi - if [ $E_IP != null ]; then _aws_cmd "release-address --allocation-id $E_IP"; fi - done - - export PUB_RT_TB=$(aws ec2 describe-route-tables --filters "Name=tag:Name,Values=public-rt-table-$CLUSTER_NAME" --output json | jq -r '.RouteTables[0].RouteTableId') - - if [ $PUB_RT_TB != null ]; then _aws_cmd "delete-route --route-table-id $PUB_RT_TB --destination-cidr-block 0.0.0.0/0"; fi - if [ $PUB_RT_TB != null ]; then _aws_cmd "delete-route-table --route-table-id $PUB_RT_TB"; fi - - export IGW=$(aws ec2 describe-internet-gateways --filters "Name=tag:Name,Values=igw-$CLUSTER_NAME" --output json | jq -r ".InternetGateways[0].InternetGatewayId") - if [ $IGW != null ]; then _aws_cmd "detach-internet-gateway --internet-gateway-id $IGW --vpc-id $VPC"; fi - if [ $IGW != null ]; then _aws_cmd "delete-internet-gateway --internet-gateway-id $IGW"; fi + END_CLUSTER_STATUS="Not Ready" + echo "ERROR: Cluster $1 not installed after 90 iterations, dumping installation logs..." + rosa logs install -c $1 + rosa describe cluster -c $1 - echo "Delete Security Group Rules" - for g in $(aws ec2 describe-security-groups --filters "Name=vpc-id,Values=$VPC" --output json | jq -r ".SecurityGroups[].GroupId"); - do - for r in $(aws ec2 describe-security-group-rules --filters "Name=group-id,Values=$g" --output json | jq -r ".SecurityGroupRules[]" | jq -r "select(.IsEgress == false)" | jq -r ".SecurityGroupRuleId"); - do - aws ec2 revoke-security-group-ingress --security-group-rule-ids $r --group-id $g - done - - for r in $(aws ec2 describe-security-group-rules --filters "Name=group-id,Values=$g" --output json | jq -r ".SecurityGroupRules[]" | jq -r "select(.IsEgress == true)" | jq -r ".SecurityGroupRuleId"); - do - aws ec2 revoke-security-group-egress --security-group-rule-ids $r --group-id $g - done - done - - for g in $(aws ec2 describe-security-groups --filters "Name=vpc-id,Values=$VPC" --output json | jq -r ".SecurityGroups[]" | jq -r 'select(.GroupName != "default")' | jq -r ".GroupId"); - do - echo "Delete Security Groups $g" - _aws_cmd "delete-security-group --group-id $g" - done - - echo "Delete VPC $VPC" - _aws_cmd "delete-vpc --vpc-id $VPC" - fi -} - -_oidc_config(){ - echo "${1} OIDC config, with prefix ${2}" - if [[ $1 == "create" ]]; then - echo "${1} OIDC config" - rosa create oidc-config --mode=auto --managed=false --prefix ${2} -y - export OIDC_CONFIG=$(rosa list oidc-config | grep ${2} | awk '{print$1}') - else - export OIDC_CONFIG=$(rosa list oidc-config | grep ${2} | awk '{print$1}') - if [ ! 
-z $OIDC_CONFIG ]; then rosa delete oidc-config --mode=auto --oidc-config-id ${OIDC_CONFIG} -y || true; fi # forcing exit 0, as this command may file if it is a shared oidc config - fi -} - -_get_sc_mc_details(){ - if [ -z $SVC_CLUSTER_NAME ]; then - echo "Find Service Cluster" - export SVC_CLUSTER_NAME=$(ocm describe cluster ${CLUSTER_NAME} | grep "Service Cluster" | awk '{print$3}') - fi - if [ -z $MGMT_CLUSTER_NAME ]; then - export MGMT_CLUSTER_NAME=$(ocm describe cluster ${CLUSTER_NAME} | grep "Management Cluster" | awk '{print$3}') - fi - echo "Read Management cluster details" - export MGMT_CLUSTER_DETAILS=$(ocm get /api/clusters_mgmt/v1/clusters | jq -r ".items[]" | jq -r 'select(.name == '\"$MGMT_CLUSTER_NAME\"')') - export NUMBER_OF_HC=$(cat ${json_file} | jq -r .number_of_hostedcluster) + exit 1 } setup(){ @@ -444,20 +131,7 @@ setup(){ export COMPUTE_WORKERS_NUMBER=$(cat ${json_file} | jq -r .openshift_worker_count) export NETWORK_TYPE=$(cat ${json_file} | jq -r .openshift_network_type) export ES_SERVER=$(cat ${json_file} | jq -r .es_server) - export HCP=$(cat ${json_file} | jq -r .rosa_hcp) export UUID=$(uuidgen) - if [ $HCP == "true" ]; then - export STAGE_CONFIG="" - export MGMT_CLUSTER_NAME=$(cat ${json_file} | jq -r .staging_mgmt_cluster_name) - export SVC_CLUSTER_NAME=$(cat ${json_file} | jq -r .staging_svc_cluster_name) - export STAGE_PROV_SHARD=$(cat ${json_file} | jq -r .staging_mgmt_provisioner_shards) - export OIDC_PREFIX=$(cat ${json_file} | jq -r .openshift_cluster_name) - export CLUSTER_NAME="${CLUSTER_NAME}-${HOSTED_ID}" # perf-as3-hcp-1, perf-as3-hcp-2.. - export KUBECONFIG_NAME=$(echo $KUBECONFIG_NAME | awk -F-kubeconfig '{print$1}')-$HOSTED_ID-kubeconfig - export KUBEADMIN_NAME=$(echo $KUBEADMIN_NAME | awk -F-kubeadmin '{print$1}')-$HOSTED_ID-kubeadmin - UUID=$(echo $AIRFLOW_CTX_DAG_RUN_ID | base64 | cut -c 1-32 ) - export UUID=${UUID} - fi export OCM_CLI_VERSION=$(cat ${json_file} | jq -r .ocm_cli_version) if [[ ${OCM_CLI_VERSION} != "container" ]]; then OCM_CLI_FORK=$(cat ${json_file} | jq -r .ocm_cli_fork) @@ -467,98 +141,42 @@ setup(){ sudo mv ocm /usr/local/bin/ popd fi - if [[ $INSTALL_METHOD == "osd" ]]; then - echo "Clean-up existing OSD access keys.." - AWS_KEY=$(aws iam list-access-keys --user-name OsdCcsAdmin --output text --query 'AccessKeyMetadata[*].AccessKeyId') - LEN_AWS_KEY=`echo $AWS_KEY | wc -w` - if [[ ${LEN_AWS_KEY} -eq 2 ]]; then - aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id `printf ${AWS_KEY[0]}` - fi - echo "Create new OSD access key.." - export ADMIN_KEY=$(aws iam create-access-key --user-name OsdCcsAdmin) - export AWS_ACCESS_KEY_ID=$(echo $ADMIN_KEY | jq -r '.AccessKey.AccessKeyId') - export AWS_SECRET_ACCESS_KEY=$(echo $ADMIN_KEY | jq -r '.AccessKey.SecretAccessKey') - ocm login --url=https://api.stage.openshift.com --token="${ROSA_TOKEN}" - ocm whoami - sleep 60 # it takes a few sec for new access key - echo "Check AWS Username..." 
- aws iam get-user | jq -r .User.UserName + export ROSA_CLI_VERSION=$(cat ${json_file} | jq -r .rosa_cli_version) + if [[ ${ROSA_CLI_VERSION} != "container" ]]; then + ROSA_CLI_FORK=$(cat ${json_file} | jq -r .rosa_cli_fork) + git clone -q --depth=1 --single-branch --branch ${ROSA_CLI_VERSION} ${ROSA_CLI_FORK} + pushd rosa + make + sudo mv rosa /usr/local/bin/ + popd + fi + ocm login --url=https://api.stage.openshift.com --token="${ROSA_TOKEN}" + ocm whoami + rosa login --env=${ROSA_ENVIRONMENT} + rosa whoami + rosa verify quota + rosa verify permissions + if [ "${MANAGED_OCP_VERSION}" == "latest" ] ; then + export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | sort -rV | head -1) + elif [ "${MANAGED_OCP_VERSION}" == "prelatest" ] ; then + export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | sort -rV | head -2 | tail -1) else - export ROSA_CLI_VERSION=$(cat ${json_file} | jq -r .rosa_cli_version) - if [[ ${ROSA_CLI_VERSION} != "container" ]]; then - ROSA_CLI_FORK=$(cat ${json_file} | jq -r .rosa_cli_fork) - git clone -q --depth=1 --single-branch --branch ${ROSA_CLI_VERSION} ${ROSA_CLI_FORK} - pushd rosa - make - sudo mv rosa /usr/local/bin/ - popd - fi - ocm login --url=https://api.stage.openshift.com --token="${ROSA_TOKEN}" - ocm whoami - rosa login --env=${ROSA_ENVIRONMENT} - rosa whoami - rosa verify quota - rosa verify permissions - if [ "${MANAGED_OCP_VERSION}" == "latest" ] ; then - export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | sort -rV | head -1) - elif [ "${MANAGED_OCP_VERSION}" == "prelatest" ] ; then - export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | sort -rV | head -2 | tail -1) - else - export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | grep ^${MANAGED_OCP_VERSION}$) - fi - [ -z "${ROSA_VERSION}" ] && echo "ERROR: Image not found for version (${version}) on ROSA ${MANAGED_CHANNEL_GROUP} channel group" && exit 1 - return 0 + export ROSA_VERSION=$(rosa list versions -o json --channel-group=${MANAGED_CHANNEL_GROUP} | jq -r '.[] | select(.raw_id|startswith('\"${version}\"')) | .raw_id' | grep ^${MANAGED_OCP_VERSION}$) fi + [ -z "${ROSA_VERSION}" ] && echo "ERROR: Image not found for version (${version}) on ROSA ${MANAGED_CHANNEL_GROUP} channel group" && exit 1 + return 0 } install(){ export COMPUTE_WORKERS_TYPE=$(cat ${json_file} | jq -r .openshift_worker_instance_type) export CLUSTER_AUTOSCALE=$(cat ${json_file} | jq -r .cluster_autoscale) export OIDC_CONFIG=$(cat ${json_file} | jq -r .oidc_config) - if [[ $INSTALL_METHOD == "osd" ]]; then - if [ "${MANAGED_OCP_VERSION}" == "latest" ] ; then - export OCM_VERSION=$(ocm list versions --channel-group ${MANAGED_CHANNEL_GROUP} | grep ^${version} | sort -rV | head -1) - elif [ "${MANAGED_OCP_VERSION}" == "prelatest" ] ; then - export OCM_VERSION=$(ocm list versions --channel-group ${MANAGED_CHANNEL_GROUP} | grep ^${version} | sort -rV | head -2 | tail -1) - else - export OCM_VERSION=$(ocm list versions --channel-group ${MANAGED_CHANNEL_GROUP} | grep ^${MANAGED_OCP_VERSION}) - fi - [ -z 
${OCM_VERSION} ] && echo "ERROR: Image not found for version (${version}) on OCM ${MANAGED_CHANNEL_GROUP} channel group" && exit 1 - if [[ $CLUSTER_AUTOSCALE == "true" ]]; then - export MIN_COMPUTE_WORKERS_NUMBER=$(cat ${json_file} | jq -r .min_openshift_worker_count) - export CLUSTER_SIZE="--enable-autoscaling --min-replicas ${MIN_COMPUTE_WORKERS_NUMBER} --max-replicas ${COMPUTE_WORKERS_NUMBER}" - else - export CLUSTER_SIZE="--compute-nodes ${COMPUTE_WORKERS_NUMBER}" - fi - ocm create cluster --ccs --provider aws --region ${AWS_REGION} --aws-account-id ${AWS_ACCOUNT_ID} --aws-access-key-id ${AWS_ACCESS_KEY_ID} --aws-secret-access-key ${AWS_SECRET_ACCESS_KEY} --channel-group ${MANAGED_CHANNEL_GROUP} --version ${OCM_VERSION} --multi-az --compute-machine-type ${COMPUTE_WORKERS_TYPE} --network-type ${NETWORK_TYPE} ${CLUSTER_NAME} ${CLUSTER_SIZE} - else - export INSTALLATION_PARAMS="" - export ROSA_HCP_PARAMS="" - if [ $AWS_AUTHENTICATION_METHOD == "sts" ] ; then - INSTALLATION_PARAMS="${INSTALLATION_PARAMS} --sts -m auto --yes" - fi - if [ $HCP == "true" ]; then - _create_aws_vpc - echo "Set start time of prom scrape" - export START_TIME=$(date +"%s") - if [ $STAGE_PROV_SHARD != "" ]; then - STAGE_CONFIG="--properties provision_shard_id:${STAGE_PROV_SHARD}" - fi - ALL_SUBNETS=$(aws ec2 describe-subnets --filters "Name=tag:HostedClusterName,Values=$CLUSTER_NAME" --output json | jq -r ".Subnets[].SubnetId") - SUBNETS_IDS="" - for _ID in ${ALL_SUBNETS}; - do - if [[ ${SUBNETS_IDS} == "" ]]; then SUBNETS_IDS=${_ID}; else SUBNETS_IDS=${SUBNETS_IDS}","${_ID}; fi - done - ROSA_HCP_PARAMS="--hosted-cp ${STAGE_CONFIG} --subnet-ids ${SUBNETS_IDS} --machine-cidr 10.0.0.0/16" - export OIDC_CONFIG=$(rosa list oidc-config | grep $OIDC_PREFIX | awk '{print$1}') - if [ -z $OIDC_CONFIG ]; then _oidc_config create $OIDC_PREFIX; fi - ROSA_HCP_PARAMS="${ROSA_HCP_PARAMS} --oidc-config-id ${OIDC_CONFIG}" - else - INSTALLATION_PARAMS="${INSTALLATION_PARAMS} --multi-az" # Multi AZ is default on hosted-cp cluster - fi - rosa create cluster --tags=User:${GITHUB_USERNAME} --cluster-name ${CLUSTER_NAME} --version "${ROSA_VERSION}" --channel-group=${MANAGED_CHANNEL_GROUP} --compute-machine-type ${COMPUTE_WORKERS_TYPE} --replicas ${COMPUTE_WORKERS_NUMBER} --network-type ${NETWORK_TYPE} ${INSTALLATION_PARAMS} ${ROSA_HCP_PARAMS} + export INSTALLATION_PARAMS="" + if [ $AWS_AUTHENTICATION_METHOD == "sts" ] ; then + INSTALLATION_PARAMS="${INSTALLATION_PARAMS} --sts -m auto --yes" fi + INSTALLATION_PARAMS="${INSTALLATION_PARAMS} --multi-az" # Multi AZ is default on hosted-cp cluster + rosa create cluster --tags=User:${GITHUB_USERNAME} --cluster-name ${CLUSTER_NAME} --version "${ROSA_VERSION}" --channel-group=${MANAGED_CHANNEL_GROUP} --compute-machine-type ${COMPUTE_WORKERS_TYPE} --replicas ${COMPUTE_WORKERS_NUMBER} --network-type ${NETWORK_TYPE} ${INSTALLATION_PARAMS} postinstall return 0 } @@ -569,42 +187,19 @@ postinstall(){ sleep 120 export EXPIRATION_TIME=$(cat ${json_file} | jq -r .rosa_expiration_time) _download_kubeconfig "$(_get_cluster_id ${CLUSTER_NAME})" ./kubeconfig - if [ $HCP == "true" ]; then - _get_sc_mc_details - echo "Index Managment cluster info" - index_metadata "management" - _download_kubeconfig "$(ocm list clusters --no-headers --columns id ${MGMT_CLUSTER_NAME})" ./mgmt_kubeconfig - kubectl delete secret staging-mgmt-cluster-kubeconfig || true - kubectl create secret generic staging-mgmt-cluster-kubeconfig --from-file=config=./mgmt_kubeconfig - fi unset KUBECONFIG kubectl delete secret ${KUBECONFIG_NAME} 
|| true kubectl create secret generic ${KUBECONFIG_NAME} --from-file=config=./kubeconfig - if [[ $INSTALL_METHOD == "osd" ]]; then - export PASSWORD=$(echo ${CLUSTER_NAME} | md5sum | awk '{print $1}') - ocm create idp -n localauth -t htpasswd --username kubeadmin --password ${PASSWORD} -c ${CLUSTER_NAME} - ocm create user kubeadmin -c "$(_get_cluster_id ${CLUSTER_NAME})" --group=cluster-admins - # set expiration time - EXPIRATION_STRING=$(date -d "${EXPIRATION_TIME} minutes" '+{"expiration_timestamp": "%FT%TZ"}') - ocm patch /api/clusters_mgmt/v1/clusters/"$(_get_cluster_id ${CLUSTER_NAME})" <<< ${EXPIRATION_STRING} - echo "Cluster is ready, deleting OSD access keys now.." - aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id $AWS_ACCESS_KEY_ID || true - kubectl delete secret ${KUBEADMIN_NAME} || true - kubectl create secret generic ${KUBEADMIN_NAME} --from-literal=KUBEADMIN_PASSWORD=${PASSWORD} - else - URL=$(rosa describe cluster -c $CLUSTER_NAME --output json | jq -r ".api.url") - START_TIMER=$(date +%s) - PASSWORD=$(rosa create admin -c "$(_get_cluster_id ${CLUSTER_NAME})" -y 2>/dev/null | grep "oc login" | awk '{print $7}') - CURRENT_TIMER=$(date +%s) - DURATION=$(($CURRENT_TIMER - $START_TIMER)) - INDEXDATA+=("cluster_admin_create-${DURATION}") - kubectl delete secret ${KUBEADMIN_NAME} || true - kubectl create secret generic ${KUBEADMIN_NAME} --from-literal=KUBEADMIN_PASSWORD=${PASSWORD} - if [ $HCP == "true" ]; then _login_check $URL $PASSWORD; fi - # set expiration to 24h - rosa edit cluster -c "$(_get_cluster_id ${CLUSTER_NAME})" --expiration=${EXPIRATION_TIME}m - fi - if [ $HCP == "true" ]; then index_metadata "cluster-install"; fi + URL=$(rosa describe cluster -c $CLUSTER_NAME --output json | jq -r ".api.url") + START_TIMER=$(date +%s) + PASSWORD=$(rosa create admin -c "$(_get_cluster_id ${CLUSTER_NAME})" -y 2>/dev/null | grep "oc login" | awk '{print $7}') + CURRENT_TIMER=$(date +%s) + DURATION=$(($CURRENT_TIMER - $START_TIMER)) + INDEXDATA+=("cluster_admin_create-${DURATION}") + kubectl delete secret ${KUBEADMIN_NAME} || true + kubectl create secret generic ${KUBEADMIN_NAME} --from-literal=KUBEADMIN_PASSWORD=${PASSWORD} + # set expiration to 24h + rosa edit cluster -c "$(_get_cluster_id ${CLUSTER_NAME})" --expiration=${EXPIRATION_TIME}m return 0 } @@ -613,90 +208,10 @@ index_metadata(){ _download_kubeconfig "$(_get_cluster_id ${CLUSTER_NAME})" ./kubeconfig export KUBECONFIG=./kubeconfig fi - if [[ $INSTALL_METHOD == "osd" ]]; then - export PLATFORM="AWS-MS" - export CLUSTER_VERSION="${OCM_VERSION}" - else - export PLATFORM="ROSA" - export CLUSTER_VERSION="${ROSA_VERSION}" - fi - if [ $HCP == "true" ]; then - if [ "$1" == "management" ]; then - METADATA=$(cat << EOF -{ -"uuid" : "${UUID}", -"aws_authentication_method": "${AWS_AUTHENTICATION_METHOD}", -"version": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".openshift_version")", -"infra_id": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".infra_id")", -"cluster_name": "$MGMT_CLUSTER_NAME", -"cluster_id": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".id")", -"base_domain": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".dns.base_domain")", -"aws_region": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".region.id")", -"workers": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".nodes.autoscale_compute.max_replicas")", -"workers_type": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".nodes.compute_machine_type.id")", -"network_type": "$(echo $MGMT_CLUSTER_DETAILS | jq -r ".network.type")", -"install_method": "rosa", -"provision_shard": "$STAGE_PROV_SHARD", 
-"hostedclusters": "$NUMBER_OF_HC" -} -EOF -) - elif [ "$1" == "cluster-install" ]; then - METADATA=$(cat << EOF -{ -"uuid" : "${UUID}", -"aws_authentication_method": "${AWS_AUTHENTICATION_METHOD}", -"mgmt_cluster_name": "$MGMT_CLUSTER_NAME", -"workers": "$COMPUTE_WORKERS_NUMBER", -"cluster_name": "${CLUSTER_NAME}", -"cluster_id": "$(_get_cluster_id ${CLUSTER_NAME})", -"network_type": "${NETWORK_TYPE}", -"version": "${CLUSTER_VERSION}", -"operation": "install", -"install_method": "rosa", -"status": "$END_CLUSTER_STATUS", -"timestamp": "$(date +%s%3N)" -EOF -) - INSTALL_TIME=0 - TOTAL_TIME=0 - WORKER_READY_TIME=0 - for i in "${INDEXDATA[@]}" ; do IFS="-" ; set -- $i - METADATA="${METADATA}, \"$1\":\"$2\"" - if [ $1 != "day2operations" ] && [ $1 != "login" ] ; then - INSTALL_TIME=$((${INSTALL_TIME} + $2)) - elif [ $1 == "day2operations" ]; then - WORKER_READY_TIME=$2 - else - TOTAL_TIME=$2 - fi - done - IFS=" " - METADATA="${METADATA}, \"duration\":\"${INSTALL_TIME}\"" - METADATA="${METADATA}, \"workers_ready\":\"$(($INSTALL_TIME + $WORKER_READY_TIME))\"" - METADATA="${METADATA} }" - else - METADATA=$(cat << EOF -{ -"uuid" : "${UUID}", -"mgmt_cluster_name": "$MGMT_CLUSTER_NAME", -"workers": "$COMPUTE_WORKERS_NUMBER", -"cluster_name": "${CLUSTER_NAME}", -"cluster_id": "$ROSA_CLUSTER_ID", -"network_type": "${NETWORK_TYPE}", -"version": "${CLUSTER_VERSION}", -"operation": "destroy", -"install_method": "rosa", -"duration": "$DURATION", -"timestamp": "$(date +%s%3N)" -} -EOF -) - fi - printf "Indexing installation timings to ES" - curl -k -sS -X POST -H "Content-type: application/json" ${ES_SERVER}/hypershift-wrapper-timers/_doc -d "${METADATA}" -o /dev/null - else - METADATA=$(cat << EOF + export PLATFORM="ROSA" + export CLUSTER_VERSION="${ROSA_VERSION}" + + METADATA=$(cat << EOF { "uuid" : "${UUID}", "platform": "${PLATFORM}", @@ -713,7 +228,6 @@ EOF "timestamp": "$(date +%s%3N)" EOF ) - INSTALL_TIME=0 TOTAL_TIME=0 for i in "${INDEXDATA[@]}" ; do IFS="-" ; set -- $i @@ -731,91 +245,27 @@ EOF METADATA="${METADATA} }" printf "Indexing installation timings to ES" curl -k -sS -X POST -H "Content-type: application/json" ${ES_SERVER}/managedservices-timings/_doc -d "${METADATA}" -o /dev/null - fi unset KUBECONFIG return 0 } -index_mgmt_cluster_stat(){ - echo "Indexing Management cluster stat..." 
- cd /home/airflow/workspace - echo "Installing kube-burner" - _download_kubeconfig "$(ocm list clusters --no-headers --columns id ${MGMT_CLUSTER_NAME})" ./mgmt_kubeconfig - export KUBE_BURNER_RELEASE=${KUBE_BURNER_RELEASE:-1.5} - curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v${KUBE_BURNER_RELEASE}/kube-burner-${KUBE_BURNER_RELEASE}-Linux-x86_64.tar.gz -o kube-burner.tar.gz - sudo tar -xvzf kube-burner.tar.gz -C /usr/local/bin/ - git clone -q -b ${E2E_BENCHMARKING_BRANCH} ${E2E_BENCHMARKING_REPO} --depth=1 --single-branch - METRIC_PROFILE=/home/airflow/workspace/e2e-benchmarking/workloads/kube-burner-ocp-wrapper/metrics-profiles/mc-metrics.yml - envsubst < /home/airflow/workspace/e2e-benchmarking/workloads/kube-burner/workloads/managed-services/baseconfig.yml > baseconfig.yml - cat baseconfig.yml - HCP_NAMESPACE="$(_get_cluster_id ${CLUSTER_NAME})-$CLUSTER_NAME" - MC_PROMETHEUS=https://$(oc --kubeconfig=./mgmt_kubeconfig get route -n openshift-monitoring prometheus-k8s -o jsonpath="{.spec.host}") - MC_PROMETHEUS_TOKEN=$(oc --kubeconfig=./mgmt_kubeconfig sa new-token -n openshift-monitoring prometheus-k8s) - Q_NODES="" - for n in $(curl -H "Authorization: Bearer ${MC_PROMETHEUS_TOKEN}" -k --silent --globoff ${MC_PROMETHEUS}/api/v1/query?query='sum(kube_node_role{role!~"master|infra|workload|obo"})by(node)&time='$(date +"%s")'' | jq -r '.data.result[].metric.node'); - do - if [[ ${Q_NODES} == "" ]]; then Q_NODES=${n}; else Q_NODES=${Q_NODES}"|"${n}; fi - done - MGMT_WORKER_NODES=${Q_NODES} - echo "Exporting required vars" - cat << EOF -MC_PROMETHEUS: ${MC_PROMETHEUS} -MC_PROMETHEUS_TOKEN: -HCP_NAMESPACE: ${HCP_NAMESPACE} -MGMT_WORKER_NODES: ${MGMT_WORKER_NODES} -elapsed: "20m:" - -EOF - export MC_PROMETHEUS MC_PROMETHEUS_TOKEN HCP_NAMESPACE MGMT_WORKER_NODES elapsed - METADATA=$(cat << EOF -{ -"uuid":"${UUID}", -"timestamp": "$(date +%s%3N)", -"hostedClusterName": "${HC_INFRASTRUCTURE_NAME}", -"clusterName": "${HC_INFRASTRUCTURE_NAME}", -"mgmtClusterName": "${MGMT_CLUSTER_NAME}" -} -EOF -) - printf "Indexing metadata to ES" - curl -k -sS -X POST -H "Content-type: application/json" ${ES_SERVER}/${ES_INDEX}/_doc -d "${METADATA}" -o /dev/null - - echo "Running kube-burner index.." - kube-burner index --uuid=${UUID} --prometheus-url=${MC_PROMETHEUS} --token ${MC_PROMETHEUS_TOKEN} --start=$START_TIME --end=$END_TIME --step 2m --metrics-profile ${METRIC_PROFILE} --config ./baseconfig.yml --log-level debug - echo "Finished indexing results" -} - cleanup(){ - if [[ $INSTALL_METHOD == "osd" ]]; then - ocm delete cluster "$(_get_cluster_id ${CLUSTER_NAME})" - echo "Cluster is getting Uninstalled, deleting OSD access keys now.." 
- aws iam delete-access-key --user-name OsdCcsAdmin --access-key-id $AWS_ACCESS_KEY_ID || true - else - export ROSA_CLUSTER_ID=$(_get_cluster_id ${CLUSTER_NAME}) - export HC_INFRASTRUCTURE_NAME=${ROSA_CLUSTER_ID} - CLEANUP_START_TIMING=$(date +%s) - export START_TIME=$CLEANUP_START_TIMING - rosa delete cluster -c ${ROSA_CLUSTER_ID} -y - rosa logs uninstall -c ${ROSA_CLUSTER_ID} --watch - if [ $AWS_AUTHENTICATION_METHOD == "sts" ] ; then - rosa delete operator-roles -c ${ROSA_CLUSTER_ID} -m auto --yes || true - rosa delete oidc-provider -c ${ROSA_CLUSTER_ID} -m auto --yes || true - fi - DURATION=$(($(date +%s) - $CLEANUP_START_TIMING)) - INDEXDATA+=("cleanup-${DURATION}") - export END_TIME=$(date +"%s") - if [ $HCP == "true" ]; then - _delete_aws_vpc - if [ -z $OIDC_CONFIG ]; then _oidc_config delete $OIDC_PREFIX; fi - fi + export ROSA_CLUSTER_ID=$(_get_cluster_id ${CLUSTER_NAME}) + export HC_INFRASTRUCTURE_NAME=${ROSA_CLUSTER_ID} + CLEANUP_START_TIMING=$(date +%s) + export START_TIME=$CLEANUP_START_TIMING + rosa delete cluster -c ${ROSA_CLUSTER_ID} -y + rosa logs uninstall -c ${ROSA_CLUSTER_ID} --watch + if [ $AWS_AUTHENTICATION_METHOD == "sts" ] ; then + rosa delete operator-roles -c ${ROSA_CLUSTER_ID} -m auto --yes || true + rosa delete oidc-provider -c ${ROSA_CLUSTER_ID} -m auto --yes || true fi + DURATION=$(($(date +%s) - $CLEANUP_START_TIMING)) + INDEXDATA+=("cleanup-${DURATION}") + export END_TIME=$(date +"%s") return 0 } -export INSTALL_METHOD=$(cat ${json_file} | jq -r .cluster_install_method) -export HC_INTERVAL=$(cat ${json_file} | jq -r .hcp_install_interval) -SKEW_FACTOR=$(echo $HOSTED_ID|awk -F- '{print$2}') -sleep $(($HC_INTERVAL*$SKEW_FACTOR)) # 60*1, 60*2.. setup if [[ "$operation" == "install" ]]; then @@ -823,16 +273,8 @@ if [[ "$operation" == "install" ]]; then CLUSTER_STATUS=$(_get_cluster_status ${CLUSTER_NAME}) if [ -z "${CLUSTER_STATUS}" ] ; then printf "INFO: Cluster not found, installing..." - if [ $HCP == "true" ]; then - echo "pre-clean AWS resources" - _delete_aws_vpc - install - export HC_INFRASTRUCTURE_NAME=$(_get_cluster_id ${CLUSTER_NAME}) - index_mgmt_cluster_stat "install-metrics" - else - install - index_metadata - fi + install + index_metadata elif [ "${CLUSTER_STATUS}" == "ready" ] ; then printf "INFO: Cluster ${CLUSTER_NAME} already installed and ready, reusing..." 
postinstall @@ -848,10 +290,8 @@ if [[ "$operation" == "install" ]]; then elif [[ "$operation" == "cleanup" ]]; then printf "Running Cleanup Steps" - if [ $HCP == "true" ]; then _get_sc_mc_details; fi cleanup index_metadata - if [ $HCP == "true" ]; then index_mgmt_cluster_stat "destroy-metrics"; fi rosa logout ocm logout fi diff --git a/dags/openshift_nightlies/tasks/benchmarks/e2e.py b/dags/openshift_nightlies/tasks/benchmarks/e2e.py index b3b86e3c0..9608888bc 100644 --- a/dags/openshift_nightlies/tasks/benchmarks/e2e.py +++ b/dags/openshift_nightlies/tasks/benchmarks/e2e.py @@ -85,15 +85,31 @@ def __init__(self, dag, config: DagConfig, release: OpenshiftRelease, task_group } self.install_vars = var_loader.build_task_vars( release, task="install") - if self.install_vars['rosa_hcp'] == "true": - cluster_name = release._generate_cluster_name() - self.env = { - **self.env, - "MGMT_CLUSTER_NAME": f"{self.install_vars['staging_mgmt_cluster_name']}.*", - "SVC_CLUSTER_NAME": f"{self.install_vars['staging_svc_cluster_name']}.*", - "MGMT_KUBECONFIG_SECRET": "staging-mgmt-cluster-kubeconfig", - **self._insert_kube_env() - } + + if self.release.platform == "rosahcp": + self.rosa_creds = var_loader.get_secret("rosa_creds", deserialize_json=True) + self.aws_creds = var_loader.get_secret("aws_creds", deserialize_json=True) + self.ocm_creds = var_loader.get_secret("ocm_creds", deserialize_json=True) + self.environment = self.vars["environment"] if "environment" in self.vars else "staging" + self.env = { + **self.env, + "ROSA_CLUSTER_NAME": release._generate_cluster_name(), + "ROSA_ENVIRONMENT": self.environment, + "ROSA_TOKEN": self.rosa_creds['rosa_token_'+self.environment], + "AWS_ACCESS_KEY_ID": self.aws_creds['aws_access_key_id'], + "AWS_SECRET_ACCESS_KEY": self.aws_creds['aws_secret_access_key'], + "AWS_DEFAULT_REGION": self.aws_creds['aws_region_for_openshift'], + "AWS_ACCOUNT_ID": self.aws_creds['aws_account_id'], + "OCM_TOKEN": self.ocm_creds['ocm_token'] + } + self.install_vars = var_loader.build_task_vars( + release, task="install") + cluster_name = release._generate_cluster_name() + self.env = { + **self.env, + "MGMT_KUBECONFIG_SECRET": "staging-mgmt-cluster-kubeconfig", + **self._insert_kube_env() + } if self.release.platform == "hypershift": mgmt_cluster_name = release._generate_cluster_name() diff --git a/dags/openshift_nightlies/tasks/install/rosa/defaults.json b/dags/openshift_nightlies/tasks/install/rosa/defaults.json index c3ee1decc..29f67d108 100644 --- a/dags/openshift_nightlies/tasks/install/rosa/defaults.json +++ b/dags/openshift_nightlies/tasks/install/rosa/defaults.json @@ -36,10 +36,5 @@ "ocm_cli_fork": "https://github.com/openshift-online/ocm-cli", "ocm_cli_version": "container", "rosa_hcp": "false", - "staging_mgmt_cluster_name": "", - "staging_svc_cluster_name": "", - "staging_mgmt_provisioner_shards": "", - "aws_region": "us-west-2", - "oidc_config": "", - "extra_machinepool": [] + "aws_region": "us-west-2" } diff --git a/dags/openshift_nightlies/tasks/install/rosa/rosa.py b/dags/openshift_nightlies/tasks/install/rosa/rosa.py index 9f277ae60..e5b2b9e02 100644 --- a/dags/openshift_nightlies/tasks/install/rosa/rosa.py +++ b/dags/openshift_nightlies/tasks/install/rosa/rosa.py @@ -25,17 +25,6 @@ def __init__(self, dag, config: DagConfig, release: OpenshiftRelease): self.exec_config = executor.get_default_executor_config(self.dag_config, executor_image="airflow-managed-services") self.rosa_postinstall_setup = rosa_post_install.Diagnosis(dag, config, release) - def get_type(self): 
- if self.config['rosa_hcp'] == "true": - return "rosa_hcp" - else: - return "rosa" - - def get_install_hcp_task(self): - for iteration in range(self.config['number_of_hostedcluster']): - c_id = f"{'hcp-'+str(iteration+1)}" # adding 1 to name the cluster hcp-1, hcp-2.. - yield c_id, self._get_task(operation="install", id=c_id), self.rosa_postinstall_setup._get_rosa_postinstallation(id=c_id), self._get_task(operation="cleanup", id=c_id) - # Create Airflow Task for Install/Cleanup steps def _get_task(self, operation="install", id="", trigger_rule="all_success"): self._setup_task(operation=operation) @@ -47,7 +36,7 @@ def _get_task(self, operation="install", id="", trigger_rule="all_success"): "PROM_URL": var_loader.get_secret("thanos_querier_url"), **self.env } - env = {**self.env, **{"HOSTED_ID": id}} + env = {**self.env} command=f"{constants.root_dag_dir}/scripts/install/rosa.sh -v {self.release.version} -j /tmp/{self.release_name}-{operation}-task.json -o {operation}" return BashOperator( diff --git a/dags/openshift_nightlies/tasks/install/rosahcp/__init__.py b/dags/openshift_nightlies/tasks/install/rosahcp/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dags/openshift_nightlies/tasks/install/rosahcp/defaults.json b/dags/openshift_nightlies/tasks/install/rosahcp/defaults.json new file mode 100644 index 000000000..b44d4992f --- /dev/null +++ b/dags/openshift_nightlies/tasks/install/rosahcp/defaults.json @@ -0,0 +1,45 @@ +{ + "openshift_cluster_name": "", + "openshift_install_ssh_pub_key_file": "/home/airflow/workspace/perf-dept/ssh_keys/id_rsa_pbench_ec2.pub", + "openshift_cidr": "10.128.0.0/10", + "openshift_machine_cidr": "10.0.0.0/16", + "openshift_service_network": "172.30.0.0/16", + "openshift_host_prefix": "22", + "openshift_network_type": "", + "openshift_toggle_workload_node": true, + "kubeconfig_path": "", + "watch_nodes": true, + "watch_cluster_operators": true, + "watch_namepsaces": [ + "openshift-etcd", + "openshift-apiserver", + "openshift-kube-apiserver", + "openshift-monitoring", + "openshift-kube-controller", + "openshift-machine-api", + "openshift-kube-scheduler", + "openshift-ingress", + "openshift-sdn" + ], + "inspect_components": false, + "slack_integration": false, + "slack_api_token": "", + "slack_channel": "", + "watcher_slack_id": "{Monday: , Tuesday: , Wednesday: , Thursday: , Friday: , Saturday: , Sunday: }", + "slack_team_alias": "", + "iterations": 5, + "sleep_time": 30, + "daemon_mode": true, + "fips": false, + "rosa_expiration_time": "2880", + "rosa_cli_fork": "https://github.com/openshift/rosa", + "ocm_cli_fork": "https://github.com/openshift-online/ocm-cli", + "ocm_cli_version": "container", + "rosa_hcp": "true", + "staging_mgmt_cluster_name": "", + "staging_svc_cluster_name": "", + "staging_mgmt_provisioner_shards": "", + "aws_region": "us-west-2", + "oidc_config": "", + "extra_machinepool": [] +} diff --git a/dags/openshift_nightlies/tasks/install/rosahcp/rosahcp.py b/dags/openshift_nightlies/tasks/install/rosahcp/rosahcp.py new file mode 100644 index 000000000..3cb98ba41 --- /dev/null +++ b/dags/openshift_nightlies/tasks/install/rosahcp/rosahcp.py @@ -0,0 +1,63 @@ +import sys +from os.path import abspath, dirname +from os import environ + +from openshift_nightlies.util import var_loader, kubeconfig, constants, executor +from openshift_nightlies.tasks.install.openshift import AbstractOpenshiftInstaller +from openshift_nightlies.tasks.utils import rosa_post_install +from common.models.dag_config import DagConfig +from 
openshift_nightlies.models.release import OpenshiftRelease + +import requests +import uuid + +from airflow.operators.bash import BashOperator +from airflow.models import Variable +from kubernetes.client import models as k8s + +import json + +# Defines Tasks for installation of Openshift Clusters + +class RosaHCPInstaller(AbstractOpenshiftInstaller): + def __init__(self, dag, config: DagConfig, release: OpenshiftRelease): + super().__init__(dag, config, release) + self.exec_config = executor.get_default_executor_config(self.dag_config, executor_image="airflow-managed-services") + self.rosa_postinstall_setup = rosa_post_install.Diagnosis(dag, config, release) + + def get_install_hcp_task(self): + for iteration in range(self.config['number_of_hostedcluster']): + c_id = f"{'hcp-'+str(iteration+1)}" # adding 1 to name the cluster hcp-1, hcp-2.. + yield c_id, self._get_task(operation="install", id=c_id), self.rosa_postinstall_setup._get_rosa_postinstallation(id=c_id), self._get_task(operation="cleanup", id=c_id) + + def wait_task(self, id="wait_task"): + return BashOperator(task_id=f"{id}", + depends_on_past=False, + trigger_rule="all_success", + dag=self.dag, + bash_command="sleep 60s") + + # Create Airflow Task for Install/Cleanup steps + def _get_task(self, operation="install", id="", trigger_rule="all_success"): + self._setup_task(operation=operation) + task_prefix=f"{id}-" + self.env = { + "ES_SERVER": var_loader.get_secret('elasticsearch'), + "ES_INDEX": "ripsaw-kube-burner", + "THANOS_RECEIVER_URL": var_loader.get_secret("thanos_receiver_url"), + "PROM_URL": var_loader.get_secret("thanos_querier_url"), + **self.env + } + env = {**self.env, **{"HOSTED_ID": id}} + command=f"{constants.root_dag_dir}/scripts/install/rosa-hcp.sh -v {self.release.version} -j /tmp/{self.release_name}-{operation}-task.json -o {operation}" + + return BashOperator( + task_id=f"{task_prefix if id != '' else ''}{operation}", + depends_on_past=False, + bash_command=command, + retries=3, + dag=self.dag, + trigger_rule=trigger_rule, + executor_config=self.exec_config, + env=env + ) diff --git a/dags/openshift_nightlies/util/manifest.py b/dags/openshift_nightlies/util/manifest.py index 3df0ef222..c99001352 100644 --- a/dags/openshift_nightlies/util/manifest.py +++ b/dags/openshift_nightlies/util/manifest.py @@ -163,6 +163,33 @@ def get_rosa_releases(self): } ) + def get_rosahcp_releases(self): + rosahcp = self.yaml['platforms']['rosahcp'] + for version in self.yaml['versions']: + if version['version'] in rosahcp['versions']: + version_number = version['version'] + release_stream = version['releaseStream'] + version_alias = version['alias'] + for variant in rosahcp['variants']: + release = OpenshiftRelease( + platform="rosahcp", + version=version_number, + release_stream=release_stream, + latest_release=self.latest_releases[release_stream], + variant=variant['name'], + config=variant['config'], + version_alias=version_alias + ) + schedule = self._get_schedule(variant, 'rosahcp') + dag_config = self._build_dag_config(schedule) + + self.releases.append( + { + "config": dag_config, + "release": release + } + ) + def get_rogcp_releases(self): rogcp = self.yaml['platforms']['rogcp'] for version in self.yaml['versions']: @@ -248,6 +275,8 @@ def get_releases(self): self.get_openstack_releases() if 'rosa' in self.yaml['platforms']: self.get_rosa_releases() + if 'rosahcp' in self.yaml['platforms']: + self.get_rosahcp_releases() if 'rogcp' in self.yaml['platforms']: self.get_rogcp_releases() if 'hypershift' in 
self.yaml['platforms']: diff --git a/dags/openshift_nightlies/util/var_loader.py b/dags/openshift_nightlies/util/var_loader.py index 670470ef5..e8d95bb3c 100644 --- a/dags/openshift_nightlies/util/var_loader.py +++ b/dags/openshift_nightlies/util/var_loader.py @@ -25,7 +25,7 @@ def get_config_vars(release: OpenshiftRelease, task="install", config_dir=f"{con if release.platform == 'baremetal' and "bench" in task: file_path = f"{config_dir}/{release.config['benchmarks']}/{task}.json" return get_json(file_path) - elif ( release.platform == 'hypershift' or release.platform == 'rosa' ) and "hcp" in task: + elif ( release.platform == 'hypershift' or release.platform == 'rosa'or release.platform == 'rosahcp' ) and "hcp" in task: file_path = f"{config_dir}/benchmarks/{release.config['benchmarks']}" return get_json(file_path) elif task in release.config: From 74dd06165303bebfd62713c0acec6395a933fcb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Sevilla?= Date: Mon, 7 Aug 2023 11:01:05 +0200 Subject: [PATCH 18/26] Skip garbage-collection in the last task of the DAG (#346) Signed-off-by: Raul Sevilla --- .../config/benchmarks/large-control-plane-mgs.json | 2 +- .../config/benchmarks/large-control-plane.json | 2 +- .../config/benchmarks/medium-control-plane-mgs.json | 2 +- .../config/benchmarks/medium-control-plane.json | 2 +- .../config/benchmarks/small-control-plane-mgs.json | 2 +- .../config/benchmarks/small-control-plane.json | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dags/openshift_nightlies/config/benchmarks/large-control-plane-mgs.json b/dags/openshift_nightlies/config/benchmarks/large-control-plane-mgs.json index 0b86525a0..88888944c 100644 --- a/dags/openshift_nightlies/config/benchmarks/large-control-plane-mgs.json +++ b/dags/openshift_nightlies/config/benchmarks/large-control-plane-mgs.json @@ -8,7 +8,7 @@ { "name": "cluster-density-v2", "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=3000 --timeout=6h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=3000 --timeout=6h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml --gc=false" } ] } diff --git a/dags/openshift_nightlies/config/benchmarks/large-control-plane.json b/dags/openshift_nightlies/config/benchmarks/large-control-plane.json index 62dcd771f..f0066429d 100644 --- a/dags/openshift_nightlies/config/benchmarks/large-control-plane.json +++ b/dags/openshift_nightlies/config/benchmarks/large-control-plane.json @@ -38,7 +38,7 @@ { "name": "cluster-density-v2", "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=3000 --timeout=6h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=3000 --timeout=6h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml --gc=false" } ] } diff --git a/dags/openshift_nightlies/config/benchmarks/medium-control-plane-mgs.json b/dags/openshift_nightlies/config/benchmarks/medium-control-plane-mgs.json index c0ffa3198..2bd0d34d1 100644 --- a/dags/openshift_nightlies/config/benchmarks/medium-control-plane-mgs.json +++ b/dags/openshift_nightlies/config/benchmarks/medium-control-plane-mgs.json @@ -8,7 +8,7 @@ { "name": "cluster-density-v2", "workload": "kube-burner", - "custom_cmd": 
"kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=750 --timeout=5h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=750 --timeout=5h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml --gc=false" } ] } diff --git a/dags/openshift_nightlies/config/benchmarks/medium-control-plane.json b/dags/openshift_nightlies/config/benchmarks/medium-control-plane.json index 3c619e4ed..980f9cdd4 100644 --- a/dags/openshift_nightlies/config/benchmarks/medium-control-plane.json +++ b/dags/openshift_nightlies/config/benchmarks/medium-control-plane.json @@ -28,7 +28,7 @@ { "name": "cluster-density-v2", "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=750 --timeout=3h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=750 --timeout=3h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml --gc=false" } ] } diff --git a/dags/openshift_nightlies/config/benchmarks/small-control-plane-mgs.json b/dags/openshift_nightlies/config/benchmarks/small-control-plane-mgs.json index 4fb13d562..5cf1bb687 100644 --- a/dags/openshift_nightlies/config/benchmarks/small-control-plane-mgs.json +++ b/dags/openshift_nightlies/config/benchmarks/small-control-plane-mgs.json @@ -18,7 +18,7 @@ { "name": "cluster-density-v2", "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=500 --timeout=3h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=500 --timeout=3h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml --gc=false" } ] } diff --git a/dags/openshift_nightlies/config/benchmarks/small-control-plane.json b/dags/openshift_nightlies/config/benchmarks/small-control-plane.json index f34a77254..dcc305981 100644 --- a/dags/openshift_nightlies/config/benchmarks/small-control-plane.json +++ b/dags/openshift_nightlies/config/benchmarks/small-control-plane.json @@ -28,7 +28,7 @@ { "name": "cluster-density-v2", "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=500 --timeout=3h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=500 --timeout=3h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml --gc=false" } ] } From 967454d458e5b65f589012ad827bbdcfab47bf54 Mon Sep 17 00:00:00 2001 From: David Sanz Moreno Date: Mon, 7 Aug 2023 12:52:53 +0200 Subject: [PATCH 19/26] Always download last version of kube-burner, k8s-netperf and ingress-perf (#348) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Raúl Sevilla --- dags/openshift_nightlies/scripts/install/hypershift.sh | 6 ++---- images/airflow/Dockerfile | 5 +++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/dags/openshift_nightlies/scripts/install/hypershift.sh b/dags/openshift_nightlies/scripts/install/hypershift.sh index 83f8c8d72..06155d18c 100755 --- a/dags/openshift_nightlies/scripts/install/hypershift.sh +++ b/dags/openshift_nightlies/scripts/install/hypershift.sh @@ 
-296,10 +296,8 @@ update_fw(){ index_mgmt_cluster_stat(){ echo "Indexing Management cluster stat..." cd /home/airflow/workspace - echo "Installing kube-burner" - export KUBE_BURNER_RELEASE=${KUBE_BURNER_RELEASE:-1.3} - curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v${KUBE_BURNER_RELEASE}/kube-burner-${KUBE_BURNER_RELEASE}-Linux-x86_64.tar.gz -o kube-burner.tar.gz - sudo tar -xvzf kube-burner.tar.gz -C /usr/local/bin/ + echo "Installing last version of kube-burner" + curl -L $(curl -s https://api.github.com/repos/cloud-bulldozer/kube-burner/releases/latest | jq -r '.assets | map(select(.name | test("linux-x86_64"))) | .[0].browser_download_url') | tar xz -C /usr/local/bin kube-burner echo "Cloning ${E2E_BENCHMARKING_REPO} from branch ${E2E_BENCHMARKING_BRANCH}" git clone -q -b ${E2E_BENCHMARKING_BRANCH} ${E2E_BENCHMARKING_REPO} --depth=1 --single-branch export KUBECONFIG=/home/airflow/auth/config diff --git a/images/airflow/Dockerfile b/images/airflow/Dockerfile index 3e5c60d5e..c19d06dc5 100644 --- a/images/airflow/Dockerfile +++ b/images/airflow/Dockerfile @@ -14,6 +14,7 @@ RUN curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | b ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 -RUN curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.7.3/kube-burner-V1.7.3-linux-x86_64.tar.gz | tar xz -C /usr/bin kube-burner -RUN curl -L https://github.com/cloud-bulldozer/k8s-netperf/releases/download/v0.1.11/k8s-netperf_Linux_v0.1.11_x86_64.tar.gz | tar xz -C /usr/bin k8s-netperf +RUN curl -L $(curl -s https://api.github.com/repos/cloud-bulldozer/kube-burner/releases/latest | jq -r '.assets | map(select(.name | test("linux-x86_64"))) | .[0].browser_download_url') | tar xz -C /usr/bin kube-burner +RUN curl -L $(curl -s https://api.github.com/repos/cloud-bulldozer/k8s-netperf/releases/latest | jq -r '.assets | map(select(.name | test("Linux.*x86_64"))) | .[0].browser_download_url') | tar xz -C /usr/bin k8s-netperf +RUN curl -L $(curl -s https://api.github.com/repos/cloud-bulldozer/ingress-perf/releases/latest | jq -r '.assets | map(select(.name | test("Linux.*x86_64"))) | .[0].browser_download_url') | tar xz -C /usr/bin ingress-perf USER airflow From 318650a47eb514548529cd4b9bd4c2b36ef95a96 Mon Sep 17 00:00:00 2001 From: Andrew Collins Date: Tue, 22 Aug 2023 15:33:37 -0500 Subject: [PATCH 20/26] 4.14 small node-density latencies are around 11s now (#350) --- .../config/benchmarks/small-control-plane.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/openshift_nightlies/config/benchmarks/small-control-plane.json b/dags/openshift_nightlies/config/benchmarks/small-control-plane.json index dcc305981..595da94b0 100644 --- a/dags/openshift_nightlies/config/benchmarks/small-control-plane.json +++ b/dags/openshift_nightlies/config/benchmarks/small-control-plane.json @@ -13,7 +13,7 @@ { "name": "node-density", "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=2h --pod-ready-threshold=5s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=2h --pod-ready-threshold=15s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" }, { "name": "node-density-heavy", From 745ea2ca896a2b3b18f6c64f2892dcd9e14f5e69 Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy <70236227+mukrishn@users.noreply.github.com> 
Date: Thu, 31 Aug 2023 13:54:07 -0400 Subject: [PATCH 21/26] aws shared credential file (#351) --- dags/openshift_nightlies/scripts/install/cloud.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/dags/openshift_nightlies/scripts/install/cloud.sh b/dags/openshift_nightlies/scripts/install/cloud.sh index c4ece5a5d..d49e81aac 100755 --- a/dags/openshift_nightlies/scripts/install/cloud.sh +++ b/dags/openshift_nightlies/scripts/install/cloud.sh @@ -21,6 +21,7 @@ setup(){ export PUBLIC_KEY=/home/airflow/workspace/perf-dept/ssh_keys/id_rsa_pbench_ec2.pub export PRIVATE_KEY=/home/airflow/workspace/perf-dept/ssh_keys/id_rsa_pbench_ec2 export AWS_REGION=${AWS_REGION:-us-west-2} + export AWS_SHARED_CREDENTIALS_FILE="/root/$DEPLOY_PATH/credentials" chmod 600 ${PRIVATE_KEY} From 618effdc2854210c30c84d61145a9996ba54af9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Sevilla?= Date: Tue, 3 Oct 2023 11:18:24 +0200 Subject: [PATCH 22/26] Stop using custom_cmd for control-plane DAGs (#354) Signed-off-by: Raul Sevilla --- .../benchmarks/large-control-plane-mgs.json | 17 +++++++--- .../benchmarks/large-control-plane.json | 17 +++++++--- .../benchmarks/medium-control-plane-mgs.json | 17 +++++++--- .../benchmarks/medium-control-plane.json | 17 +++++++--- .../benchmarks/osp-large-control-plane.json | 19 ++++++++--- .../benchmarks/small-control-plane-mgs.json | 33 ++++++++++++++----- .../benchmarks/small-control-plane.json | 33 ++++++++++++++----- .../config/benchmarks/upgrade.json | 12 +++++-- images/airflow/Dockerfile | 3 -- 9 files changed, 125 insertions(+), 43 deletions(-) diff --git a/dags/openshift_nightlies/config/benchmarks/large-control-plane-mgs.json b/dags/openshift_nightlies/config/benchmarks/large-control-plane-mgs.json index 88888944c..01965338b 100644 --- a/dags/openshift_nightlies/config/benchmarks/large-control-plane-mgs.json +++ b/dags/openshift_nightlies/config/benchmarks/large-control-plane-mgs.json @@ -2,13 +2,22 @@ "benchmarks": [ { "name": "node-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=3h --pod-ready-threshold=10s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density", + "EXTRA_FLAGS": "--pod-ready-threshold=10s" + } }, { "name": "cluster-density-v2", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=3000 --timeout=6h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml --gc=false" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "2268", + "EXTRA_FLAGS": "--timeout=6h" + } } ] } diff --git a/dags/openshift_nightlies/config/benchmarks/large-control-plane.json b/dags/openshift_nightlies/config/benchmarks/large-control-plane.json index f0066429d..11f89a072 100644 --- a/dags/openshift_nightlies/config/benchmarks/large-control-plane.json +++ b/dags/openshift_nightlies/config/benchmarks/large-control-plane.json @@ -32,13 +32,22 @@ }, { "name": "node-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=3h --pod-ready-threshold=10s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density", + 
"EXTRA_FLAGS": "--pod-ready-threshold=10s" + } }, { "name": "cluster-density-v2", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=3000 --timeout=6h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml --gc=false" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "2268", + "EXTRA_FLAGS": "--timeout=6h" + } } ] } diff --git a/dags/openshift_nightlies/config/benchmarks/medium-control-plane-mgs.json b/dags/openshift_nightlies/config/benchmarks/medium-control-plane-mgs.json index 2bd0d34d1..0360dc833 100644 --- a/dags/openshift_nightlies/config/benchmarks/medium-control-plane-mgs.json +++ b/dags/openshift_nightlies/config/benchmarks/medium-control-plane-mgs.json @@ -2,13 +2,22 @@ "benchmarks": [ { "name": "node-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=3h --pod-ready-threshold=5s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density", + "EXTRA_FLAGS": "--pod-ready-threshold=10s" + } }, { "name": "cluster-density-v2", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=750 --timeout=5h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml --gc=false" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "1080", + "EXTRA_FLAGS": "--timeout=5h" + } } ] } diff --git a/dags/openshift_nightlies/config/benchmarks/medium-control-plane.json b/dags/openshift_nightlies/config/benchmarks/medium-control-plane.json index 980f9cdd4..bfeea50a1 100644 --- a/dags/openshift_nightlies/config/benchmarks/medium-control-plane.json +++ b/dags/openshift_nightlies/config/benchmarks/medium-control-plane.json @@ -22,13 +22,22 @@ }, { "name": "node-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=3h --pod-ready-threshold=5s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density", + "EXTRA_FLAGS": "--pod-ready-threshold=10s" + } }, { "name": "cluster-density-v2", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=750 --timeout=3h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml --gc=false" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "1080", + "EXTRA_FLAGS": "--timeout=5h" + } } ] } diff --git a/dags/openshift_nightlies/config/benchmarks/osp-large-control-plane.json b/dags/openshift_nightlies/config/benchmarks/osp-large-control-plane.json index 402bc4f61..db92e5b22 100644 --- a/dags/openshift_nightlies/config/benchmarks/osp-large-control-plane.json +++ b/dags/openshift_nightlies/config/benchmarks/osp-large-control-plane.json @@ -32,13 +32,22 @@ }, { "name": "node-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=2h --pod-ready-threshold=10s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner 
--user-metadata=metadata.yml" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density", + "EXTRA_FLAGS": "--pod-ready-threshold=10s" + } }, { - "name": "cluster-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density --uuid=${UUID} --iterations=3000 --timeout=6h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "name": "cluster-density-v2", + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "2268", + "EXTRA_FLAGS": "--timeout=5h" + } } ] } diff --git a/dags/openshift_nightlies/config/benchmarks/small-control-plane-mgs.json b/dags/openshift_nightlies/config/benchmarks/small-control-plane-mgs.json index 5cf1bb687..07cff7d59 100644 --- a/dags/openshift_nightlies/config/benchmarks/small-control-plane-mgs.json +++ b/dags/openshift_nightlies/config/benchmarks/small-control-plane-mgs.json @@ -2,23 +2,40 @@ "benchmarks": [ { "name": "node-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=2h --pod-ready-threshold=5s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density", + "EXTRA_FLAGS": "--pod-ready-threshold=15s --timeout=2h" + } }, { "name": "node-density-heavy", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density-heavy --uuid=${UUID} --pods-per-node=245 --timeout=2h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density-heavy", + "EXTRA_FLAGS": "--timeout=2h" + } }, { "name": "node-density-cni", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density-cni --uuid=${UUID} --pods-per-node=245 --timeout=2h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density-cni", + "EXTRA_FLAGS": "--timeout=2h" + } }, { "name": "cluster-density-v2", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=500 --timeout=3h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml --gc=false" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "216", + "EXTRA_FLAGS": "--timeout=3h" + } } ] } diff --git a/dags/openshift_nightlies/config/benchmarks/small-control-plane.json b/dags/openshift_nightlies/config/benchmarks/small-control-plane.json index 595da94b0..20ccc210c 100644 --- a/dags/openshift_nightlies/config/benchmarks/small-control-plane.json +++ b/dags/openshift_nightlies/config/benchmarks/small-control-plane.json @@ -12,23 +12,40 @@ }, { "name": "node-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density --uuid=${UUID} --pods-per-node=245 --timeout=2h --pod-ready-threshold=15s --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density", + "EXTRA_FLAGS": "--pod-ready-threshold=15s --timeout=2h" + } }, { "name": "node-density-heavy", - "workload": "kube-burner", - "custom_cmd": "kube-burner 
ocp node-density-heavy --uuid=${UUID} --pods-per-node=245 --timeout=2h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density-heavy", + "EXTRA_FLAGS": "--timeout=2h" + } }, { "name": "node-density-cni", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp node-density-cni --uuid=${UUID} --pods-per-node=245 --timeout=2h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "node-density-cni", + "EXTRA_FLAGS": "--timeout=2h" + } }, { "name": "cluster-density-v2", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density-v2 --uuid=${UUID} --iterations=500 --timeout=3h --es-server=${ES_SERVER} --es-index=ripsaw-kube-burner --user-metadata=metadata.yml --gc=false" + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "216", + "EXTRA_FLAGS": "--timeout=3h" + } } ] } diff --git a/dags/openshift_nightlies/config/benchmarks/upgrade.json b/dags/openshift_nightlies/config/benchmarks/upgrade.json index 170c41293..1d158d080 100644 --- a/dags/openshift_nightlies/config/benchmarks/upgrade.json +++ b/dags/openshift_nightlies/config/benchmarks/upgrade.json @@ -1,9 +1,15 @@ { "benchmarks": [ { - "name": "cluster-density", - "workload": "kube-burner", - "custom_cmd": "kube-burner ocp cluster-density --iterations=500 --timeout=2h --churn=false --gc=false" + "name": "cluster-density-v2", + "workload": "kube-burner-ocp-wrapper", + "command": "./run.sh", + "env": { + "WORKLOAD": "cluster-density-v2", + "ITERATIONS": "2268", + "EXTRA_FLAGS": "--timeout=2h --gc=false", + "CHURN": "false" + } }, { "name": "upgrades", diff --git a/images/airflow/Dockerfile b/images/airflow/Dockerfile index c19d06dc5..96198382c 100644 --- a/images/airflow/Dockerfile +++ b/images/airflow/Dockerfile @@ -14,7 +14,4 @@ RUN curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | b ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 -RUN curl -L $(curl -s https://api.github.com/repos/cloud-bulldozer/kube-burner/releases/latest | jq -r '.assets | map(select(.name | test("linux-x86_64"))) | .[0].browser_download_url') | tar xz -C /usr/bin kube-burner -RUN curl -L $(curl -s https://api.github.com/repos/cloud-bulldozer/k8s-netperf/releases/latest | jq -r '.assets | map(select(.name | test("Linux.*x86_64"))) | .[0].browser_download_url') | tar xz -C /usr/bin k8s-netperf -RUN curl -L $(curl -s https://api.github.com/repos/cloud-bulldozer/ingress-perf/releases/latest | jq -r '.assets | map(select(.name | test("Linux.*x86_64"))) | .[0].browser_download_url') | tar xz -C /usr/bin ingress-perf USER airflow From 8de2c3b6d5d44d757d48a211ca3745d41387f5e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Sevilla?= Date: Wed, 15 Nov 2023 13:01:02 +0100 Subject: [PATCH 23/26] Update azure configs (#353) * Use my own fork Signed-off-by: Raul Sevilla * Enable azure Signed-off-by: Raul Sevilla * Use managed-csi storageclass Signed-off-by: Raul Sevilla --------- Signed-off-by: Raul Sevilla --- .../config/install/azure/acs.json | 4 ++-- .../config/install/azure/ovn-cp.json | 4 ++-- .../config/install/azure/ovn-dp-v2.json | 24 +++++++++++++++++++ .../config/install/azure/ovn-dp.json | 4 ++-- .../config/install/azure/ovn-large-cp.json | 4 ++-- 5 files changed, 32 insertions(+), 8 
deletions(-) create mode 100644 dags/openshift_nightlies/config/install/azure/ovn-dp-v2.json diff --git a/dags/openshift_nightlies/config/install/azure/acs.json b/dags/openshift_nightlies/config/install/azure/acs.json index 0ecfd39be..5a9dbc38f 100644 --- a/dags/openshift_nightlies/config/install/azure/acs.json +++ b/dags/openshift_nightlies/config/install/azure/acs.json @@ -16,9 +16,9 @@ "openshift_workload_node_volume_size": 512, "openshift_workload_node_volume_type": "Premium_LRS", "openshift_prometheus_retention_period": "15d", - "openshift_prometheus_storage_class": "managed-premium", + "openshift_prometheus_storage_class": "managed-csi", "openshift_prometheus_storage_size": "500Gi", - "openshift_alertmanager_storage_class": "managed-premium", + "openshift_alertmanager_storage_class": "managed-csi", "openshift_alertmanager_storage_size": "20Gi", "rhacs_enable": "true", "rhacs_image_main_registry": "quay.io/rhacs-eng", diff --git a/dags/openshift_nightlies/config/install/azure/ovn-cp.json b/dags/openshift_nightlies/config/install/azure/ovn-cp.json index 5370bcb45..8142f652a 100644 --- a/dags/openshift_nightlies/config/install/azure/ovn-cp.json +++ b/dags/openshift_nightlies/config/install/azure/ovn-cp.json @@ -17,8 +17,8 @@ "openshift_workload_node_volume_size": 50, "openshift_workload_node_volume_type": "Premium_LRS", "openshift_prometheus_retention_period": "15d", - "openshift_prometheus_storage_class": "managed-premium", + "openshift_prometheus_storage_class": "managed-csi", "openshift_prometheus_storage_size": "50Gi", - "openshift_alertmanager_storage_class": "managed-premium", + "openshift_alertmanager_storage_class": "managed-csi", "openshift_alertmanager_storage_size": "2Gi" } diff --git a/dags/openshift_nightlies/config/install/azure/ovn-dp-v2.json b/dags/openshift_nightlies/config/install/azure/ovn-dp-v2.json new file mode 100644 index 000000000..29c4819d7 --- /dev/null +++ b/dags/openshift_nightlies/config/install/azure/ovn-dp-v2.json @@ -0,0 +1,24 @@ +{ + "openshift_master_count": 3, + "openshift_worker_count": 9, + "openshift_base_domain": "ats.azure.devcluster.openshift.com", + "openshift_host_prefix": "23", + "openshift_network_type": "OVNKubernetes", + "openshift_master_vm_size": "Standard_E4s_v3", + "openshift_worker_vm_size": "Standard_D8s_v3", + "openshift_master_root_volume_size": 100, + "openshift_worker_root_volume_size": 50, + "openshift_cidr": "10.128.0.0/14", + "machineset_metadata_label_prefix": "machine.openshift.io", + "openshift_infra_node_vm_size": "Standard_F16s_v2", + "openshift_infra_node_volume_size": 50, + "openshift_infra_node_volume_type": "Premium_LRS", + "openshift_workload_node_volume_size": 50, + "openshift_workload_node_volume_type": "Premium_LRS", + "openshift_prometheus_retention_period": "15d", + "openshift_prometheus_storage_class": "managed-csi", + "openshift_prometheus_storage_size": "50Gi", + "openshift_alertmanager_storage_class": "managed-csi", + "openshift_alertmanager_storage_size": "2Gi", + "openshift_toggle_workload_node": false +} diff --git a/dags/openshift_nightlies/config/install/azure/ovn-dp.json b/dags/openshift_nightlies/config/install/azure/ovn-dp.json index 369e1c205..774595b2b 100644 --- a/dags/openshift_nightlies/config/install/azure/ovn-dp.json +++ b/dags/openshift_nightlies/config/install/azure/ovn-dp.json @@ -17,8 +17,8 @@ "openshift_workload_node_volume_size": 50, "openshift_workload_node_volume_type": "Premium_LRS", "openshift_prometheus_retention_period": "15d", - "openshift_prometheus_storage_class": 
"managed-premium", + "openshift_prometheus_storage_class": "managed-csi", "openshift_prometheus_storage_size": "50Gi", - "openshift_alertmanager_storage_class": "managed-premium", + "openshift_alertmanager_storage_class": "managed-csi", "openshift_alertmanager_storage_size": "2Gi" } diff --git a/dags/openshift_nightlies/config/install/azure/ovn-large-cp.json b/dags/openshift_nightlies/config/install/azure/ovn-large-cp.json index 725e52b06..879e078f9 100644 --- a/dags/openshift_nightlies/config/install/azure/ovn-large-cp.json +++ b/dags/openshift_nightlies/config/install/azure/ovn-large-cp.json @@ -17,8 +17,8 @@ "openshift_workload_node_volume_size": 512, "openshift_workload_node_volume_type": "Premium_LRS", "openshift_prometheus_retention_period": "15d", - "openshift_prometheus_storage_class": "managed-premium", + "openshift_prometheus_storage_class": "managed-csi", "openshift_prometheus_storage_size": "500Gi", - "openshift_alertmanager_storage_class": "managed-premium", + "openshift_alertmanager_storage_class": "managed-csi", "openshift_alertmanager_storage_size": "20Gi" } From 149a40812f58edbc009287cc389e1b7171c64f43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Sevilla?= Date: Thu, 8 Feb 2024 17:35:36 +0100 Subject: [PATCH 24/26] Migrate release_stream_base_url logic (#355) Signed-off-by: Raul Sevilla --- dags/openshift_nightlies/util/manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/openshift_nightlies/util/manifest.py b/dags/openshift_nightlies/util/manifest.py index c99001352..eea92ff36 100644 --- a/dags/openshift_nightlies/util/manifest.py +++ b/dags/openshift_nightlies/util/manifest.py @@ -23,7 +23,7 @@ def get_latest_releases(self): self.latest_releases = {} for stream in release_streams: # ARM binaries under its own CI. 
- base_url_arm = self.release_stream_base_url.replace("openshift-release",f"openshift-release-{self.ARM64}") + base_url_arm = self.release_stream_base_url.replace("amd64", f"{self.ARM64}") stream_arm = f"{stream}-{self.ARM64}" latest_accepted_release,latest_accepted_release_url = self.request_for_payload(f"{base_url_arm}/{stream_arm}/latest") self.latest_releases[stream_arm] = { From 20703db5f1facc3babe6c9525f2b1a2b4657c273 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Sevilla?= Date: Wed, 14 Feb 2024 22:49:48 +0100 Subject: [PATCH 25/26] Remove snappy dependency and update docs (#356) Signed-off-by: Raul Sevilla --- dags/openshift_nightlies/dag.py | 20 +------- dags/openshift_nightlies/docs/secrets.md | 44 ++++++++-------- .../tasks/utils/defaults.json | 14 ------ .../tasks/utils/scale_ci_diagnosis.py | 50 ------------------- 4 files changed, 23 insertions(+), 105 deletions(-) delete mode 100644 dags/openshift_nightlies/tasks/utils/defaults.json delete mode 100644 dags/openshift_nightlies/tasks/utils/scale_ci_diagnosis.py diff --git a/dags/openshift_nightlies/dag.py b/dags/openshift_nightlies/dag.py index 9e6f5dd71..48428c32b 100644 --- a/dags/openshift_nightlies/dag.py +++ b/dags/openshift_nightlies/dag.py @@ -19,7 +19,7 @@ from openshift_nightlies.tasks.install.hypershift import hypershift from openshift_nightlies.tasks.install.prebuilt import initialize_cluster from openshift_nightlies.tasks.benchmarks import e2e -from openshift_nightlies.tasks.utils import rosa_post_install, scale_ci_diagnosis, final_dag_status +from openshift_nightlies.tasks.utils import rosa_post_install, final_dag_status from openshift_nightlies.util import constants, manifest from abc import ABC, abstractmethod @@ -68,9 +68,6 @@ def _get_openshift_installer(self): def _get_e2e_benchmarks(self): return e2e.E2EBenchmarks(self.dag, self.config, self.release) - def _get_scale_ci_diagnosis(self): - return scale_ci_diagnosis.Diagnosis(self.dag, self.config, self.release) - def _get_rosa_postinstall_setup(self): return rosa_post_install.Diagnosis(self.dag, self.config, self.release) @@ -84,11 +81,7 @@ def build(self): with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks: benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks() - must_gather = self._get_scale_ci_diagnosis().get_must_gather("must-gather") chain(*benchmark_tasks) - # Configure must_gather as downstream of all benchmark tasks - for benchmark in benchmark_tasks: - benchmark >> must_gather if self.config.cleanup_on_success: cleanup_cluster = installer.get_cleanup_task() @@ -159,12 +152,8 @@ def build(self): install_cluster = installer.get_install_task() final_status = final_dag_status.get_task(self.dag) with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks: - must_gather = self._get_scale_ci_diagnosis().get_must_gather("must-gather") benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks() chain(*benchmark_tasks) - # Configure must_gather as downstream of all benchmark tasks - for benchmark in benchmark_tasks: - benchmark >> must_gather rosa_post_installation = self._get_rosa_postinstall_setup()._get_rosa_postinstallation() if self.config.cleanup_on_success: cleanup_cluster = installer.get_cleanup_task() @@ -208,11 +197,7 @@ def build(self): final_status = final_dag_status.get_task(self.dag) with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks: benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks() - must_gather = 
self._get_scale_ci_diagnosis().get_must_gather("must-gather") chain(*benchmark_tasks) - # Configure must_gather as downstream of all benchmark tasks - for benchmark in benchmark_tasks: - benchmark >> must_gather if self.config.cleanup_on_success: cleanup_cluster = installer.get_cleanup_task() install_cluster >> benchmarks >> cleanup_cluster >> final_status @@ -285,11 +270,8 @@ def build(self): installer = self._get_openshift_installer() initialize_cluster = installer.initialize_cluster_task() with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks: - must_gather = self._get_scale_ci_diagnosis().get_must_gather("must-gather") benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks() chain(*benchmark_tasks) - for benchmark in benchmark_tasks: - benchmark >> must_gather initialize_cluster >> benchmarks def _get_openshift_installer(self): diff --git a/dags/openshift_nightlies/docs/secrets.md b/dags/openshift_nightlies/docs/secrets.md index f0054a650..45d7c2c23 100644 --- a/dags/openshift_nightlies/docs/secrets.md +++ b/dags/openshift_nightlies/docs/secrets.md @@ -172,12 +172,33 @@ Schema: ``` +--- +Key: `rosa_creds` + +Type: JSON + +Description: Token to interact with OCM to deploy clusters via ROSA + +Used by: install, cleanup + +Platforms: ROSA (aws) + +Schema: + +```json +{ + "rosa_token_": "string" +} +``` + +This object han have different keys for the different ROSA environments, for example, a token for the `staging` environment is specified by the key `rosa_token_staging`. + --- Key: `openshift_install_config` Type: JSON -Description: Common openshift install configurations that aren't configurable +Description: Common openshift install configurations that aren't configurable. `openshift_install_pull_secret` should be defined here Used by: install, cleanup @@ -212,24 +233,3 @@ Platforms: Cloud (all), Openstack Schema: Fully qualified URL ---- -Key: `snappy_creds` - -Type: JSON - -Description: Credentials for snappy server that houses cluster artifacts we wish to keep after the cluster is destroyed - -Used by: scale_ci_diagnosis - -Platforms: All - -Schema: - -```json -{ - "username": "string", - "server": "string", - "password": "string" -} - -``` diff --git a/dags/openshift_nightlies/tasks/utils/defaults.json b/dags/openshift_nightlies/tasks/utils/defaults.json deleted file mode 100644 index 047dd24ee..000000000 --- a/dags/openshift_nightlies/tasks/utils/defaults.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "must_gather": - { - "name": "must-gather", - "workload": "scale-ci-diagnosis", - "command": "./ocp_diagnosis.sh", - "env": { - "PROMETHEUS_CAPTURE": "false", - "PROMETHEUS_CAPTURE_TYPE": "full", - "OPENSHIFT_MUST_GATHER": "true", - "STORAGE_MODE": "snappy" - } - } -} diff --git a/dags/openshift_nightlies/tasks/utils/scale_ci_diagnosis.py b/dags/openshift_nightlies/tasks/utils/scale_ci_diagnosis.py deleted file mode 100644 index e8cb5576d..000000000 --- a/dags/openshift_nightlies/tasks/utils/scale_ci_diagnosis.py +++ /dev/null @@ -1,50 +0,0 @@ -from os import environ -from openshift_nightlies.util import var_loader, executor, constants -from openshift_nightlies.models.release import OpenshiftRelease -from common.models.dag_config import DagConfig - -from airflow.operators.bash import BashOperator - - -class Diagnosis(): - - def __init__(self, dag, config: DagConfig, release: OpenshiftRelease): - # General DAG Configuration - self.dag = dag - self.release = release - self.config = config - self.exec_config = 
executor.get_executor_config_with_cluster_access(self.config, self.release) - self.snappy_creds = var_loader.get_secret("snappy_creds", deserialize_json=True) - - # Specific Task Configuration - self.vars = var_loader.build_task_vars( - release=self.release, task="utils")["must_gather"] - self.git_name = self._git_name() - self.env = { - "SNAPPY_DATA_SERVER_URL": self.snappy_creds['server'], - "SNAPPY_DATA_SERVER_USERNAME": self.snappy_creds['username'], - "SNAPPY_DATA_SERVER_PASSWORD": self.snappy_creds['password'], - "SNAPPY_USER_FOLDER": self.git_name - - } - self.env.update(self.config.dependencies) - - def _git_name(self): - git_username = var_loader.get_git_user() - if git_username == 'cloud-bulldozer': - return "perf-ci" - else: - return f"{git_username}" - - def get_must_gather(self, task_id): - env = {**self.env, **self.vars.get('env', {}), **{"ES_SERVER": var_loader.get_secret('elasticsearch')}, **{"KUBEADMIN_PASSWORD": environ.get("KUBEADMIN_PASSWORD", "")}} - return BashOperator( - task_id=task_id, - depends_on_past=False, - bash_command=f"{constants.root_dag_dir}/scripts/utils/run_scale_ci_diagnosis.sh -w {self.vars['workload']} -c {self.vars['command']} ", - retries=3, - dag=self.dag, - env=env, - executor_config=self.exec_config, - trigger_rule="one_failed" - ) From 9ed28caa4acdd8b6666e36777b500850b413e608 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Sevilla?= Date: Wed, 14 Feb 2024 22:57:13 +0100 Subject: [PATCH 26/26] Remove references to the snappy_creds secret (#357) * Remove references to the snappy_creds secret Signed-off-by: Raul Sevilla * Remove remaining snappy references Signed-off-by: Raul Sevilla --------- Signed-off-by: Raul Sevilla --- dags/nocp/README.md | 1 - dags/nocp/docs/nocp_benchmarks.md | 2 +- dags/nocp/tasks/benchmarks/nocp.py | 5 ----- .../config/baremetal-benchmarks/webfuse-bench.json | 1 - dags/openshift_nightlies/scripts/run_benchmark.sh | 1 - .../scripts/utils/run_scale_ci_diagnosis.sh | 1 - dags/openshift_nightlies/tasks/benchmarks/defaults.json | 9 +++------ dags/openshift_nightlies/tasks/benchmarks/e2e.py | 5 ----- 8 files changed, 4 insertions(+), 21 deletions(-) diff --git a/dags/nocp/README.md b/dags/nocp/README.md index 46d9e4e81..1a4ac9308 100644 --- a/dags/nocp/README.md +++ b/dags/nocp/README.md @@ -47,7 +47,6 @@ This DAG is used for OCM testing. Modules in this DAG - * Each test is called with a timeout. ```Timeout = test duration + 10 minutes``` This extra 10 minutes help test to create necessary result files after running the test for given duration -* ```Automation.py``` uploads all result files to snappy server * Kube burner is used to pull metrics from clusters and account manager services and push to observability ES. * Finally it displays dashboard URLs for these metrics * At the end, this script returns UUID and test result to airflow. UUID is used in the next task (i.e cleanup task) diff --git a/dags/nocp/docs/nocp_benchmarks.md b/dags/nocp/docs/nocp_benchmarks.md index 293c796c1..72a75844a 100644 --- a/dags/nocp/docs/nocp_benchmarks.md +++ b/dags/nocp/docs/nocp_benchmarks.md @@ -7,7 +7,7 @@ Some workloads doesn't need OCP environemnt. For example, ocm-api-load which tes * tool is run from the jump host * user has to manually setup needed packages in the jump host * airflow will run only one task i.e benchmark task -* user provided script for benchmark task should handle everything like triggering the tool on jump host, scrapping metrics, pushing data to snappy and cleanup. 
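Beyond the doc wording, the nocp.py and e2e.py hunks that follow drop the snappy credential plumbing from the benchmark task environment. As a rough sketch (not the repo's actual module), the environment handed to a NOCP benchmark task reduces to the keys that are still consumed:

```python
# Illustrative sketch only: the NOCP benchmark task environment once the
# SNAPPY_DATA_SERVER_* and SNAPPY_USER_FOLDER keys are removed. The values
# below are placeholders; in the DAG they come from var_loader and secrets.
def build_nocp_env(git_name: str, task_group: str, es_server_baseline: str) -> dict:
    return {
        "GIT_USER": git_name,                      # kept
        "TASK_GROUP": task_group,                  # kept, e.g. "benchmarks"
        "ES_SERVER_BASELINE": es_server_baseline,  # kept
        # SNAPPY_DATA_SERVER_URL, SNAPPY_DATA_SERVER_USERNAME,
        # SNAPPY_DATA_SERVER_PASSWORD and SNAPPY_USER_FOLDER are gone.
    }


print(build_nocp_env("perf-ci", "benchmarks", "https://example-es.local"))
```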
+* user provided script for benchmark task should handle everything like triggering the tool on jump host or scrapping metrics ## Adding the new workload diff --git a/dags/nocp/tasks/benchmarks/nocp.py b/dags/nocp/tasks/benchmarks/nocp.py index bac51445e..03ee0183b 100644 --- a/dags/nocp/tasks/benchmarks/nocp.py +++ b/dags/nocp/tasks/benchmarks/nocp.py @@ -23,7 +23,6 @@ def __init__(self, app, dag, config: DagConfig, task_group="benchmarks"): self.dag = dag self.task_group = task_group self.dag_config = config - self.snappy_creds = var_loader.get_secret("snappy_creds", deserialize_json=True) self.es_server_baseline = var_loader.get_secret("es_server_baseline") # Specific Task Configuration @@ -31,10 +30,6 @@ def __init__(self, app, dag, config: DagConfig, task_group="benchmarks"): app, task=self.task_group) self.git_name=self._git_name() self.env = { - "SNAPPY_DATA_SERVER_URL": self.snappy_creds['server'], - "SNAPPY_DATA_SERVER_USERNAME": self.snappy_creds['username'], - "SNAPPY_DATA_SERVER_PASSWORD": self.snappy_creds['password'], - "SNAPPY_USER_FOLDER": self.git_name, "GIT_USER": self.git_name, "TASK_GROUP": self.task_group, "ES_SERVER_BASELINE": self.es_server_baseline, diff --git a/dags/openshift_nightlies/config/baremetal-benchmarks/webfuse-bench.json b/dags/openshift_nightlies/config/baremetal-benchmarks/webfuse-bench.json index 60a88602c..503bf2cf8 100644 --- a/dags/openshift_nightlies/config/baremetal-benchmarks/webfuse-bench.json +++ b/dags/openshift_nightlies/config/baremetal-benchmarks/webfuse-bench.json @@ -8,7 +8,6 @@ "env": { "WORKLOAD": "pod2svc", "COMPARE": "false", - "ENABLE_SNAPPY_BACKUP": "false", "MULTI_AZ": "false" } }, diff --git a/dags/openshift_nightlies/scripts/run_benchmark.sh b/dags/openshift_nightlies/scripts/run_benchmark.sh index 50a310778..2b16b5d01 100755 --- a/dags/openshift_nightlies/scripts/run_benchmark.sh +++ b/dags/openshift_nightlies/scripts/run_benchmark.sh @@ -21,7 +21,6 @@ setup(){ export KUBECONFIG=/home/airflow/workspace/config export GSHEET_KEY_LOCATION=/tmp/key.json export RUN_ID=${AIRFLOW_CTX_DAG_ID}/${AIRFLOW_CTX_DAG_RUN_ID}/$AIRFLOW_CTX_TASK_ID - export SNAPPY_RUN_ID=${AIRFLOW_CTX_DAG_ID}/${AIRFLOW_CTX_DAG_RUN_ID} echo "cpt: true" > metadata.yml curl -sS https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-client-linux.tar.gz | tar xz oc diff --git a/dags/openshift_nightlies/scripts/utils/run_scale_ci_diagnosis.sh b/dags/openshift_nightlies/scripts/utils/run_scale_ci_diagnosis.sh index 3f388b222..95ed0a053 100755 --- a/dags/openshift_nightlies/scripts/utils/run_scale_ci_diagnosis.sh +++ b/dags/openshift_nightlies/scripts/utils/run_scale_ci_diagnosis.sh @@ -21,7 +21,6 @@ setup(){ export KUBECONFIG=/home/airflow/workspace/config export BUILD_NUMBER=test export RUN_ID=${AIRFLOW_CTX_DAG_ID}/${AIRFLOW_CTX_DAG_RUN_ID}/$AIRFLOW_CTX_TASK_ID - export SNAPPY_RUN_ID=${AIRFLOW_CTX_DAG_ID}/${AIRFLOW_CTX_DAG_RUN_ID} curl -sS https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-client-linux.tar.gz | tar xz oc diff --git a/dags/openshift_nightlies/tasks/benchmarks/defaults.json b/dags/openshift_nightlies/tasks/benchmarks/defaults.json index 47474ffd5..2898fce35 100644 --- a/dags/openshift_nightlies/tasks/benchmarks/defaults.json +++ b/dags/openshift_nightlies/tasks/benchmarks/defaults.json @@ -83,8 +83,7 @@ "GOLD_SDN": "openshiftsdn", "COMPARE_WITH_GOLD": "true", "EMAIL_ID_FOR_RESULTS_SHEET": "msheth@redhat.com", - "GSHEET_KEY_LOCATION": "/tmp/key.json", - "ENABLE_SNAPPY_BACKUP": "true" + "GSHEET_KEY_LOCATION": 
"/tmp/key.json" } }, { @@ -96,8 +95,7 @@ "GOLD_SDN": "openshiftsdn", "COMPARE_WITH_GOLD": "true", "EMAIL_ID_FOR_RESULTS_SHEET": "msheth@redhat.com", - "GSHEET_KEY_LOCATION": "/tmp/key.json", - "ENABLE_SNAPPY_BACKUP": "true" + "GSHEET_KEY_LOCATION": "/tmp/key.json" } }, { @@ -109,8 +107,7 @@ "GOLD_SDN": "openshiftsdn", "COMPARE_WITH_GOLD": "true", "EMAIL_ID_FOR_RESULTS_SHEET": "msheth@redhat.com", - "GSHEET_KEY_LOCATION": "/tmp/key.json", - "ENABLE_SNAPPY_BACKUP": "true" + "GSHEET_KEY_LOCATION": "/tmp/key.json" } }, { diff --git a/dags/openshift_nightlies/tasks/benchmarks/e2e.py b/dags/openshift_nightlies/tasks/benchmarks/e2e.py index 9608888bc..648db7cc2 100644 --- a/dags/openshift_nightlies/tasks/benchmarks/e2e.py +++ b/dags/openshift_nightlies/tasks/benchmarks/e2e.py @@ -23,7 +23,6 @@ def __init__(self, dag, config: DagConfig, release: OpenshiftRelease, task_group self.release = release self.task_group = task_group self.dag_config = config - self.snappy_creds = var_loader.get_secret("snappy_creds", deserialize_json=True) self.es_server_baseline = var_loader.get_secret("es_server_baseline") self.gsheet = var_loader.get_secret("gsheet_key") @@ -38,10 +37,6 @@ def __init__(self, dag, config: DagConfig, release: OpenshiftRelease, task_group release=self.release, task=self.task_group) self.git_name=self._git_name() self.env = { - "SNAPPY_DATA_SERVER_URL": self.snappy_creds['server'], - "SNAPPY_DATA_SERVER_USERNAME": self.snappy_creds['username'], - "SNAPPY_DATA_SERVER_PASSWORD": self.snappy_creds['password'], - "SNAPPY_USER_FOLDER": self.git_name, "PLATFORM": self.release.platform, "TASK_GROUP": self.task_group, "ES_SERVER_BASELINE": self.es_server_baseline