Merge branch 'cloud-bulldozer:master' into cmmo-dag
krishvoor authored Jul 6, 2023
2 parents 4312d4f + 12b5201 · commit c3eee05
Showing 17 changed files with 36 additions and 86 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/release.yml
@@ -1,10 +1,12 @@
 name: Release Airflow Images
 on:
+  schedule: # triggers every midnight
+    - cron: '0 0 * * *'
   push:
     branches:
       - master
     tags:
-      - "*" # triggers only if push new tag version
+      - "*" # triggers on a push event
 
 jobs:
   containers:
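For reference, GitHub Actions evaluates `schedule` crons in UTC, so the added `0 0 * * *` entry (minute, hour, day-of-month, month, day-of-week) fires once per day at 00:00 UTC.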
2 changes: 1 addition & 1 deletion Makefile
@@ -1,6 +1,6 @@
 QUAY_ACCOUNT ?= quay.io/cloud-bulldozer
 IMAGE_BUILDER ?= podman
-AIRFLOW_VERSION ?= 2.3.2
+AIRFLOW_VERSION ?= 2.6.2
 AIRFLOW_PYTHON_VERSION ?= python3.8
 AIRFLOW_IMAGE_TAG ?= $(AIRFLOW_VERSION)-$(AIRFLOW_PYTHON_VERSION)
 IMAGE_TAG ?= $(AIRFLOW_VERSION)
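Since every variable here uses Make's conditional `?=` assignment, callers can override any of them per invocation without editing the Makefile. A minimal sketch — the `build` target name is an assumption, not shown in this diff:

    # Build with docker instead of podman, against the new Airflow version
    # ("build" is an assumed target; substitute whatever target the Makefile defines):
    make build IMAGE_BUILDER=docker AIRFLOW_VERSION=2.6.2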
2 changes: 1 addition & 1 deletion dags/common/models/dag_config.py
@@ -20,6 +20,6 @@ class DagConfig:
     })
     executor_image: Optional[dict] = field(default_factory=lambda: {
         "repository": "quay.io/cloud-bulldozer",
-        "tag": "2.3.2"
+        "tag": "2.6.2"
     })
     dependencies: Optional[dict] = field(default_factory=lambda: {})
2 changes: 1 addition & 1 deletion dags/nocp/manifest.yaml
@@ -7,4 +7,4 @@ dagConfig:
   cleanupOnSuccess: true
   executorImages:
     repository: quay.io/cloud-bulldozer
-    tag: 2.3.2
+    tag: 2.6.2
5 changes: 1 addition & 4 deletions dags/nocp/scripts/run_ocm_api_load.sh
@@ -89,15 +89,12 @@ run_ocm_api_load(){
 
         # Timeout runs ocm-load-test for the specified duration even if airflow killed this script (when user wants to stop benchmark execution). This helps in ocm-load-test to cleanup resources it created. 10 minutes extra timeout is set so that test can prepare results after running for the given duration.
         # kill-after option needs sudo permissions
-        timeout --kill-after=60s --preserve-status $(((tduration + 10) * 60)) $TESTDIR/build/ocm-load-test --aws-region $AWS_DEFAULT_REGION --aws-account-id $AWS_ACCOUNT_ID --aws-access-key $AWS_OSDCCADMIN_KEY --aws-access-secret $AWS_OSDCCADMIN_SECRET --cooldown $COOLDOWN --duration $tduration --elastic-index ocm-request-test --elastic-insecure-skip-verify=true --elastic-server $ES_SERVER --gateway-url $GATEWAY_URL --ocm-token $OCM_TOKEN --ocm-token-url $OCM_TOKEN_URL --output-path $TESTDIR/results --rate $trate --test-id $UUID --test-names $tname $rampoptions
+        timeout --kill-after=60s --preserve-status $(((tduration + 10) * 60)) $TESTDIR/build/ocm-load-test --aws-region $AWS_DEFAULT_REGION --aws-account-id $AWS_ACCOUNT_ID --aws-access-key $AWS_OSDCCADMIN_KEY --aws-access-secret $AWS_OSDCCADMIN_SECRET --cooldown $COOLDOWN --duration $tduration --elastic-index ocm-load-metrics --elastic-insecure-skip-verify=true --elastic-server $ES_SERVER --gateway-url $GATEWAY_URL --ocm-token $OCM_TOKEN --ocm-token-url $OCM_TOKEN_URL --output-path $TESTDIR/results --rate $trate --test-id $UUID --test-names $tname $rampoptions
         sleep $COOLDOWN
     done
     benchmark_rv=$?
     end_time=$(date +%s)
 
-    echo "Uploading Result files..."
-    python3 $TESTDIR/automation.py upload --dir $TESTDIR/results --server ${SNAPPY_DATA_SERVER_URL} --user ${SNAPPY_DATA_SERVER_USERNAME} --password ${SNAPPY_DATA_SERVER_PASSWORD}
-
     # scraping metrics
     export KUBE_ES_INDEX=ocm-uhc-acct-mngr
     envsubst < $TESTDIR/ci/templates/kube-burner-config.yaml > $TESTDIR/kube-burner-am-config.yaml
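The `timeout` pattern above is easy to get wrong, so here is a minimal standalone sketch of the same semantics (`sleep 1d` stands in for the real test binary):

    # Allow the command its duration plus a 10-minute grace period; if it
    # ignores the SIGTERM sent at expiry, SIGKILL follows 60 seconds later.
    # --preserve-status reports the command's own exit status rather than
    # timeout's generic 124.
    tduration=30   # minutes (illustrative)
    timeout --kill-after=60s --preserve-status $(((tduration + 10) * 60)) sleep 1d
    echo "exit status: $?"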
2 changes: 1 addition & 1 deletion dags/openshift_nightlies/manifest.yaml
@@ -25,7 +25,7 @@ dagConfig:
   cleanupOnSuccess: true
   executorImages:
     repository: quay.io/cloud-bulldozer
-    tag: 2.3.2
+    tag: 2.6.2
   dependencies:
     e2e_benchmarking:
       repo: https://github.com/cloud-bulldozer/e2e-benchmarking.git
51 changes: 7 additions & 44 deletions dags/openshift_nightlies/scripts/install/rosa.sh
@@ -105,38 +105,6 @@ _login_check(){
     echo "Failed to login after 100 attempts with 5 sec interval"
 }
 
-_wait_for_workload_nodes_ready(){
-    _download_kubeconfig "$(_get_cluster_id $1)" ./kubeconfig
-    export KUBECONFIG=./kubeconfig
-    ALL_READY_ITERATIONS=0
-    ITERATIONS=0
-    # Node count is number of workload nodes, which is 3
-    NODES_COUNT=3
-    # 180 seconds per node, waiting 5 times 60 seconds (5*60 = 5 minutes) with all nodes ready to finalize
-    while [ ${ITERATIONS} -le ${NODES_COUNT} ] ; do
-        NODES_READY_COUNT=$(oc get nodes | grep -i workload | grep " Ready " | wc -l)
-        if [ ${NODES_READY_COUNT} -ne ${NODES_COUNT} ] ; then
-            echo "WARNING: ${ITERATIONS}/${NODES_COUNT} iterations. ${NODES_READY_COUNT}/${NODES_COUNT} nodes ready. Waiting 180 seconds for next check"
-            ALL_READY_ITERATIONS=0
-            ITERATIONS=$((${ITERATIONS}+1))
-            sleep 180
-        else
-            if [ ${ALL_READY_ITERATIONS} -eq 5 ] ; then
-                echo "INFO: ${ALL_READY_ITERATIONS}/5. All nodes ready, continuing process"
-                return 0
-            else
-                echo "INFO: ${ALL_READY_ITERATIONS}/5. All nodes ready. Waiting 60 seconds for next check"
-                ALL_READY_ITERATIONS=$((${ALL_READY_ITERATIONS}+1))
-                sleep 60
-            fi
-        fi
-    done
-    END_CLUSTER_STATUS="Ready. No Workers"
-    echo "ERROR: Workload nodes (${NODES_READY_COUNT}/${NODES_COUNT}) are ready after about $((${NODES_COUNT}*3)) minutes, dumping oc get nodes..."
-    oc get nodes
-    exit 1
-}
-
 _wait_for_cluster_ready(){
     START_TIMER=$(date +%s)
     echo "INFO: Installation starts at $(date -d @${START_TIMER})"
@@ -440,7 +408,6 @@ install(){
 postinstall(){
     # sleeping to address issue #324
     sleep 120
-    export WORKLOAD_TYPE=$(cat ${json_file} | jq -r .openshift_workload_node_instance_type)
     export EXPIRATION_TIME=$(cat ${json_file} | jq -r .rosa_expiration_time)
     _download_kubeconfig "$(_get_cluster_id ${CLUSTER_NAME})" ./kubeconfig
     unset KUBECONFIG
@@ -450,10 +417,6 @@ postinstall(){
         export PASSWORD=$(echo ${CLUSTER_NAME} | md5sum | awk '{print $1}')
         ocm create idp -n localauth -t htpasswd --username kubeadmin --password ${PASSWORD} -c ${CLUSTER_NAME}
         ocm create user kubeadmin -c "$(_get_cluster_id ${CLUSTER_NAME})" --group=cluster-admins
-        if [[ $WORKLOAD_TYPE != "null" ]]; then
-            # create machinepool for workload nodes
-            ocm create machinepool -c ${CLUSTER_NAME} --instance-type ${WORKLOAD_TYPE} --labels 'node-role.kubernetes.io/workload=' --taints 'role=workload:NoSchedule' --replicas 3 workload
-        fi
         # set expiration time
         EXPIRATION_STRING=$(date -d "${EXPIRATION_TIME} minutes" '+{"expiration_timestamp": "%FT%TZ"}')
         ocm patch /api/clusters_mgmt/v1/clusters/"$(_get_cluster_id ${CLUSTER_NAME})" <<< ${EXPIRATION_STRING}
@@ -463,10 +426,6 @@ postinstall(){
         URL=$(rosa describe cluster -c $CLUSTER_NAME --output json | jq -r ".api.url")
         PASSWORD=$(rosa create admin -c "$(_get_cluster_id ${CLUSTER_NAME})" -y 2>/dev/null | grep "oc login" | awk '{print $7}')
         if [ $HCP == "true" ]; then _login_check $URL $PASSWORD; fi
-        if [[ $WORKLOAD_TYPE != "null" ]]; then
-            # create machinepool for workload nodes
-            rosa create machinepool -c ${CLUSTER_NAME} --instance-type ${WORKLOAD_TYPE} --name workload --labels node-role.kubernetes.io/workload= --taints role=workload:NoSchedule --replicas 3
-        fi
         # set expiration to 24h
         rosa edit cluster -c "$(_get_cluster_id ${CLUSTER_NAME})" --expiration=${EXPIRATION_TIME}m
     fi
@@ -711,13 +670,17 @@ if [[ "$operation" == "install" ]]; then
         install
         index_metadata
     fi
-    if [[ $WORKLOAD_TYPE != "null" ]]; then _wait_for_workload_nodes_ready ${CLUSTER_NAME}; fi
 elif [ "${CLUSTER_STATUS}" == "ready" ] ; then
     printf "INFO: Cluster ${CLUSTER_NAME} already installed and ready, reusing..."
-    postinstall
+    postinstall
+elif [ "${CLUSTER_STATUS}" == "error" ] ; then
+    printf "INFO: Cluster ${CLUSTER_NAME} errored, cleaning them now..."
+    cleanup
+    printf "INFO: Fail this install to re-try a fresh install"
+    exit 1
 else
     printf "INFO: Cluster ${CLUSTER_NAME} already installed but not ready, exiting..."
-    exit 1
+    exit 1
 fi
 
 elif [[ "$operation" == "cleanup" ]]; then
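The new `error` branch makes a failed install self-recovering: the errored cluster is cleaned up and the script exits non-zero, so a retried Airflow task starts from a fresh install instead of reusing a half-built cluster. The removal of `_wait_for_workload_nodes_ready`, its call site, and both machinepool blocks matches the deleted `WORKLOAD_TYPE` plumbing above.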
4 changes: 0 additions & 4 deletions
@@ -27,10 +27,6 @@ setup(){
     curl -sS https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-client-linux.tar.gz | tar xz oc
 
     export PATH=$PATH:$(pwd)
-
-    if [[ ! -z "$KUBEADMIN_PASSWORD" ]]; then
-        oc login -u kubeadmin -p $KUBEADMIN_PASSWORD --insecure-skip-tls-verify
-    fi
 }
 
 setup
6 changes: 6 additions & 0 deletions dags/requirements.txt
@@ -0,0 +1,6 @@
+kubernetes>=25.0.0
+apache-airflow==2.6.2
+prometheus-api-client==0.5.2
+elasticsearch==7.13.4
+apache-airflow-providers-slack==7.3.0
+pydantic>=1.10.0,<2.0.0 # https://github.com/apache/airflow/issues/32311
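The `pydantic<2.0.0` cap works around the Airflow 2.6 incompatibility tracked in the linked issue. A sketch of installing the pinned set — the constraints URL follows Airflow's documented pattern and is an assumption here, not part of this repo:

    # Install DAG dependencies; the constraints file keeps transitive
    # Airflow deps at versions tested upstream (assumed URL pattern):
    pip install -r dags/requirements.txt \
      --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.6.2/constraints-3.8.txt"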
9 changes: 0 additions & 9 deletions dags/setup.cfg
@@ -20,15 +20,6 @@ home-page = https://github.com/cloud-bulldozer/benchmark-operator/cli
 zip_safe = False
 packages = find:
 include_package_data = True
-# Add here dependencies of your project (semicolon/line-separated), e.g.
-install_requires =
-    kubernetes>=25.0.0
-    apache-airflow==2.3.2
-    prometheus-api-client==0.5.2
-    elasticsearch==7.13.4
-    apache-airflow-providers-slack==7.1.0
-    markupsafe==2.0.1
-
 python_requires = >=3.8
 
 [options.extras_require]
8 changes: 1 addition & 7 deletions dags/tox.ini
@@ -8,13 +8,7 @@ extras =
     tests
 setenv =
     py{38,39}-unit: COVERAGE_FILE = .coverage.{envname}
-deps =
-    kubernetes>=25.0.0
-    apache-airflow==2.3.2
-    prometheus-api-client==0.5.2
-    elasticsearch==7.13.4
-    apache-airflow-providers-slack==7.1.0
-    markupsafe==2.0.1
+deps = -r{toxinidir}/requirements.txt
 
 python_requires = >=3.8
 
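Pointing `deps` at the new `dags/requirements.txt` leaves a single source of truth for version pins (the duplicated lists in setup.cfg and tox.ini are deleted above). Invocation is unchanged; the environment name below comes from the `py{38,39}-unit` factor:

    tox -e py38-unit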
2 changes: 1 addition & 1 deletion images/airflow-ansible/Dockerfile
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=quay.io/cloud-bulldozer/airflow:2.3.2
+ARG BASE_IMAGE
 FROM ${BASE_IMAGE}
 USER root
 RUN apt install bc awscli -y
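With the default gone, `BASE_IMAGE` must now be passed explicitly, so this image can no longer silently build on a stale Airflow base. A sketch using the Makefile's defaults (`podman`, tag `$(AIRFLOW_VERSION)-$(AIRFLOW_PYTHON_VERSION)`); the output tag is an assumption:

    podman build \
      --build-arg BASE_IMAGE=quay.io/cloud-bulldozer/airflow:2.6.2-python3.8 \
      -t quay.io/cloud-bulldozer/airflow-ansible:2.6.2 \
      images/airflow-ansible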
6 changes: 3 additions & 3 deletions images/airflow-managed-services/Dockerfile
@@ -1,7 +1,7 @@
-ARG BASE_IMAGE=quay.io/cloud-bulldozer/airflow:2.3.2
+ARG BASE_IMAGE=quay.io/cloud-bulldozer/airflow:latest
 # Hypershift Compilation
-FROM golang:1.18 AS hypershift
-RUN git clone --branch main https://github.com/openshift/hypershift
+FROM golang:1.19 AS hypershift
+RUN git clone --single-branch --branch main https://github.com/openshift/hypershift --depth=1
 WORKDIR hypershift
 RUN make build
 # Runtime image
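The added clone flags are a build-speed optimization for the hypershift stage; standalone, the equivalent is:

    # --depth=1 fetches only the tip commit and --single-branch restricts the
    # fetch to main, so the build stage downloads a fraction of the history.
    git clone --single-branch --branch main --depth=1 https://github.com/openshift/hypershift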
5 changes: 2 additions & 3 deletions images/airflow/Dockerfile
@@ -14,7 +14,6 @@ RUN curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
 
 ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
-
-RUN curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.6/kube-burner-1.6-Linux-x86_64.tar.gz | tar xz -C /usr/bin kube-burner
-RUN curl -L https://github.com/jtaleric/k8s-netperf/releases/download/v0.0.7/k8s-netperf_0.0.7_linux_amd64.tar.gz | tar xz -C /usr/bin k8s-netperf
+RUN curl -L https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.7.2/kube-burner-V1.7.2-linux-x86_64.tar.gz | tar xz -C /usr/bin kube-burner
+RUN curl -L https://github.com/cloud-bulldozer/k8s-netperf/releases/download/v0.1.11/k8s-netperf_Linux_v0.1.11_x86_64.tar.gz | tar xz -C /usr/bin k8s-netperf
 USER airflow
 RUN pip install prometheus-api-client elasticsearch apache-airflow-providers-elasticsearch apache-airflow-providers-cncf-kubernetes --upgrade
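A quick smoke test that the bumped binaries landed where expected — the image tag is an assumption, and `version`/`--help` are assumed entry points for these tools:

    podman run --rm quay.io/cloud-bulldozer/airflow:2.6.2-python3.8 kube-burner version
    podman run --rm quay.io/cloud-bulldozer/airflow:2.6.2-python3.8 k8s-netperf --help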
4 changes: 2 additions & 2 deletions scripts/common.sh
@@ -33,14 +33,14 @@ output_info() {
   _argo_password=$(kubectl get secret/argocd-initial-admin-secret -n argocd -o jsonpath='{.data.password}' | base64 --decode)
 
   printf "\n\n ArgoCD Configs"
-  printf "\n Host: $_argo_url \n User: $_argo_user \n Password: $_argo_password"
+  printf "\n Host: https://$_argo_url \n User: $_argo_user \n Password: $_argo_password"
 
   _airflow_url=$(oc get route/airflow -o jsonpath='{.spec.host}' -n $_airflow_ns)
   _airflow_user="admin"
   _airflow_password="REDACTED"
 
   printf "\n\n Airflow Configs (Password was user defined so this script doesn't know it!)"
-  printf "\n Host: $_airflow_url \n User: $_airflow_user \n Password: $_airflow_password\n\n"
+  printf "\n Host: https://$_airflow_url \n User: $_airflow_user \n Password: $_airflow_password\n\n"
 
   _results_dashboard_url=$(oc get route/perf-dashboard -o jsonpath='{.spec.host}' -n dashboard)
   if [ -z "$_results_dashboard_url" ]; then
6 changes: 4 additions & 2 deletions scripts/playground/templates/airflow.yaml
@@ -22,8 +22,8 @@ spec:
   releaseName: airflow
   values: |-
     defaultAirflowRepository: quay.io/cloud-bulldozer/airflow
-    defaultAirflowTag: 2.3.2
-    airflowVersion: 2.3.2
+    defaultAirflowTag: 2.6.2
+    airflowVersion: 2.6.2
     executor: KubernetesExecutor
     createUserJob:
       useHelmHooks: false
@@ -107,6 +107,8 @@ spec:
     name: airflow-webserver
     weight: 100
   wildcardPolicy: None
+  tls:
+    termination: edge
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
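`termination: edge` has the OpenShift router terminate TLS and forward plain HTTP to the Airflow webserver pod — which is also why `common.sh` above now prefixes the printed route hosts with `https://`. One way to confirm the route picked it up (namespace assumed):

    oc get route/airflow -n airflow -o jsonpath='{.spec.tls.termination}'   # expect: edge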
4 changes: 2 additions & 2 deletions scripts/tenant/templates/airflow.yaml
@@ -23,8 +23,8 @@ spec:
   releaseName: airflow
   values: |-
     defaultAirflowRepository: quay.io/cloud-bulldozer/airflow
-    defaultAirflowTag: 2.3.2
-    airflowVersion: 2.3.2
+    defaultAirflowTag: 2.6.2
+    airflowVersion: 2.6.2
     executor: KubernetesExecutor
     images:
       airflow:
