diff --git a/README.md b/README.md index f944260..663a748 100644 --- a/README.md +++ b/README.md @@ -40,11 +40,8 @@ Enable the following if troubleshooting an issue for the following subsystems: - `--diagnostic-all` ### Manager - `--diagnostic-manager` -> **Note**: To use this option make sure to download either the `crunchy_gather.py` script or the `edb_mustgather.py` script, and make it executable with `chmod +x ` and place in the same directory as the postmortem script. - `--collect-crunchy`
-> **Note**: To use this option make sure to download the `crunchy_gather.py` script, make it executable with `chmod +x crunchy_gather.py` and place in the same directory as the postmortem script. - `--collect-edb`
-> **Note**: To use this option make sure to download the `edb_mustgather.py` script, make it executable with `chmod +x edb_mustgather.py` and place in the same directory as the postmortem script. ### Gateway `--diagnostic-gateway` > **Note**: In order for this switch to function, make sure connections to `127.0.0.1` are not restricted on the local machine. diff --git a/generate_postmortem.sh b/generate_postmortem.sh index fa5c428..a9944f8 100755 --- a/generate_postmortem.sh +++ b/generate_postmortem.sh @@ -190,26 +190,10 @@ for switch in $@; do ;; *"--collect-crunchy"*) COLLECT_CRUNCHY=1 - SCRIPT_LOCATION="`pwd`/crunchy_gather.py" - if [[ ! -f $SCRIPT_LOCATION ]]; then - echo -e "Unable to locate script [crunchy_gather.py] in current directory. Download from GitHub repository. Exiting..." - exit 1 - fi - warn_if_script_is_not_latest crunchy_gather.py https://raw.githubusercontent.com/ibm-apiconnect/v10-postmortem/master/crunchy_gather.py - chmod +x $SCRIPT_LOCATION ;; *"--collect-edb"*) COLLECT_EDB=1 - is_kubectl_cnp_plugin - - SCRIPT_LOCATION="`pwd`/edb_mustgather.sh" - if [[ ! -f $SCRIPT_LOCATION ]]; then - echo -e "Unable to locate script [edb_mustgather.sh] in current directory. Download from GitHub repository. Exiting..." - exit 1 - fi - warn_if_script_is_not_latest edb_mustgather.sh https://raw.githubusercontent.com/ibm-apiconnect/v10-postmortem/master/edb_mustgather.sh - chmod +x $SCRIPT_LOCATION ;; *"--version"*) print_postmortem_version @@ -257,15 +241,9 @@ if [[ $NOT_DIAG_MANAGER -eq 0 ]]; then EDB_CLUSTER_NAME=$($KUBECTL get cluster --all-namespaces -o=jsonpath='{.items[0].metadata.name}' 2>/dev/null) if [[ -z "$EDB_CLUSTER_NAME" ]]; then COLLECT_CRUNCHY=1 - SCRIPT_LOCATION="`pwd`/crunchy_gather.py" else COLLECT_EDB=1 is_kubectl_cnp_plugin - SCRIPT_LOCATION="`pwd`/edb_mustgather.sh" - fi - if [[ ! -f $SCRIPT_LOCATION ]]; then - echo -e "Unable to locate script ${SCRIPT_LOCATION} in current directory. Download from GitHub repository. Exiting..." - exit 1 fi fi @@ -322,6 +300,835 @@ cat << EOF > $1 EOF } + +function gatherEdbOperatorData() { + $KUBECTL cnp report operator --logs -n ${EDB_OP_NAMESPACE} -f ${SPECIFIC_NS_EDB_OP}/operator-report.zip + + for pod in $PG_OP + do + mkdir ${OPERATOR_PODS}/${pod} + $KUBECTL get po ${pod} -o yaml -n ${EDB_OP_NAMESPACE} > ${OPERATOR_PODS}/${pod}/pod.yaml + $KUBECTL describe pod ${pod} -n ${EDB_OP_NAMESPACE} > ${OPERATOR_PODS}/${pod}/describe.txt + $KUBECTL logs ${pod} -n ${EDB_OP_NAMESPACE} > ${OPERATOR_PODS}/${pod}/logs.txt + $KUBECTL logs ${pod} -n ${EDB_OP_NAMESPACE} --previous 2>/dev/null > ${OPERATOR_PODS}/${pod}/previous-logs.txt + done +} + +function gatherClusterData() { + $KUBECTL cnp status ${EDB_CLUSTER_NAME} -n ${EDB_CLUSTER_NAMESPACE} > ${CLUSTER}/${EDB_CLUSTER_NAME}/status.txt + $KUBECTL cnp status ${EDB_CLUSTER_NAME} --verbose -n ${EDB_CLUSTER_NAMESPACE} > ${CLUSTER}/${EDB_CLUSTER_NAME}/status-verbose.txt + + $KUBECTL get cluster ${EDB_CLUSTER_NAME} -n ${EDB_CLUSTER_NAMESPACE} > ${CLUSTER}/${EDB_CLUSTER_NAME}/info.txt + $KUBECTL get cluster ${EDB_CLUSTER_NAME} -o yaml -n ${EDB_CLUSTER_NAMESPACE} > ${CLUSTER}/${EDB_CLUSTER_NAME}/cluster.yaml + $KUBECTL describe cluster ${EDB_CLUSTER_NAME} -n ${EDB_CLUSTER_NAMESPACE} > ${CLUSTER}/${EDB_CLUSTER_NAME}/describe.txt + +} + +function gatherEDBPodData() { + $KUBECTL cnp report cluster ${EDB_CLUSTER_NAME} --logs -n ${EDB_CLUSTER_NAMESPACE} -f ${SPECIFIC_NS_CLUSTER}/cluster-report.zip + + $KUBECTL get pod -l k8s.enterprisedb.io/cluster=${EDB_CLUSTER_NAME} -L role -n ${EDB_CLUSTER_NAMESPACE} > ${CLUSTER_PODS}/pods.txt + for pod in ${EDB_POD_NAMES}; do + mkdir ${CLUSTER_PODS}/${pod} + $KUBECTL get po ${pod} -o yaml -n ${EDB_CLUSTER_NAMESPACE} > ${CLUSTER_PODS}/${pod}/pod.yaml + $KUBECTL describe pod ${pod} -n ${EDB_CLUSTER_NAMESPACE} > ${CLUSTER_PODS}/${pod}/describe.txt + $KUBECTL logs ${pod} -n ${EDB_CLUSTER_NAMESPACE} > ${CLUSTER_PODS}/${pod}/logs.txt + $KUBECTL logs ${pod} -n ${EDB_CLUSTER_NAMESPACE} --previous 2>/dev/null > ${CLUSTER_PODS}/${pod}/previous-logs.txt + $KUBECTL logs ${pod} -n ${EDB_CLUSTER_NAMESPACE} | jq -r '.record | select(.error_severity == "FATAL")' > ${CLUSTER_PODS}/${pod}/logs-fatal.txt + done +} + +function gatherEDBBackupData() { + $KUBECTL get backups -n ${EDB_CLUSTER_NAMESPACE} -o=jsonpath='{.items[?(@.spec.cluster.name=="'${EDB_CLUSTER_NAME}'")]}' -o wide > ${CLUSTER_BACKUPS}/${EDB_CLUSTER_NAME}/backups.txt + for backup in ${EDB_BACKUP_NAMES}; do + mkdir ${CLUSTER_BACKUPS}/${EDB_CLUSTER_NAME}/${backup} + $KUBECTL get backups ${backup} -o yaml -n ${EDB_CLUSTER_NAMESPACE} > ${CLUSTER_BACKUPS}/${EDB_CLUSTER_NAME}/${backup}/backup.yaml + $KUBECTL describe backups ${backup} -n ${EDB_CLUSTER_NAMESPACE} > ${CLUSTER_BACKUPS}/${EDB_CLUSTER_NAME}/${backup}/describe.txt + done +} + +function gatherEDBScheduledBackupData() { + $KUBECTL -n ${EDB_CLUSTER_NAMESPACE} get scheduledbackups -o=jsonpath='{.items[?(@.spec.cluster.name=="'${EDB_CLUSTER_NAME}'")]}' -o wide > ${CLUSTER_SCHEDULED_BACKUPS}/${EDB_CLUSTER_NAME}/scheduledbackups.txt + for scheduledbackup in ${EDB_SCHEDULED_BACKUP_NAMES}; do + mkdir ${CLUSTER_SCHEDULED_BACKUPS}/${EDB_CLUSTER_NAME}/${scheduledbackup} + $KUBECTL -n ${EDB_CLUSTER_NAMESPACE} get scheduledbackups ${scheduledbackup} -o yaml > ${CLUSTER_SCHEDULED_BACKUPS}/${EDB_CLUSTER_NAME}/${scheduledbackup}/backup.yaml + $KUBECTL -n ${EDB_CLUSTER_NAMESPACE} describe scheduledbackups ${scheduledbackup} > ${CLUSTER_SCHEDULED_BACKUPS}/${EDB_CLUSTER_NAME}/${scheduledbackup}/describe.txt + done +} + +function collectEDB { + EDB_OP_NAMESPACE='' + PG_OP='' + ARCHITECTURE=$($KUBECTL get nodes -o jsonpath='{.items[0].status.nodeInfo.architecture}') + + if [ "$ARCHITECTURE" = 's390x' ]; then + EDB_OP_NAMESPACE='ibm-common-services' + PG_OP=$($KUBECTL get po -n ${EDB_OP_NAMESPACE} -o=custom-columns=NAME:.metadata.name | grep postgresql-operator-controller-manager) + else + EDB_OP_NAMESPACE=$EDB_CLUSTER_NAMESPACE + PG_OP=$($KUBECTL get po -n ${EDB_OP_NAMESPACE} -o=custom-columns=NAME:.metadata.name | grep -e edb-operator -e postgresql-operator-controller-manager) + fi + + MGMT_CR_NAME=$($KUBECTL get mgmt -n ${EDB_CLUSTER_NAMESPACE} -o=jsonpath='{.items[0].metadata.name}' 2>/dev/null) + EDB_CLUSTER_NAME=$($KUBECTL get cluster -n ${EDB_CLUSTER_NAMESPACE} -o=jsonpath='{.items[?(@.metadata.ownerReferences[0].name=="'${MGMT_CR_NAME}'")].metadata.name}' 2>/dev/null) + EDB_POD_NAMES=$($KUBECTL get pod -l k8s.enterprisedb.io/cluster=${EDB_CLUSTER_NAME} -L role -n ${EDB_CLUSTER_NAMESPACE} -o=custom-columns=NAME:.metadata.name --no-headers) + EDB_BACKUP_NAMES=$($KUBECTL get backups -o=jsonpath='{.items[?(@.spec.cluster.name=="'${EDB_CLUSTER_NAME}'")]}' -L role -n ${EDB_CLUSTER_NAMESPACE} -o=custom-columns=NAME:.metadata.name --no-headers) + EDB_SCHEDULED_BACKUP_NAMES=$($KUBECTL get scheduledBackups -o=jsonpath='{.items[?(@.spec.cluster.name=="'${EDB_CLUSTER_NAME}'")]}' -n ${EDB_CLUSTER_NAMESPACE} -o=custom-columns=NAME:.metadata.name --no-headers) + K8S_DATA="${TEMP_PATH}/kubernetes" + K8S_NAMESPACES="${K8S_DATA}/namespaces" + K8S_NAMESPACES_SPECIFIC="${K8S_NAMESPACES}/${EDB_OP_NAMESPACE}" + K8S_NAMESPACES_EDB_DATA="${K8S_NAMESPACES_SPECIFIC}/edb" + K8S_NAMESPACES_POD_DATA="${K8S_NAMESPACES_SPECIFIC}/pods" + K8S_NAMESPACES_POD_DESCRIBE_DATA="${K8S_NAMESPACES_POD_DATA}/describe" + K8S_NAMESPACES_POD_LOG_DATA="${K8S_NAMESPACES_POD_DATA}/logs" + NS=${LOG_PATH}/namespaces + SPECIFIC_NS_EDB_OP=${NS}/${EDB_OP_NAMESPACE} + SPECIFIC_NS_CLUSTER=${NS}/${EDB_CLUSTER_NAMESPACE} + OPERATOR_PODS=${SPECIFIC_NS_EDB_OP}/pods + CLUSTER=${SPECIFIC_NS_CLUSTER}/cluster + CLUSTER_PODS=${SPECIFIC_NS_CLUSTER}/pods + CLUSTER_BACKUPS=${SPECIFIC_NS_CLUSTER}/backups + CLUSTER_SCHEDULED_BACKUPS=${SPECIFIC_NS_CLUSTER}/scheduledbackups + + + mkdir ${NS} + mkdir ${SPECIFIC_NS_EDB_OP} + mkdir -p ${SPECIFIC_NS_CLUSTER} + mkdir ${OPERATOR_PODS} + mkdir ${CLUSTER} + mkdir -p ${CLUSTER}/${EDB_CLUSTER_NAME} + mkdir -p ${CLUSTER_PODS} + mkdir ${CLUSTER_BACKUPS} + mkdir -p ${CLUSTER_BACKUPS}/${EDB_CLUSTER_NAME} + mkdir ${CLUSTER_SCHEDULED_BACKUPS} + mkdir -p ${CLUSTER_SCHEDULED_BACKUPS}/${EDB_CLUSTER_NAME} + + if [[ -n "$PG_OP" ]]; then + echo "Found EDB operator pod $PG_OP" + gatherEdbOperatorData + fi + + if [[ -n "$EDB_CLUSTER_NAME" ]]; then + echo "Found EDB Cluster $EDB_CLUSTER_NAME" + gatherClusterData + fi + + if [[ -n "$EDB_POD_NAMES" ]]; then + gatherEDBPodData + fi + + if [[ -n "$EDB_BACKUP_NAMES" ]]; then + gatherEDBBackupData + fi + + if [[ -n "$EDB_SCHEDULED_BACKUP_NAMES" ]]; then + gatherEDBScheduledBackupData + fi +} + +function collectCrunchy { + python3 - << EOF -n $NAMESPACE -l 7 -c $KUBECTL -o $K8S_NAMESPACES_CRUNCHY_DATA &> "${K8S_NAMESPACES_CRUNCHY_DATA}/crunchy-collect.log" +""" +Copyright 2017 - 2021 Crunchy Data +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Crunchy kubernetes support dump script + +Original Author: Pramodh Mereddy + +Description: + This script collects kubernetes objects, logs and other metadata from + the objects corresponding to Crunchydata container solution + NOTE: secrets are data are NOT collected + +Pre-requisites: + 1. Valid login session to your kubernetes cluster + 2. kubectl or oc CLI in your PATH + +Example: + ./crunchy_gather_k8s_support_dump.py -n pgdb -o $HOME/dumps/crunchy/pgdb + +Arguments: + -n: namespace or project name + -o: directory to create the support dump in + -l: number of pg_log files to save +""" + +import argparse +import logging +import os +import subprocess +import sys +import tarfile +import posixpath +import time +from collections import OrderedDict + +if sys.version_info[0] < 3: + print("Python 3 or a more recent version is required.") + sys.exit() + +# Local Script Version +# Update for each release +__version__ = "v1.0.2" + + +class Options(): # pylint: disable=too-few-public-methods + """ + class for globals + """ + def __init__(self, dest_dir, namespace, kube_cli, pg_logs_count): + self.dest_dir = dest_dir + self.namespace = namespace + self.kube_cli = kube_cli + self.pg_logs_count = pg_logs_count + self.delete_dir = False + self.output_dir = "" + self.dir_name = (f"crunchy_{time.strftime('%Y-%m-%d-%H%M%S')}") + + +OPT = Options("", "", "kubectl", 2) + + +MAX_ARCHIVE_EMAIL_SIZE = 25*1024*1024 # 25 MB filesize limit +logger = logging.getLogger("crunchy_support") # pylint: disable=locally-disabled, invalid-name + +API_RESOURCES = [ + "pods", + "ReplicaSet", + "StatefulSet", + "Deployment", + "Services", + "Routes", + "Ingress", + "pvc", + "configmap", + "networkpolicies", + "postgresclusters", + "pgreplicas", + "pgclusters", + "pgpolicies", + "pgtasks" +] + +CONTAINER_COMMANDS = { + 'collect': [], + 'exporter': [], + 'database': ["patronictl list", "patronictl history"], + 'pgbadger': [], + 'pgbackrest': [], + 'replication-cert-copy': [], + 'all': ["ps aux --width 500"] +} + + +def run(): + """ + Main function to collect support dump + """ + + logger.info("Saving support dump files in %s", OPT.output_dir) + + collect_current_time() + collect_script_version() + collect_kube_version() + collect_node_info() + collect_namespace_info() + collect_events() + collect_pvc_list() + collect_configmap_list() + collect_pods_describe() + collect_api_resources() + collect_pg_logs() + collect_pods_logs() + collect_pg_pod_details() + archive_files() + + +def collect_current_time(): + """ + function to collect the time which the Support Dump was + captured, so that Events and other relative-time items could + be easily correlated + """ + cmd = "date" + logger.debug("collecting current timestamp info: %s", cmd) + collect_helper(cmd, file_name="timestamp.info", resource_name="timestamp info") + + +def collect_kube_version(): + """ + function to gather kubernetes version information + """ + cmd = OPT.kube_cli + " version " + logger.debug("collecting kube version info: %s", cmd) + collect_helper(cmd, file_name="k8s-version.info", resource_name="Platform Version info") + + +def collect_script_version(): + """ + function to gather script version, allow us to determine + if the tool is out of date + """ + cmd = "echo Support Dump Tool: " + __version__ + logger.debug("collecting support dump tool version info: %s", cmd) + collect_helper(cmd, file_name="dumptool-version.info", resource_name="Support Dump Tool version info") + + +def collect_node_info(): + """ + function to gather kubernetes node information + """ + cmd = OPT.kube_cli + " get nodes -o wide " + logger.debug("collecting node info: %s", cmd) + collect_helper(cmd, file_name="nodes.info", resource_name="Node info") + + +def collect_namespace_info(): + """ + function to gather kubernetes namespace information + """ + if OPT.kube_cli == "oc": + cmd = OPT.kube_cli + " describe project " + OPT.namespace + else: + cmd = OPT.kube_cli + " get namespace -o yaml " + OPT.namespace + + logger.debug("collecting namespace info: %s", cmd) + collect_helper(cmd, file_name="namespace.yml", + resource_name="namespace-info") + + +def collect_pvc_list(): + """ + function to gather kubernetes PVC information + """ + cmd = OPT.kube_cli + " get pvc {}".format(get_namespace_argument()) + collect_helper(cmd, file_name="pvc.list", resource_name="pvc-list") + + +def collect_pvc_details(): + """ + function to gather kubernetes PVC details + """ + cmd = OPT.kube_cli + " get pvc -o yaml {}".format(get_namespace_argument()) + collect_helper(cmd, file_name="pvc.details", resource_name="pvc-details") + + +def collect_configmap_list(): + """ + function to gather configmap list + """ + cmd = OPT.kube_cli + " get configmap {}".format(get_namespace_argument()) + collect_helper(cmd, file_name="configmap.list", + resource_name="configmap-list") + + +def collect_configmap_details(): + """ + function to gather configmap details + """ + cmd = (OPT.kube_cli + + " get configmap -o yaml {}".format(get_namespace_argument())) + collect_helper(cmd, file_name="configmap.details", + resource_name="configmap-details") + + +def collect_events(): + """ + function to gather k8s events + """ + cmd = OPT.kube_cli + " get events {}".format(get_namespace_argument()) + collect_helper(cmd=cmd, file_name="events", resource_name="k8s events") + + +def collect_api_resources(): + """ + function to gather details on different k8s resources + """ + logger.info("Collecting API resources:") + resources_out = OrderedDict() + for resource in API_RESOURCES: + if OPT.kube_cli == "kubectl" and resource == "Routes": + continue + output = run_kube_get(resource) + if output: + resources_out[resource] = run_kube_get(resource) + logger.info(" + %s", resource) + + for entry, out in resources_out.items(): + with open(posixpath.join(OPT.output_dir, f"{entry}.yml"), "wb") as file_pointer: + file_pointer.write(out) + + +def collect_pods_describe(): + """ + function to gather k8s describe on the namespace pods + """ + cmd = OPT.kube_cli + " describe pods {}".format(get_namespace_argument()) + collect_helper(cmd=cmd, file_name="describe-pods", resource_name="pod describe") + + +def collect_pods_logs(): + """ + Collects all the pods logs from a given namespace + """ + logger.info("Collecting pod logs:") + logs_dir = posixpath.join(OPT.output_dir, "pod_logs") + os.makedirs(logs_dir) + + pods = get_pods_v4() + get_op_pod() + if not pods: + logger.debug("No Pods found, trying PGO V5 methods...") + pods = get_pods_v5() + get_op_pod() + if not pods: + logger.warning("Could not get pods list - skipping automatic pod logs collection") + logger.error("########") + logger.error("#### You will need to collect these pod logs manually ####") + logger.error("########") + logger.warning("»HINT: Was the correct namespace used?") + logger.debug("This error sometimes happens when labels have been modified") + return + + logger.info("Found and processing the following containers:") + for pod in pods: + containers = get_containers(pod) + if not containers: + logger.warning("Could not get pods list") + logger.warning("»HINT: Were the labels modified?") + logger.warning("»HINT: Was the correct namespace used?") + logger.error("########") + logger.error("#### You will need to collect these pod logs manually ####") + logger.error("########") + logger.debug("This error sometimes happens when labels have been modified") + return + for cont in containers: + container = cont.rstrip() + cmd = (OPT.kube_cli + " logs {} {} -c {}". + format(get_namespace_argument(), pod, container)) + with open("{}/{}_{}.log".format(logs_dir, pod, + container), "wb") as file_pointer: + handle = subprocess.Popen(cmd, shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + while True: + line = handle.stdout.readline() + if line: + file_pointer.write(line) + else: + break + logger.info(" + pod:%s, container:%s", pod, container) + + +def collect_pg_pod_details(): + """ + Collects PG pods details + """ + logger.info("Collecting PG pod details:") + logs_dir = posixpath.join(OPT.output_dir, "pg_pod_details") + os.makedirs(logs_dir) + + pods = get_pg_pods_v4() + if not pods: + logger.debug("No Pods found, trying PGO V5 methods...") + pods = get_pg_pods_v5() + if not pods: + logger.warning("Could not get pods list - skipping PG pod details collection") + logger.error("########") + logger.error("#### You will need to collect Postgres pod logs manually ####") + logger.error("########") + logger.warning("»HINT: Was the correct namespace used?") + logger.debug("This error sometimes happens when labels have been modified") + return + + logger.info("Found and processing the following containers:") + for pod in pods: + containers = get_containers(pod) + for cont in containers: + container = cont.rstrip() + with open("{}/{}_{}.log".format(logs_dir, pod, + container), "ab+") as file_pointer: + for command in (CONTAINER_COMMANDS['all'] + + CONTAINER_COMMANDS[container]): + cmd = (OPT.kube_cli + " exec -it {} -c {} {} -- " + "/bin/bash -c '{}'" + .format(get_namespace_argument(), + container, pod, command)) + handle = subprocess.Popen(cmd, shell=True, + stdout=file_pointer.fileno(), + stderr=file_pointer.fileno()) + try: + out=handle.communicate(timeout=60) + except subprocess.TimeoutExpired: + logger.warning("The output for " + cmd + " was not captured due to timeout") + handle.kill() + logger.info(" + pod:%s, container:%s", pod, container) + + +def collect_pg_logs(): + """ + Collects PG database server logs + """ + logger.info("Collecting last %s PG logs " + "(may take a while)", OPT.pg_logs_count) + logs_dir = posixpath.join(OPT.output_dir, "pg_logs") + os.makedirs(logs_dir) + pods = get_pg_pods_v4() + if not pods: + logger.debug("No Pods found, trying PGO V5 methods...") + pods = get_pg_pods_v5() + if not pods: + logger.warning("Could not get pods list - skipping pods logs collection") + logger.error("########") + logger.error("#### You will need to collect these Postgres logs manually ####") + logger.error("########") + logger.warning("»HINT: Was the correct namespace used?") + logger.debug("This error sometimes happens when labels have been modified") + return + + logger.info("Found and processing the following containers:") + for pod in pods: + tgt_file = "{}/{}".format(logs_dir, pod) + os.makedirs(tgt_file) + # print("OPT.pg_logs_count: ", OPT.pg_logs_count) + cmd = (OPT.kube_cli + + " exec -it {} -c database {} -- /bin/bash -c" + " 'ls -1dt /pgdata/*/pglogs/* | head -{}'" + .format(get_namespace_argument(), pod, OPT.pg_logs_count)) + # print(cmd) + handle = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + while True: + line = handle.stdout.readline() + if line: + cmd = (OPT.kube_cli + + " cp -c database {} {}:{} {}" + .format(get_namespace_argument(), + pod, line.rstrip().decode('UTF-8'), + tgt_file + line.rstrip().decode('UTF-8'))) + handle2 = subprocess.Popen(cmd, shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + handle2.wait() + else: + break + logger.info(" + pod:%s", pod) + + +def sizeof_fmt(num, suffix="B"): + """ + Formats the file size in a human-readable format + Probably overkill to go to Zi range, but reusable + """ + for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]: + if abs(num) < 1024.0: + return f"{num:3.1f}{unit}{suffix}" + num /= 1024.0 + return f"{num:.1f}Yi{suffix}" + + +def archive_files(): + """ + Create an archive and compress it + """ + archive_file_size = 0 + file_name = OPT.output_dir + ".tar.gz" + + with tarfile.open(file_name, "w|gz") as tar: + tar.add(OPT.output_dir, arcname=OPT.dir_name) + logger.info("") + + # Let user choose to delete the files manually + + if OPT.delete_dir: + rtn, out = run_shell_command(f"rm -rf {OPT.output_dir}") + if rtn: + logger.warning('Failed to delete directory after archiving: %s', + out) + logger.info("support dump files saved at %s", OPT.output_dir) + try: + archive_file_size = os.stat(file_name).st_size + logger.info("┌──────────────────────────────────────────────────────────────────-") + logger.info("│ Archive file saved to: %s ", file_name) + if archive_file_size > MAX_ARCHIVE_EMAIL_SIZE: + logger.info("│ Archive file (%d) may be too big to email.", + sizeof_fmt(archive_file_size)) + logger.info("│ Please request file share link by" + " emailing support@crunchydata.com") + else: + logger.info("│ Archive file size: %s ", sizeof_fmt(archive_file_size)) + logger.info("│ Email the support dump to support@crunchydata.com") + logger.info("│ or attach as a email reply to your existing Support Ticket") + logger.info("└──────────────────────────────────────────────────────────────────-") + except (OSError, ValueError) as e: # pylint: disable=invalid-name + logger.warning("Archive file size: NA --- %s", e) + + +def get_pods_v4(): + """ + Returns list of pods names, all pods + """ + cmd = (OPT.kube_cli + " get pod {} -lvendor=crunchydata " + "-o=custom-columns=NAME:.metadata.name " + "--no-headers".format(get_namespace_argument())) + return_code, out = run_shell_command(cmd) + if return_code == 0: + return out.decode("utf-8").split("\n")[:-1] + logger.warning("Failed to get pods: %s", out) + return None + + +def get_pods_v5(): + """ + Returns list of pods names, all pods + """ + cmd = (OPT.kube_cli + " get pod {} " + "-lpostgres-operator.crunchydata.com/cluster " + "-o=custom-columns=NAME:.metadata.name " + "--no-headers".format(get_namespace_argument())) + return_code, out = run_shell_command(cmd) + if return_code == 0: + return out.decode("utf-8").split("\n")[:-1] + logger.warning("Failed to get pods: %s", out) + return None + + +def get_op_pod(): + """ + Returns just the operator pod + """ + cmd = (OPT.kube_cli + " get pod {} " + "-lapp.kubernetes.io/name=postgres-operator " + "-o=custom-columns=NAME:.metadata.name " + "--no-headers".format(get_namespace_argument())) + return_code, out = run_shell_command(cmd) + if return_code == 0: + return out.decode("utf-8").split("\n")[:-1] + logger.warning("Failed to get pods: %s", out) + return None + + +def get_pg_pods_v4(): + """ + Returns list of pods names, only DB pods + """ + cmd = (OPT.kube_cli + " get pod {} " + "-lpgo-pg-database=true,vendor=crunchydata " + "-o=custom-columns=NAME:.metadata.name " + "--no-headers".format(get_namespace_argument())) + return_code, out = run_shell_command(cmd) + if return_code == 0: + return out.decode("utf-8").split("\n")[:-1] + logger.warning("Failed to get pods: %s", out) + return None + + +def get_pg_pods_v5(): + """ + Returns list of pods names, only DB pods + """ + cmd = (OPT.kube_cli + " get pod {} " + "-lpostgres-operator.crunchydata.com/cluster " + "-o=custom-columns=NAME:.metadata.name " + "--no-headers".format(get_namespace_argument())) + return_code, out = run_shell_command(cmd) + if return_code == 0: + return out.decode("utf-8").split("\n")[:-1] + logger.warning("Failed to get pods: %s", out) + return None + + +def get_containers(pod_name): + """ + Returns list of containers in a pod + """ + cmd = (OPT.kube_cli + " get pods {} {} --no-headers " + "-o=custom-columns=CONTAINERS:.spec.containers[*].name" + .format(get_namespace_argument(), pod_name)) + return_code, out = run_shell_command(cmd) + if return_code == 0: + return out.decode("utf-8").split(",") + logger.warning("Failed to get pods: %s", out) + return None + + +def get_namespace_argument(): + """ + Returns namespace option for kube cli + """ + if OPT.namespace: + return "-n {}".format(OPT.namespace) + return "" + + +def collect_helper(cmd, file_name, resource_name): + """ + helper function to gather data + """ + return_code, out = run_shell_command(cmd) + if return_code: + logger.warning("Error when running %s: %s", cmd, out.decode('utf-8').rstrip()) + return + path = posixpath.join(OPT.output_dir, file_name) + with open(path, "wb") as file_pointer: + file_pointer.write(out) + logger.info("Collected %s", resource_name) + + +def run_shell_command(cmd, log_error=True): + """ + Returns a tuple of the shell exit code, output + """ + try: + output = subprocess.check_output( + cmd, + shell=True, + stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as ex: + if log_error: + logger.debug("Failed in shell command: %s, output: %s", + cmd, ex.output.decode('utf-8').rstrip()) + logger.debug("This is probably fine; an item which doesn't exist in v4/v5") + return ex.returncode, ex.output + + return 0, output + + +def run_kube_get(resource_type): + """ + Returns a tuple of the shell exit code, and kube cli get output + """ + cmd = OPT.kube_cli + " get {} {} -o yaml".format(resource_type, + get_namespace_argument()) + return_code, out = run_shell_command(cmd) + if return_code == 0: + return out + logger.debug("Failed to get %s resource: %s. Resource may not exist", + resource_type, + out.decode('utf-8').rstrip()) + logger.debug("This is probably fine; an item which doesn't exist in v4/v5") + return None + + +def get_kube_cli(): + """ + Determine which kube CLI to use + """ + cmd = "which oc" + return_code, _ = run_shell_command(cmd, False) + if return_code == 0: + return "oc" + + cmd = "which kubectl" + return_code, _ = run_shell_command(cmd, False) + if return_code == 0: + return "kubectl" + logger.error("kubernetes CLI not found") + sys.exit() + + +def check_kube_access(): + """ + Check if the user has access to kube cluster + """ + if OPT.kube_cli == "oc": + cmd = "oc whoami" + else: + cmd = "kubectl cluster-info" + + return_code, _ = run_shell_command(cmd) + return return_code + + +if __name__ == "__main__": + allowed_cli = ("kubectl", "oc") + + parser = argparse.ArgumentParser(description='Crunchy support dump' + 'collector', add_help=True) + + namedArgs = parser.add_argument_group('Named arguments') + namedArgs.add_argument('-n', '--namespace', required=True, + action="store", type=str, + help='kubernetes namespace to dump') + namedArgs.add_argument('-o', '--dest_dir', required=True, + action="store", type=str, + help='path to save dump tarball') + namedArgs.add_argument('-l', '--pg_logs_count', required=False, + action="store", type=int, default=2, + help='number of pg_log files to save') + namedArgs.add_argument('-d', '--delete_dir', required=False, + action="store_true", + help='delete the temporary working directory') + namedArgs.add_argument('-c', '--client_program', required=False, + type=str, action="store", + help='client program. valid options: ' + + str(allowed_cli)) + + results = parser.parse_args() + OPT.namespace = results.namespace + OPT.dest_dir = results.dest_dir + OPT.pg_logs_count = results.pg_logs_count + OPT.delete_dir = results.delete_dir + + # Initialize the target for logging and file collection + if OPT.dest_dir: + OPT.output_dir = posixpath.join(OPT.dest_dir, OPT.dir_name) + else: + OPT.output_dir = (posixpath.join(posixpath.abspath(__file__), + OPT.dir_name)) + + try: + os.makedirs(OPT.output_dir) + except OSError as error: + print(error) + + # Log everything to the file, only info+ to stdout + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler(f"{OPT.output_dir}/dumptool.log"), + ] + ) + console = logging.StreamHandler() + console.setLevel(logging.INFO) + logging.getLogger('').addHandler(console) + + logger.info("┌────────────────────────────────────────────────────────────────────────────-") + logger.info("│ Crunchy Support Dump Collector") + logger.info("│ NOTE: This tool gathers metadata and pod logs only.") + logger.info("│ (No data or k8s secrets)") + logger.info("└────────────────────────────────────────────────────────────────────────────-") + + if results.client_program is not None: + if results.client_program in allowed_cli: + OPT.kube_cli = results.client_program + else: + logger.error("Invalid optional client program " + "argument: %s. Valid choices: %s.", + results.client_program, + str(allowed_cli)) + sys.exit() + else: + OPT.kube_cli = get_kube_cli() + + if check_kube_access() != 0: + logger.error("Not connected to kubernetes cluster") + sys.exit() + + run() +EOF +} + #------------------------------------------------------------------------------------------------------ #------------------------------------------- Set variables -------------------------------------------- @@ -1148,12 +1955,14 @@ for NAMESPACE in $NAMESPACE_LIST; do #grab crunchy mustgather if [[ $COLLECT_CRUNCHY -eq 1 && "$NAMESPACE" != "kube-system" ]]; then - $CURRENT_PATH/crunchy_gather.py -n $NAMESPACE -l 5 -c $KUBECTL -o $K8S_NAMESPACES_CRUNCHY_DATA &> "${K8S_NAMESPACES_CRUNCHY_DATA}/crunchy-collect.log" + collectCrunchy fi #grab edb mustgather if [[ $COLLECT_EDB -eq 1 && "$NAMESPACE" != "kube-system" ]]; then - $CURRENT_PATH/edb_mustgather.sh $NAMESPACE $K8S_NAMESPACES_EDB &> "${K8S_NAMESPACES_EDB}/edb-collect.log" + EDB_CLUSTER_NAMESPACE=$NAMESPACE + LOG_PATH=$K8S_NAMESPACES_EDB + collectEDB fi #grab apicops mustgather @@ -1418,18 +2227,13 @@ for NAMESPACE in $NAMESPACE_LIST; do #grab postgres data if [[ $NOT_DIAG_MANAGER -eq 0 && $COLLECT_CRUNCHY -eq 1 && "$status" == "Running" && "$pod" == *"postgres"* && ! "$pod" =~ (backrest|pgbouncer|stanza|operator|backup) ]]; then echo "Collecting manager diagnostic data..." - target_dir="${K8S_NAMESPACES_POD_DIAGNOSTIC_DATA}/postgres/${pod}-pglogs" health_dir="${K8S_NAMESPACES_POD_DIAGNOSTIC_DATA}/postgres/${pod}-health-stats" - mkdir -p $target_dir mkdir -p $health_dir POSTGRES_PGLOGS_NAME=`$KUBECTL exec -n $NAMESPACE ${pod} -- ls -1 /pgdata 2>"/dev/null" | grep -v lost 2>"/dev/null"` POSTGRES_PGWAL_NAME=`$KUBECTL exec -n $NAMESPACE ${pod} -- ls -1 /pgwal 2>"/dev/null" | grep -v lost 2>"/dev/null"` - #pglogs - $KUBECTL cp -n $NAMESPACE "${pod}:/pgdata/${POSTGRES_PGLOGS_NAME}/pglogs" $target_dir &>/dev/null - #df DB_DF_OUTPUT=`$KUBECTL exec -n $NAMESPACE ${pod} -c database -- df -h 2>"/dev/null"` echo "$DB_DF_OUTPUT" > $health_dir/df.out @@ -1503,6 +2307,7 @@ for NAMESPACE in $NAMESPACE_LIST; do PG_BACKREST_REPO_POD=$($KUBECTL -n "$NAMESPACE" get po -lpgo-backrest-repo=true,vendor=crunchydata -o=custom-columns=NAME:.metadata.name --no-headers) if [[ $NOT_DIAG_MANAGER -eq 0 && $COLLECT_CRUNCHY -eq 1 && "$status" == "Running" && "$pod" == "$PG_BACKREST_REPO_POD" ]]; then + echo "Collecting manager diagnostic data..." target_dir="${K8S_NAMESPACES_POD_DIAGNOSTIC_DATA}/postgres/${pod}" mkdir -p "$target_dir"