From 026e4e3a7a34e1c990d720c87573fa23debef815 Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Sat, 16 Sep 2023 15:23:57 +1000 Subject: [PATCH 01/14] qa/common.config: QA Farm network address changes --- qa/common.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/qa/common.config b/qa/common.config index 1851bb9343..efe65eaca2 100644 --- a/qa/common.config +++ b/qa/common.config @@ -99,10 +99,10 @@ in unset DISPLAY fi fi - # X11 server is on vm36 (ipaddr 192.168.178.236) - PCPQA_CLOSE_X_SERVER=${DISPLAY-192.168.178.236:0} - # currently not working on bozo (ipaddr 192.168.178.100) - PCPQA_SOCKS_SERVER=192.168.178.100 + # X11 server is on vm36 (ipaddr 192.168.20.236) + PCPQA_CLOSE_X_SERVER=${DISPLAY-192.168.20.236:0} + # currently not working on bozo (ipaddr 192.168.20.100) + PCPQA_SOCKS_SERVER=192.168.20.100 ;; # add settings here for your host if the domain is From 30892a47966a0986eb8b8a259d5f6841196e434b Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Sun, 24 Sep 2023 09:21:21 +1000 Subject: [PATCH 02/14] qa/admin/package-lists/Ubuntu+18.04+i686: add missing libncursesw package --- qa/admin/package-lists/Ubuntu+18.04+i686 | 1 + 1 file changed, 1 insertion(+) diff --git a/qa/admin/package-lists/Ubuntu+18.04+i686 b/qa/admin/package-lists/Ubuntu+18.04+i686 index a25b6de960..db648aebc5 100644 --- a/qa/admin/package-lists/Ubuntu+18.04+i686 +++ b/qa/admin/package-lists/Ubuntu+18.04+i686 @@ -36,6 +36,7 @@ iproute2 libavahi-common-dev libclass-dbi-perl libcmocka-dev +libncurses5-dev libdbd-mysql-perl libdbd-pg-perl libdevmapper-dev From a519785a32d671eae612436617cd5f3e1ad57add Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Sun, 24 Sep 2023 09:21:51 +1000 Subject: [PATCH 03/14] qa/870: extra diagnostic on error path --- qa/870 | 1 + 1 file changed, 1 insertion(+) diff --git a/qa/870 b/qa/870 index 8848ead879..2c509f4f75 100755 --- a/qa/870 +++ b/qa/870 @@ -40,6 +40,7 @@ _is_activating() \( -n "$PCP_SYSTEMDUNIT_DIR" -a -f 
$PCP_SYSTEMDUNIT_DIR/pmlogger.service \) ] then eval `systemctl show --property=ActiveState pmlogger.service` + echo "ActiveState=$ActiveState" >>$here/$seq.full [ "$ActiveState" != activating ] && return 0 fi return 1 From 8d891c6e7a4a08c00c74f252843ad1ec90b1e889 Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Tue, 26 Sep 2023 12:18:58 +1000 Subject: [PATCH 04/14] qa/src/ready-or-not.c: increase delays from 5msec to 10msec Non-deterministic failures on some platforms ... hope this fixes it. --- qa/src/ready-or-not.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qa/src/ready-or-not.c b/qa/src/ready-or-not.c index 7ecc950d0d..6c2b512855 100644 --- a/qa/src/ready-or-not.c +++ b/qa/src/ready-or-not.c @@ -83,7 +83,7 @@ smack(void) exit(1); } /* store NOTREADY-to-READY delay in msec */ - smack_rp->vset[0]->vlist[0].value.lval = 5; + smack_rp->vset[0]->vlist[0].value.lval = 10; } if ((sts = pmStore(smack_rp)) < 0 && sts != PM_ERR_AGAIN) { @@ -105,7 +105,7 @@ main(int argc, char **argv) pmDesc desc; pmResult *rp; pmHighResResult *hrp; - struct timeval delay = { 0, 5000 }; + struct timeval delay = { 0, 10000 }; /* 10msec pause */ pmSetProgname(argv[0]); From a18e5c9241764dafa80fb4b9cabc5eb137d54271 Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Tue, 26 Sep 2023 12:19:56 +1000 Subject: [PATCH 05/14] qa/common.filter: remove stray pmsignal warning Due to a rare race condition (i.e. narrow window), a process (particularly pmlogger) may exit between a ps and a pmsignal, leading to a "No such process" message. Since these are benign (in this case) filter them away in _filter_pcp_start(). --- qa/common.filter | 1 + 1 file changed, 1 insertion(+) diff --git a/qa/common.filter b/qa/common.filter index 18a41c163c..e286ac2adf 100644 --- a/qa/common.filter +++ b/qa/common.filter @@ -651,6 +651,7 @@ s/Performance Co-Pilot starting archive loggers .../Starting pmlogger ...
/ -e '/is not a native service, redirecting to .*chkconfig/d' \ -e '/^Executing .*chkconfig pm.* --level=5/d' \ -e '/Failed to create avahi client:/d' \ + -e '/pmsignal.* kill: [0-9][0-9]*: No such process/d' \ | _filter_pcp_start_distro \ | _filter_init_distro } From 38aca1a897655898a2c1ecbe11f9252d859e4e45 Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Tue, 26 Sep 2023 12:23:12 +1000 Subject: [PATCH 06/14] qa/1432: _notrun if no overhead PMDA installed ... like on *BSD. --- qa/1432 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/qa/1432 b/qa/1432 index c250fa11e8..3f3068b651 100755 --- a/qa/1432 +++ b/qa/1432 @@ -20,6 +20,8 @@ fi . ./common.filter . ./common.check +[ -d $PCP_PMDAS_DIR/overhead ] || _notrun "overhead PMDA not installed" + _cleanup() { cd $here From 703b8716caa964ce7c75e6c904913a1c391960a2 Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Tue, 26 Sep 2023 12:24:00 +1000 Subject: [PATCH 07/14] qa/1429: restart pmloggers This will kill off any stray pmloggers launched via control files used in this QA test. Without this, qa/1433 risked being a drive-by shooting victim when a pmlogger from qa/1429 was still running. --- qa/1429 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/qa/1429 b/qa/1429 index af64e16f4b..bd43f229b5 100755 --- a/qa/1429 +++ b/qa/1429 @@ -29,6 +29,8 @@ _cleanup() cd $here $sudo rm -rf $tmp $tmp.* $sudo mv $PCP_LOG_DIR/NOTICES.$seq $PCP_LOG_DIR/NOTICES + _service pcp restart >>$seq.full 2>&1 + _wait_for_pmlogger } status=0 # success is the default! From f44ba28d8f517a6e144f9e0a5d9445f3062856dd Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Tue, 26 Sep 2023 12:26:08 +1000 Subject: [PATCH 08/14] qa/1339: revert previous change, port 5670, ... are fine for *BSD Not sure what the original analysis for the change to port 95670 was, but the PMDA install failed with that tcp port on vm37 (OpenBSD 7.1), and reverting to 5670 allowed the test to pass multiple times.
So there is some unexplained timing issue or port re-use issue here which has not been fixed. --- qa/1339 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/1339 b/qa/1339 index aa6ee5b6b9..c75a70b85a 100755 --- a/qa/1339 +++ b/qa/1339 @@ -118,7 +118,7 @@ echo "=== socket Internet agent ===" | tee -a $here/$seq.full # pick two tcp ports that are not in use # -port1=`_find_free_port 95670` +port1=`_find_free_port 5670` if [ -z "$port1" ] then echo "Arrgh ... port1: no free TCP port in the range 5670 ... " From 5c98d787dd6e99830b7fb74ea44898f491d8a5b5 Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Tue, 26 Sep 2023 12:29:22 +1000 Subject: [PATCH 09/14] qa/102: dink with sleep delays Trying to avoid pmlogger getting a SIGTERM before pmlc is done. --- qa/102 | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/qa/102 b/qa/102 index bf45c0e3e3..0a964b08b8 100755 --- a/qa/102 +++ b/qa/102 @@ -96,10 +96,9 @@ do cat <$tmp.config log advisory on $delta sample.bin [100] End-of-File - # pmlogger -c $tmp.config -T 4 $tmp $sudo rm -f $tmp.* - _start_up_pmlogger -L -c /dev/null -l $tmp.log -T 4 $tmp - pmsleep 0.5 + _start_up_pmlogger -L -c /dev/null -l $tmp.log -T 6 $tmp + pmsleep 1 cat <$tmp.pmlc connect $pid log advisory on $delta sample.bin [100] From 3c0ed373fd756ff8760593dc0843adbe4b4d511a Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Tue, 26 Sep 2023 15:52:28 +1000 Subject: [PATCH 10/14] qa/src/ready-or-not.c: increase delay from 10msec to 20msec Was still failing on vm33 (OpenBSD 7.2) --- qa/src/ready-or-not.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qa/src/ready-or-not.c b/qa/src/ready-or-not.c index 6c2b512855..3872fab196 100644 --- a/qa/src/ready-or-not.c +++ b/qa/src/ready-or-not.c @@ -83,7 +83,7 @@ smack(void) exit(1); } /* store NOTREADY-to-READY delay in msec */ - smack_rp->vset[0]->vlist[0].value.lval = 10; + smack_rp->vset[0]->vlist[0].value.lval = 20; } if ((sts = pmStore(smack_rp)) < 0 && 
sts != PM_ERR_AGAIN) { @@ -105,7 +105,7 @@ main(int argc, char **argv) pmDesc desc; pmResult *rp; pmHighResResult *hrp; - struct timeval delay = { 0, 10000 }; /* 10msec pause */ + struct timeval delay = { 0, 20000 }; /* 20msec pause */ pmSetProgname(argv[0]); From 3a063ea96768c50d2026657d1be36161a81c4c3e Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Tue, 26 Sep 2023 15:53:21 +1000 Subject: [PATCH 11/14] qa/1339: OpenBSD problem solved (maybe) Seems like it is not the port number, it is the delay between installing the PMDA and probing the metrics ... setting $PCPQA_CHECK_DELAY to 2 (sec) seems to have resolved the issue. --- qa/1339 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/qa/1339 b/qa/1339 index c75a70b85a..2032bbb197 100755 --- a/qa/1339 +++ b/qa/1339 @@ -142,6 +142,10 @@ then netstat -a | grep $port2 >>$here/$seq.full fi +# Give the PMDA + internet socket a chance to be setup before pminfo check +# +export PCPQA_CHECK_DELAY=2 + echo 'socket Internet '$port1 | $sudo ./Install -e >$tmp.out 2>&1 From e6bf03d1c2a0745b821812957aa4be1f2f0abe06 Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Wed, 27 Sep 2023 12:11:01 +1000 Subject: [PATCH 12/14] qa/766: relax some pre-conditions This one was _notrun almost everywhere, and I don't understand why. - it uses "probe" mode for pmfind, so (avahi-style) discovery is not an issue - the algorithm for finding an active network interface was strange - the size of the subnet scanned in the comments and echo did not match the pmfind invocation Let's see how it goes now it is run in more places. --- qa/766 | 47 +++++++++++------------------------------------ 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/qa/766 b/qa/766 index 8156a1a621..db6420b00f 100755 --- a/qa/766 +++ b/qa/766 @@ -12,9 +12,6 @@ echo "QA output created by $seq" # get standard environment, filters and checks .
./common.discovery -_get_libpcp_config -$service_discovery || _notrun "No support for service discovery" - status=1 # failure is the default! $sudo rm -rf $tmp.* $seq.full trap "cd $here; rm -rf $tmp.*; exit \$status" 0 1 2 3 15 @@ -70,36 +67,14 @@ _error_tolerance_filter() # real QA test starts here _control_service_discovery -# Obtain the inet address of an active network interface -# We're extracting a metric of the form: -# -# inst [0 or "eth0"] value "172.31.0.12" -# -# and then extracting the address -# -echo >$tmp.tmp -pminfo -f network.interface.inet_addr | \ - tail -n +3 | \ - grep -v 127.0.0.1 | \ - awk '{ print $6 }' | \ - sed s/\"//g \ -| while read addr -do - if `host $addr 2>&1 | grep NXDOMAIN >/dev/null` - then - # continue, no DNS for this interface, may be partially configured - # DHCP - # - : - else - echo $addr >$tmp.tmp - break - fi -done - -addr=`cat $tmp.tmp` -[ -z "$addr" ] && _notrun "no active inet interfaces with DNS resolution" - +# Obtain the inet address of the primary network interface +addr=`host $(hostname) | sed -n -e '/has address/s/.* address //p'` +if [ -z "$addr" ] +then + host $(hostname) >>$seq.full 2>&1 + _notrun "no primary network interface?" + #NOTREACHED +fi echo "addr=$addr" >>$seq.full # Probe the obtained network. @@ -112,17 +87,17 @@ echo "Exit status: $?" | tee -a $seq.full echo "-s pmcd -m probe=$addr/31" -r >> $seq.full echo "-s pmcd -m probe=INET_ADDR/31 -r" -pmfind -s pmcd -m probe=$addr/30 -r | _resolved_filter +pmfind -s pmcd -m probe=$addr/31 -r | _resolved_filter echo "Exit status: $?" | tee -a $seq.full echo "-q -m probe=$addr/30" >> $seq.full echo "-q -m probe=INET_ADDR/30" -pmfind -q -m probe=$addr/29 | _unresolved_filter +pmfind -q -m probe=$addr/30 | _unresolved_filter echo "Exit status: $?" 
| tee -a $seq.full echo "-q -s pmcd -m probe=$addr/29 --resolve" >> $seq.full echo "-q -s pmcd -m probe=INET_ADDR/29 --resolve" -pmfind -q -s pmcd -m probe=$addr/28 --resolve | _resolved_filter +pmfind -q -s pmcd -m probe=$addr/29 --resolve | _resolved_filter echo "Exit status: $?" | tee -a $seq.full echo "-q -s pmcd -m probe=$addr/28,maxThreads=8" >> $seq.full From 5fc04930ef9357726c1ed53412c2ef3d825ad6fb Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Wed, 27 Sep 2023 13:24:27 +1000 Subject: [PATCH 13/14] qa/1190: treat pmlogger "activating" the same as "active" Prevents some failures, especially when called from check.callback ... qa/031 and qa/525 both seem susceptible to tripping up here. --- qa/1190 | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/qa/1190 b/qa/1190 index 133b1bf288..4c9725c9dc 100755 --- a/qa/1190 +++ b/qa/1190 @@ -166,6 +166,18 @@ fi $PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]ID|[p]mlogger .*-P ' >$tmp.out n=`grep -v PID <$tmp.out | wc -l | sed -e 's/ //g'` +if [ "$n" -eq 0 ] +then + # if systemctl is in the frame, check for Activating ... we may + # be checking a bit too soon and consider Activating the same as + # Active for the current purposes + # + _systemctl_status pmlogger >$tmp.activating + if grep "Active: activating" $tmp.activating >/dev/null + then + n=1 + fi +fi if [ "$n" -eq 1 ] then $check || echo "Count of primary pmlogger's ... OK" From 1039a92ab95b62232e94505815133ec41a6d1445 Mon Sep 17 00:00:00 2001 From: Ken McDonell Date: Thu, 28 Sep 2023 07:33:00 +1000 Subject: [PATCH 14/14] qa/766: deal with pmcd found in peer VMs in the QA Farm This is expected (now), but not deterministic. --- qa/766 | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/qa/766 b/qa/766 index db6420b00f..8cff5ea8d1 100755 --- a/qa/766 +++ b/qa/766 @@ -85,9 +85,18 @@ echo "-m probe=INET_ADDR/32" pmfind -m probe=$addr/32 | _unresolved_filter echo "Exit status: $?" 
| tee -a $seq.full +# for this one, we should not be surprised to find peer VMs +# in the QA Farm ... there is no DNS to support the reverse +# lookup, so they will be IP addr on the same Class C subnet +# +subnet=`echo $addr | sed -e 's/\.[^.]*$/./' -e 's/\./\\\\./g'` +echo "subnet=$subnet" >>$seq.full echo "-s pmcd -m probe=$addr/31" -r >> $seq.full echo "-s pmcd -m probe=INET_ADDR/31 -r" -pmfind -s pmcd -m probe=$addr/31 -r | _resolved_filter +pmfind -s pmcd -m probe=$addr/31 -r \ +| tee -a $seq.full \ +| _resolved_filter \ +| grep -v "pcp://$subnet" echo "Exit status: $?" | tee -a $seq.full echo "-q -m probe=$addr/30" >> $seq.full