From 5ea9777b39ecc69864df040bec2c3af9c7204f88 Mon Sep 17 00:00:00 2001 From: Michele Pagot Date: Fri, 13 Sep 2024 16:04:03 +0200 Subject: [PATCH] Delay the crash system command One of the HanaSR tests is about crashing one cluster node running HANA. The crash command is executed through an ssh channel. The problem is that, as soon as the system crashes, the ssh connection is interrupted, leaving the ssh client blocked. The idea is: compose the remotely executed command with a sleep followed by the crash, and run the two in background. This gives the ssh client time to close the session before the crash happens. Remove the timeout=0 behavior, and stop forwarding all the args content to run_ssh_command. --- lib/sles4sap_publiccloud.pm | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/sles4sap_publiccloud.pm b/lib/sles4sap_publiccloud.pm index 3f0588524f01..b2b2a92dbfdd 100644 --- a/lib/sles4sap_publiccloud.pm +++ b/lib/sles4sap_publiccloud.pm @@ -369,9 +369,12 @@ sub stop_hana { my %commands = ( stop => 'HDB stop', kill => 'HDB kill -x', + # -b is for running the command in background # echo b > /proc/sysrq-trigger is for crashing the remote node + # sleep 5 is to give time sudo to put the command execution in background and + # to ssh to return, both before to trigger the crash # This also work in conjunction with ssh -fn arguments - crash => 'sudo su -c "echo b > /proc/sysrq-trigger &"' + crash => 'sudo -b sh -c "sleep 5; echo b > /proc/sysrq-trigger"' ); croak("HANA stop method '$args{method}' unknown.") unless $commands{$args{method}}; @@ -400,7 +403,8 @@ sub stop_hana { cmd => $cmd, # This timeout is to ensure the run_ssh_command is executed in a reasonable amount of time. # It is not about how much time the crash is expected to take in the SUT. - # Also consider that internally run_ssh_command is using this value for two different guard mechanisms. 
+ # Expect run_ssh_command to return immediately, and 10 has nothing to do with the value of sleep 5 executed remotely. + # Also consider that internally run_ssh_command is using this value for two different guard mechanism. timeout => 10, # This test does not care about output, # setting this in conjunction with timeout >0 result in the internal implementation of @@ -408,6 +412,13 @@ sub stop_hana { rc_only => 1, ssh_opts => $crash_ssh_opts); + # crash trigger command: + # - is executed in background + # - has sleep 5 executed remotely. + # run_ssh_command return immediately, so before the remote system execute the crash command. + # So the test execution has to sleep now, waiting that remote system has time to execute the crash procedure. + sleep 10; + # Wait till ssh disappear record_info("Wait ssh disappear start"); my $out = $self->{my_instance}->wait_for_ssh(timeout => 60, wait_stop => 1);