Skip to content

Commit

Permalink
hmi: Add test case to trigger TOD topology switch.
Browse files Browse the repository at this point in the history
This test triggers a TOD topology failover on all the chips to exercise the
OPAL TI and panic paths and make sure the OS does not get stuck while going down.

This test needs the following skiboot and kernel commits to pass:

skiboot:
  497734984 opal/hmi: set a flag to inform OS that TOD/TB has failed.
  ca349b836 opal/hmi: Don't retry TOD recovery if it is already in failed state.
  017da88b2 opal/hmi: Fix double unlock of hmi lock in failure path.

kernel:
  http://patchwork.ozlabs.org/patch/1051379/

Signed-off-by: Mahesh Salgaonkar <[email protected]>
  • Loading branch information
maheshsal committed Mar 10, 2019
1 parent 0cf940f commit dbd9430
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 0 deletions.
12 changes: 12 additions & 0 deletions common/OPexpect.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,15 @@ def expect(self, pattern, timeout=-1, searchwindowsize=-1, async=False):
raise PlatformError(state, log)

return r - len(op_patterns)

'''
Provide a function that does not raise any exception. This is useful for
tests that do not consider a panic or OPAL TI to be a test failure. This
allows such testcases to take control and look for a specific pattern in a
system crash scenario.
'''
def expect_no_fail(self, pattern, timeout=-1, searchwindowsize=-1, async_=False, **kwargs):
    '''
    Wait for ``pattern`` using the parent class's expect() directly.

    This bypasses the expect() override of this class, so the caller gets
    the raw match index back and can decide for itself whether a crash
    signature on the console is a failure or the expected outcome.

    pattern          -- pattern or list of patterns, passed through as is.
    timeout          -- passed through to the parent expect().
    searchwindowsize -- passed through to the parent expect().
    async_           -- accepted for signature compatibility only; it is
                        not forwarded, so the wait is always synchronous.
    **kwargs         -- only the legacy ``async`` keyword is tolerated.

    Returns whatever the parent expect() returns.
    Raises TypeError for any unknown keyword argument.
    '''
    # ``async`` is a reserved word from Python 3.7 onwards and can no longer
    # be used as a parameter name. Old callers that still pass it by keyword
    # (only possible on older interpreters) land in kwargs; it was never
    # used by this method, so it is simply discarded.
    kwargs.pop('async', None)
    if kwargs:
        raise TypeError("Unknown keyword arguments: %s" % sorted(kwargs))

    return super(spawn, self).expect(pattern,
                                     timeout=timeout,
                                     searchwindowsize=searchwindowsize)
1 change: 1 addition & 0 deletions common/OpTestConstants.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ class OpTestConstants():
TFMR_DEC_PARITY_ERROR = "0006080000000000"
TFMR_PURR_PARITY_ERROR = "0004080000000000"
TFMR_SPURR_PARITY_ERROR = "0005080000000000"
HMI_TOD_TOPOLOGY_FAILOVER = 7

# CPU sleep states constants
GET_CPU_SLEEP_STATE2 = "cat /sys/devices/system/cpu/cpu*/cpuidle/state2/disable"
Expand Down
123 changes: 123 additions & 0 deletions testcases/OpTestHMIHandling.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,26 @@ def clear_stop(self):
else:
self.assertTrue(False, "OpTestHMIHandling failed to recover from previous OpSystemState.UNKNOWN_BAD")

def handle_panic(self):
    '''
    Wait for the kernel to panic on an unrecoverable HMI and then reboot.

    First the panic message is awaited on the host console, then the
    "ISTEP" marker of the following boot. Each wait is limited to 120
    seconds. When both show up the system state is set to IPLing and the
    system is driven back to the OS state; otherwise the test fails.
    '''
    panic_patterns = ["Kernel panic - not syncing: Unrecoverable HMI exception", pexpect.TIMEOUT, pexpect.EOF]
    # Only index 0 (the panic message) counts as success; TIMEOUT and EOF
    # are listed so that they are reported as an index instead of raised.
    if self.cv_SYSTEM.console.pty.expect_no_fail(panic_patterns, timeout=120) != 0:
        self.assertTrue(False, "OpTestHMIHandling: No panic after topology recovery failure")
        return

    reboot_patterns = ["ISTEP", pexpect.TIMEOUT, pexpect.EOF]
    if self.cv_SYSTEM.console.pty.expect_no_fail(reboot_patterns, timeout=120) != 0:
        self.assertTrue(False, "OpTestHMIHandling: System failing to reboot after topology recovery failure")
        return

    # The reboot has started: record that and wait for the OS to come up.
    self.cv_SYSTEM.set_state(OpSystemState.IPLing)
    self.cv_SYSTEM.goto_state(OpSystemState.OS)

def handle_OpalTI(self):
    '''
    Wait for the system to reboot after an OPAL TI.

    The "ISTEP" marker is awaited on the host console for up to 120
    seconds. When it shows up the system state is set to IPLing and the
    system is driven back to the OS state; otherwise the test fails.
    '''
    reboot_patterns = ["ISTEP", pexpect.TIMEOUT, pexpect.EOF]
    matched = self.cv_SYSTEM.console.pty.expect_no_fail(reboot_patterns, timeout=120)
    # Anything but index 0 means TIMEOUT or EOF was hit before "ISTEP".
    if matched != 0:
        self.assertTrue(False, "System failed to reboot after OPAL TI")
    else:
        self.cv_SYSTEM.set_state(OpSystemState.IPLing)
        self.cv_SYSTEM.goto_state(OpSystemState.OS)

def handle_ipl(self):
rc = self.cv_SYSTEM.console.pty.expect(["ISTEP", pexpect.TIMEOUT, pexpect.EOF], timeout=120)
if rc == 0:
Expand Down Expand Up @@ -202,6 +222,54 @@ def form_scom_addr(self, addr, core):
log.debug(val)
return val

def is_node_present(self, node):
    '''
    Check whether the given device tree node exists on the host.

    The system is brought to the OS state and ``ls <node>`` is run over
    the console.

    node -- path to look for.

    Returns 1 when the command succeeds, 0 when it raises CommandFailed.
    '''
    self.cv_SYSTEM.goto_state(OpSystemState.OS)
    try:
        self.cv_HOST.host_run_command("ls %s" % node, console=1)
    except CommandFailed:
        # A failing ``ls`` is taken to mean the node is not present.
        return 0
    else:
        return 1

def get_OpalSwXstop(self):
    '''
    Return the current opal-sw-xstop setting of the host.

    The processor generation is queried and stored in self.proc_gen, the
    system is brought to the OS state and the skiboot nvram partition is
    read with ``nvram --print-config``.

    Returns the output of the nvram command as returned by
    host_run_command() (presumably a list of output lines -- confirm
    against host_run_command), or the string "enable" / "disable" when
    the option is not set and the per-generation default applies.

    Fails the test when nvram cannot be queried, or when the option is
    not set and no default is known for the processor generation.
    '''
    self.proc_gen = self.cv_HOST.host_get_proc_gen(console=1)
    self.cv_SYSTEM.goto_state(OpSystemState.OS)
    try:
        return self.cv_HOST.host_run_command("nvram -p ibm,skiboot --print-config=opal-sw-xstop", console=1)
    except CommandFailed as cf:
        # On a fresh system this isn't set. The command will exit with
        # exitcode = 255.
        #   On power8 we treat this as enabled.
        #   On power9 we treat this as disabled.
        if cf.exitcode != 255:
            self.assertTrue(False, "get_OpalSwXstop() failed to query nvram.")
        if self.proc_gen in ["POWER8", "POWER8E"]:
            return "enable"
        if self.proc_gen in ["POWER9"]:
            return "disable"
        # Any other generation used to fall through to a return of an
        # unbound local and die with UnboundLocalError; report it as a
        # proper test failure instead.
        self.assertTrue(False, "get_OpalSwXstop(): opal-sw-xstop is not set and no default is known for %s" % self.proc_gen)

def set_OpalSwXstop(self, val):
    '''
    Make sure the opal-sw-xstop nvram option is set to ``val``.

    Nothing is written when the current setting already contains ``val``.
    Otherwise the skiboot nvram partition is updated and the setting is
    read back; the test fails when the read-back value does not contain
    ``val``.

    val -- desired setting, e.g. "enable" or "disable".
    '''
    self.cv_SYSTEM.goto_state(OpSystemState.OS)
    if val in self.get_OpalSwXstop():
        return

    self.cv_HOST.host_run_command("nvram -p ibm,skiboot --update-config opal-sw-xstop=%s" % val, console=1)

    # Read the option back to verify that the update took effect.
    if val not in self.get_OpalSwXstop():
        self.assertTrue(False, "Failed to set opal-sw-xstop config to %s" % val)

def clearGardEntries(self):
self.cv_SYSTEM.goto_state(OpSystemState.OS)
self.util.PingFunc(self.cv_HOST.ip, BMC_CONST.PING_RETRY_POWERCYCLE)
Expand Down Expand Up @@ -287,6 +355,8 @@ def _testHMIHandling(self, i_test):
self._testTFMR_Errors(BMC_CONST.TFMR_DEC_PARITY_ERROR)
self._testTFMR_Errors(BMC_CONST.TFMR_PURR_PARITY_ERROR)
self._testTFMR_Errors(BMC_CONST.TFMR_SPURR_PARITY_ERROR)
elif l_test == BMC_CONST.HMI_TOD_TOPOLOGY_FAILOVER:
self._test_tod_topology_failover()
else:
raise Exception("Please provide valid test case")
l_con.run_command("dmesg -C")
Expand Down Expand Up @@ -407,6 +477,43 @@ def _test_malfunction_alert(self):
console.pty.sendline(l_cmd)
self.handle_ipl()

def _test_tod_topology_failover(self):
    '''
    Test the error path of HMI TOD topology failover.

    When HMI recovery fails, TOD/TB goes into an invalid state and stops
    running. The kernel should then either
      a) panic, followed by a clean reboot (opal-sw-xstop=disable), or
      b) cause an OPAL TI by triggering a sw checkstop to the OCC
         (opal-sw-xstop=enable).
    In both cases there must be no hang at the Linux OS level.

    To simulate the error condition, a TOD topology failover is injected
    on every chip listed in self.l_dic with a single console command
    line, and the outcome matching the current opal-sw-xstop setting is
    then awaited.
    '''
    tod_scom = "0x40000"
    failover_value = "0x4000000000000000"

    # With opal-sw-xstop disabled a kernel panic is expected, otherwise
    # an OPAL TI.
    expect_panic = "disable" in self.get_OpalSwXstop()

    host_console = self.cv_SYSTEM.console
    # One putscom per chip, chained with ';' so that all of them are
    # sent as one line. The first item of each self.l_dic entry is used
    # as the chip id.
    inject_cmd = "".join(
        "PATH=/usr/local/sbin:$PATH putscom -c %s %s %s; " % (entry[0], tod_scom, failover_value)
        for entry in self.l_dic)

    host_console.pty.sendline(inject_cmd)
    if expect_panic:
        self.handle_panic()
    else:
        self.handle_OpalTI()

def _test_hyp_resource_err(self):
'''
This function is used to test HMI: Hypervisor resource error
Expand Down Expand Up @@ -563,6 +670,20 @@ def runTest(self):
self._testHMIHandling(BMC_CONST.HMI_MALFUNCTION_ALERT)
self.clearGardEntries()

class TodTopologyFailoverPanic(OpTestHMIHandling):
    '''TOD topology failover test with opal-sw-xstop set to "disable".'''

    def runTest(self):
        # Set the nvram option first, then run the failover injection.
        self.set_OpalSwXstop("disable")
        self._testHMIHandling(BMC_CONST.HMI_TOD_TOPOLOGY_FAILOVER)

class TodTopologyFailoverOpalTI(OpTestHMIHandling):
    '''
    TOD topology failover test with opal-sw-xstop set to "enable".

    Skipped when the ibm,sw-checkstop-fir device tree node is missing.
    '''

    def runTest(self):
        sw_xstop_node = "/proc/device-tree/ibm,sw-checkstop-fir"
        if self.is_node_present(sw_xstop_node) != 1:
            self.skipTest("OPAL TI not supported on this system.")
        else:
            self.set_OpalSwXstop("enable")
            self._testHMIHandling(BMC_CONST.HMI_TOD_TOPOLOGY_FAILOVER)

class HypervisorResourceError(OpTestHMIHandling):
def runTest(self):
self._testHMIHandling(BMC_CONST.HMI_HYPERVISOR_RESOURCE_ERROR)
Expand All @@ -576,6 +697,8 @@ def unrecoverable_suite():
s = unittest.TestSuite()
s.addTest(MalfunctionAlert())
s.addTest(HypervisorResourceError())
s.addTest(TodTopologyFailoverPanic())
s.addTest(TodTopologyFailoverOpalTI())
s.addTest(ClearGard())
return s

Expand Down

0 comments on commit dbd9430

Please sign in to comment.