Skip to content

Commit

Permalink
Addressing minor changes identified
Browse files Browse the repository at this point in the history
Prereq, xname_errs removal
removing the unused func: do_nodes_power_off
and its testcases
  • Loading branch information
Shivaprasad Ashok Metimath committed Mar 14, 2024
1 parent 2ee1ab0 commit 882a059
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 238 deletions.
42 changes: 5 additions & 37 deletions sat/apiclient/pcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,40 +33,6 @@
class PCSError(APIError):
"""An error occurred in PCS."""

def __init__(self, message, xname_errs=None):
"""Create a new PCSError with the given message and info about the failing xnames.
Args:
message (str): the error message
xname_errs (list): a list of dictionaries representing the failures for
the individual components that failed. Each dict should have the
following keys:
e: the error code
err_msg: the error message
xname: the actual xname which failed
"""
self.message = message
self.xname_errs = xname_errs if xname_errs is not None else []
self.xnames = [xname_err['xname'] for xname_err in self.xname_errs
if 'xname' in xname_err]

def __str__(self):
"""Convert to str."""
if not self.xname_errs:
return self.message
else:
# A mapping from a tuple of (err_code, err_msg) to a list of xnames
# with that combination of err_code and err_msg.
xnames_by_err = defaultdict(list)
for xname_err in self.xname_errs:
xnames_by_err[(xname_err.get('e'), xname_err.get('err_msg'))].append(xname_err.get('xname'))

xname_err_summary = '\n'.join([f'xname(s) ({", ".join(xnames)}) failed with '
f'e={err_info[0]} and err_msg="{err_info[1]}"'
for err_info, xnames in xnames_by_err.items()])

return f'{self.message}\n{xname_err_summary}'


class PCSClient(APIGatewayClient):
"""Client for the Power Control Service."""
Expand All @@ -81,7 +47,6 @@ def set_xnames_power_state(self, xnames, power_state, force=False, recursive=Fal
power_state (str): the desired power state. Either "on" or "off".
force (bool): if True, disable checks and force the power operation.
recursive (bool): if True, power on component and its descendants.
prereq (bool): if True, power on component and its ancestors.
Returns:
None
Expand Down Expand Up @@ -111,6 +76,9 @@ def set_xnames_power_state(self, xnames, power_state, force=False, recursive=Fal
try:
for xname in xnames:
xname_instance = XName(xname)
# Current if block handles to fetch children of chassis & computemodule only.
# TO DO: It will have to be updated to handle any other component type i/p
# INFO: current sat swap usually is executed for computemodule only
if xname_instance.get_type() == 'Chassis':
target_xnames.update(
hsm_client.query_components(
Expand All @@ -134,8 +102,8 @@ def set_xnames_power_state(self, xnames, power_state, force=False, recursive=Fal
try:
self.post('transitions', json=params).json()
except APIError as err:
raise PCSError(f'Power {power_state} operation failed for xname(s).',
xname_errs=xnames) from err
raise PCSError(f'Power {power_state} operation failed for xname(s): '
f'{", ".join(target_xnames)}.') from err

def get_xnames_power_state(self, xnames):
"""Get the power state of the given xnames from PCS.
Expand Down
49 changes: 0 additions & 49 deletions sat/cli/bootsys/power.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,52 +113,3 @@ def member_has_completed(self, member):
return False

return current_state == self.power_state


def do_nodes_power_off(timeout):
"""Ensure the compute and application nodes (UANs) are powered off.
Args:
timeout (int): the timeout for waiting for nodes to reach powered off
state according to PCS after turning off their power.
Returns:
A tuple of:
timed_out_nodes: a set of nodes that timed out waiting to reach
power state 'off' according to pcs.
failed_nodes: a set of nodes that failed to power off with pcs
"""
inf = engine()
on_nodes = set(get_nodes_by_role_and_state('compute', 'on') +
get_nodes_by_role_and_state('application', 'on'))
num_on_nodes = len(on_nodes)

if not on_nodes:
# All nodes are already off
return set(), set()

LOGGER.info(f'Forcing power off of {num_on_nodes} compute or application '
f'{inf.plural("node")} still powered on: {", ".join(on_nodes)}')

wait_nodes = on_nodes
failed_nodes = set()
pcs_client = PCSClient(SATSession())
try:
pcs_client.set_xnames_power_state(list(on_nodes), 'off', force=True)
except PCSError as err:
LOGGER.warning(err)
if err.xnames:
failed_nodes = set(err.xnames)
# Only wait on the nodes that did not have an error powering off
wait_nodes = set(on_nodes) - failed_nodes
else:
# This probably indicates all nodes failed to be powered off
return set(), on_nodes

num_wait_nodes = len(wait_nodes)
LOGGER.info(f'Waiting {timeout} seconds until {num_wait_nodes} {inf.plural("node", num_wait_nodes)} '
f'reach powered off state according to PCS.')

waiter = PCSPowerWaiter(wait_nodes, 'off', timeout)
timed_out_nodes = waiter.wait_for_completion()
return timed_out_nodes, failed_nodes
153 changes: 1 addition & 152 deletions tests/cli/bootsys/test_power.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@
from unittest.mock import patch

from sat.apiclient import APIError
from sat.cli.bootsys.power import (PCSError, PCSPowerWaiter,
do_nodes_power_off,
from sat.cli.bootsys.power import (PCSPowerWaiter,
get_nodes_by_role_and_state)
from tests.common import ExtendedTestCase

Expand Down Expand Up @@ -96,156 +95,6 @@ def test_member_has_completed_api_error(self):
self.assert_in_element(f'Failed to query power state: {api_err_msg}', cm.output)


class TestDoNodesPowerOff(ExtendedTestCase):
"""Test the do_nodes_power_off function."""

def setUp(self):
"""Set up some mocks."""
self.mock_pcs_client = patch('sat.cli.bootsys.power.PCSClient').start().return_value
self.mock_sat_session = patch('sat.cli.bootsys.power.SATSession').start()

self.timeout = 10 # does not actually affect duration; just for asserts
self.compute_nodes = ['x1000c0s0b0n0', 'x1000c0s0b0n1']
self.application_nodes = ['x3000c0s24b1n0', 'x3000c0s24b2n0']
self.all_nodes = set(self.compute_nodes + self.application_nodes)
self.timed_out_nodes = set()

def mock_get_nodes(role, _):
if role == 'compute':
return self.compute_nodes
elif role == 'application':
return self.application_nodes
else:
return []

self.mock_get_nodes = patch('sat.cli.bootsys.power.get_nodes_by_role_and_state',
mock_get_nodes).start()

self.mock_pcs_waiter_cls = patch('sat.cli.bootsys.power.PCSPowerWaiter').start()
self.mock_pcs_waiter = self.mock_pcs_waiter_cls.return_value
self.mock_pcs_waiter.wait_for_completion.return_value = self.timed_out_nodes

self.mock_print = patch('builtins.print').start()

def tearDown(self):
"""Stop all patches."""
patch.stopall()

def assert_log_calls(self, logs, num_wait=None, include_wait_call=True):
"""Assert that the expected logging calls are made.
Args:
logs (object): the recording object returned from the assertLogs()
context manager. has two attributes: `output` and `records`
num_wait (int): the number of nodes waited on. defaults to
len(self.all_nodes)
include_wait_call (bool): whether to include a print statement
for a wait.
"""
num_wait = num_wait if num_wait is not None else len(self.all_nodes)
calls = [f'Forcing power off of {len(self.all_nodes)} compute or application '
f'nodes still powered on: {", ".join(self.all_nodes)}']
if include_wait_call:
calls.append(f'Waiting {self.timeout} seconds until {num_wait} nodes '
f'reach powered off state according to PCS.')
for c in calls:
self.assert_in_element(c, logs.output)

def assert_pcs_client_call(self):
"""Assert the call is made to the PCSClient to power off nodes."""
self.mock_pcs_client.set_xnames_power_state.assert_called_once_with(
list(self.all_nodes), 'off', force=True
)

def test_do_nodes_power_off_already_off(self):
"""Test do_nodes_power_off when all nodes are already off."""
self.compute_nodes = []
self.application_nodes = []
timed_out, failed = do_nodes_power_off(self.timeout)

self.assertEqual(set(), timed_out)
self.assertEqual(set(), failed)

self.mock_pcs_client.set_xnames_power_state.assert_not_called()
self.mock_pcs_waiter_cls.assert_not_called()

def test_do_nodes_power_off_success(self):
"""Test do_nodes_power_off in the successful case."""
with self.assertLogs(level=logging.INFO) as cm:
timed_out, failed = do_nodes_power_off(self.timeout)

self.assert_pcs_client_call()
self.assertEqual(set(), timed_out)
self.assertEqual(set(), failed)
self.mock_pcs_waiter_cls.assert_called_once_with(
self.all_nodes, 'off', self.timeout
)
self.mock_pcs_waiter.wait_for_completion.assert_called_once_with()
self.assert_log_calls(cm)

def test_do_nodes_power_off_one_failed(self):
"""Test do_nodes_power_off when one fails to power off and the rest succeed."""
expected_failed = {self.compute_nodes[0]}
failed_xname_errs = [
{
'e': 1,
'err_msg': 'NodeBMC unreachable',
'xname': self.compute_nodes[0]
}
]
pcs_err_msg = 'Power off operation failed.'
pcs_err = PCSError(pcs_err_msg, xname_errs=failed_xname_errs)
self.mock_pcs_client.set_xnames_power_state.side_effect = pcs_err

with self.assertLogs(level=logging.INFO) as cm:
timed_out, failed = do_nodes_power_off(self.timeout)

self.assert_in_element(f'{pcs_err_msg}\n'
f'xname(s) ({self.compute_nodes[0]}) failed with '
f'e={failed_xname_errs[0]["e"]} and '
f'err_msg="{failed_xname_errs[0]["err_msg"]}"',
cm.output)
self.assert_pcs_client_call()
self.assertEqual(set(), timed_out)
self.assertEqual(expected_failed, failed)
self.mock_pcs_waiter_cls.assert_called_once_with(
self.all_nodes - expected_failed, 'off', self.timeout
)
self.mock_pcs_waiter.wait_for_completion.assert_called_once_with()
self.assert_log_calls(cm, num_wait=len(self.all_nodes - expected_failed))

def test_do_nodes_power_off_pcs_failed(self):
"""Test do_nodes_power_off when the PCS power off request fails."""
pcs_err_msg = 'PCS did not respond'
pcs_err = PCSError(pcs_err_msg)
expected_failed = self.all_nodes
self.mock_pcs_client.set_xnames_power_state.side_effect = pcs_err

with self.assertLogs(level=logging.INFO) as cm:
timed_out, failed = do_nodes_power_off(self.timeout)

self.assert_in_element(pcs_err_msg, cm.output)
self.assert_pcs_client_call()
self.assertEqual(set(), timed_out)
self.assertEqual(expected_failed, failed)
self.mock_pcs_waiter_cls.assert_not_called()
self.assert_log_calls(cm, include_wait_call=False)

def test_do_nodes_power_off_one_timed_out(self):
"""Test do_node_power_off when one node times out."""
expected_timed_out = {self.compute_nodes[0]}
self.mock_pcs_waiter.wait_for_completion.return_value = expected_timed_out

with self.assertLogs(level=logging.INFO) as cm:
timed_out, failed = do_nodes_power_off(self.timeout)

self.assert_pcs_client_call()
self.assertEqual(expected_timed_out, timed_out)
self.assertEqual(set(), failed)
self.mock_pcs_waiter_cls.assert_called_once_with(self.all_nodes, 'off', self.timeout)
self.assert_log_calls(cm)


class TestGetNodesByRoleAndState(unittest.TestCase):
"""Test the get_nodes_by_role_and_state function."""

Expand Down

0 comments on commit 882a059

Please sign in to comment.