Skip to content

Commit

Permalink
CRAYSAT-956: Replace CAPMC client support with PCS
Browse files Browse the repository at this point in the history
This commit implements a basic PCSClient class which is mostly
compatible with the CAPMCClient class. Usage of the CAPMCClient class
was replaced with the new PCSClient class.
  • Loading branch information
jack-stanek-hpe authored and Shivaprasad Ashok Metimath committed Oct 24, 2023
1 parent db37f2e commit 618f4a3
Show file tree
Hide file tree
Showing 11 changed files with 431 additions and 139 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [3.25.5] - 2023-09-26

### Added
- Added support for the Power Control Service (PCS). Functionality using CAPMC
was changed to use PCS instead.

## [3.25.4] - 2023-09-01

### Fixed
Expand Down
192 changes: 192 additions & 0 deletions sat/apiclient/pcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
#
# MIT License
#
# (C) Copyright 2021-2023 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
"""
Basic client library for PCS.
"""
from collections import defaultdict

from csm_api_client.service.gateway import APIError, APIGatewayClient
from csm_api_client.service.hsm import HSMClient


class PCSError(APIError):
    """An error occurred in PCS."""

    def __init__(self, message, xname_errs=None):
        """Create a new PCSError with the given message and info about the failing xnames.

        Args:
            message (str): the error message
            xname_errs (list): a list of dictionaries representing the failures for
                the individual components that failed. Each dict should have the
                following keys:
                    e: the error code
                    err_msg: the error message
                    xname: the actual xname which failed
                Any entry that is not a dict is treated as a bare xname with
                no error code or message, so that the exception can always
                be rendered as a string.
        """
        self.message = message
        # Normalize into a fresh list of dicts so __str__ can never fail on
        # an unexpected entry type and so the caller's list is not aliased.
        self.xname_errs = [self._normalize_xname_err(entry)
                           for entry in (xname_errs or [])]
        self.xnames = [xname_err['xname'] for xname_err in self.xname_errs
                       if 'xname' in xname_err]

    @staticmethod
    def _normalize_xname_err(entry):
        """Return `entry` as a per-xname error dict.

        Args:
            entry: either a dict describing a per-xname failure, or any
                other object, which is taken to be the xname itself.

        Returns:
            dict: `entry` unchanged if it is a dict, otherwise a dict with
                only the 'xname' key set to `str(entry)`.
        """
        if isinstance(entry, dict):
            return entry
        return {'xname': str(entry)}

    def __str__(self):
        """Convert to str.

        Returns:
            str: the message alone if there are no per-xname errors, else
                the message followed by one line per distinct combination
                of error code and error message, listing the affected xnames.
        """
        if not self.xname_errs:
            return self.message

        # A mapping from a tuple of (err_code, err_msg) to a list of xnames
        # with that combination of err_code and err_msg.
        xnames_by_err = defaultdict(list)
        for xname_err in self.xname_errs:
            err_key = (xname_err.get('e'), xname_err.get('err_msg'))
            # str() plus a placeholder keeps the join below from raising a
            # TypeError when an entry has no (or a non-str) 'xname'.
            xnames_by_err[err_key].append(str(xname_err.get('xname', '<unknown>')))

        xname_err_summary = '\n'.join(
            f'xname(s) ({", ".join(xnames)}) failed with '
            f'e={err_code} and err_msg="{err_msg}"'
            for (err_code, err_msg), xnames in xnames_by_err.items()
        )

        return f'{self.message}\n{xname_err_summary}'


class PCSClient(APIGatewayClient):
    """Client for the Power Control Service."""
    base_resource_path = 'power-control/v1/'

    # Operations accepted by set_xnames_power_state. A tuple (rather than a
    # set) keeps the "Must be ..." error message in a stable order.
    _ALLOWED_POWER_STATES = ('on', 'off', 'soft-off', 'soft-restart',
                             'hard-restart', 'init', 'force-off')

    # The operation substituted for the requested one when force=True.
    _FORCED_POWER_STATES = {
        'off': 'force-off',
        'soft-off': 'force-off',
        'soft-restart': 'hard-restart',
    }

    @staticmethod
    def _is_same_or_ancestor(candidate, xname):
        """Check whether `candidate` is `xname` itself or one of its ancestors.

        A plain prefix test is not sufficient: 'x1000c0s1' is a string
        prefix of 'x1000c0s10b0n0' but is not its ancestor. The prefix must
        end on a component boundary, i.e. whatever follows it in `xname`
        must not continue the trailing number of `candidate`.

        Args:
            candidate (str): the xname of the possible ancestor.
            xname (str): the xname of the possible descendant.

        Returns:
            bool: True if `candidate` equals `xname` or is an ancestor of it.
        """
        if not xname.startswith(candidate):
            return False
        # Empty remainder (equal xnames) -> ''.isdigit() is False -> True.
        return not xname[len(candidate):][:1].isdigit()

    def set_xnames_power_state(self, xnames, power_state, force=False, recursive=False, prereq=False):
        """Set the power state of the given xnames.

        Args:
            xnames (list): the xnames (str) to perform the power operation
                against.
            power_state (str): the desired power operation, case-insensitive.
                One of "on", "off", "soft-off", "soft-restart",
                "hard-restart", "init", or "force-off".
            force (bool): if True, replace "off" and "soft-off" with
                "force-off", and "soft-restart" with "hard-restart".
            recursive (bool): if True, also operate on the node components
                that HSM reports as descendants of the given xnames.
            prereq (bool): if True, also operate on the components known to
                HSM that are ancestors of the given xnames.

        Returns:
            None

        Raises:
            ValueError: if the given `power_state` is not one of the allowed
                operations.
            PCSError: if querying HSM for related components fails, or if
                the attempt to perform the power operation with PCS fails.
                This exception contains more specific information about
                the failure, which will be included in its __str__.
        """
        power_state = power_state.lower()
        if power_state not in self._ALLOWED_POWER_STATES:
            allowed_states_str = ", ".join(f'"{state}"' for state in self._ALLOWED_POWER_STATES)
            raise ValueError(f'Invalid power state {power_state} given. Must be {allowed_states_str}')

        if force:
            power_state = self._FORCED_POWER_STATES.get(power_state, power_state)

        # Materialize once; xnames is iterated several times below.
        xnames = list(xnames)

        # The requested xnames are always targeted; `recursive` and `prereq`
        # only ever add related components to this set.
        target_xnames = set(xnames)

        hsm_client = HSMClient(self.session) if (recursive or prereq) else None
        if recursive:
            try:
                target_xnames.update(
                    component['XName']
                    for xname in xnames
                    for component in hsm_client.get_node_components(ancestor=xname)
                )
            except APIError as err:
                raise PCSError(f'Could not retrieve descendent components for xnames: {err}') from err
        if prereq:
            try:
                target_xnames.update(
                    component['XName']
                    for component in hsm_client.get_all_components()
                    if any(self._is_same_or_ancestor(component['XName'], xname)
                           for xname in xnames)
                )
            except APIError as err:
                raise PCSError(f'Could not query ancestor components for xnames: {err}') from err

        # Sorted so the request body and any error report are deterministic.
        ordered_targets = sorted(target_xnames)
        params = {
            'operation': power_state,
            # NOTE(review): -1 presumably means "no deadline" for the
            # transition tasks - confirm against the PCS API specification.
            'taskDeadlineMinutes': -1,
            'location': [{'xname': xname} for xname in ordered_targets],
        }
        try:
            self.post('transitions', json=params).json()
        except APIError as err:
            # PCSError expects a list of dicts, one per failed xname. The
            # request failed as a whole, so every target shares its error.
            raise PCSError(
                f'Power {power_state} operation failed for xname(s).',
                xname_errs=[{'xname': xname, 'err_msg': str(err)}
                            for xname in ordered_targets]
            ) from err

    def get_xnames_power_state(self, xnames):
        """Get the power state of the given xnames from PCS.

        Args:
            xnames (list): the xnames (str) to get power state for.

        Returns:
            dict: a dictionary whose keys are the power states and whose values
                are lists of xnames in those power states.

        Raises:
            PCSError: if the request to get power state fails or the
                response does not have the expected structure.
        """
        # De-duplicate, and sort so the query and error text are stable.
        xnames = sorted(set(xnames))
        try:
            resp = self.get('power-status', params={'xname': xnames}).json().get('status')
        except APIError as err:
            raise PCSError(f'Failed to get power state of xname(s): {", ".join(xnames)}') from err
        except ValueError as err:
            raise PCSError(f'Failed to parse power state response from PCS: {err}') from err

        nodes_by_power_state = defaultdict(list)
        try:
            for node in resp:
                nodes_by_power_state[node['powerState']].append(node['xname'])
        except (KeyError, TypeError) as err:
            # TypeError covers a missing 'status' list (resp is None) and
            # entries that are not dicts; KeyError covers missing fields.
            raise PCSError(f'Unexpected power state response from PCS: {resp}') from err
        return nodes_by_power_state

    def get_xname_power_state(self, xname):
        """Get the power state of a single xname from PCS.

        Args:
            xname (str): the xname to get power state of

        Returns:
            str: the power state of the node

        Raises:
            PCSError: if the request to PCS fails or the expected information
                is not returned by the PCS API.
        """
        try:
            resp = self.get('power-status', params={'xname': xname}).json().get('status')
        except APIError as err:
            raise PCSError(f'Failed to get power state for xname {xname}: {err}') from err
        except ValueError as err:
            raise PCSError(f'Failed to parse power state response from PCS '
                           f'for xname {xname}: {err}') from err

        try:
            matching_states = [node['powerState'] for node in resp
                               if node['xname'] == xname]
        except (KeyError, TypeError) as err:
            # See get_xnames_power_state: missing 'status' or malformed entry.
            raise PCSError(f'Unable to determine power state of {xname}. '
                           f'Unexpected response from PCS: {resp}') from err

        if not matching_states:
            raise PCSError(f'Unable to determine power state of {xname}. Not '
                           f'present in response from PCS: {resp}')
        if len(matching_states) > 1:
            raise PCSError(f'Unable to determine power state of {xname}. PCS '
                           f'reported multiple power states: '
                           f'{", ".join(map(str, matching_states))}')
        return matching_states[0]
52 changes: 24 additions & 28 deletions sat/cli/bootsys/cabinet_power.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,10 @@
"""
import logging

from csm_api_client.k8s import load_kube_api
from kubernetes.client import BatchV1Api
from kubernetes.config import ConfigException

from sat.apiclient import APIError, CAPMCClient, HSMClient
from sat.cli.bootsys.power import CAPMCPowerWaiter
from sat.apiclient import APIError, HSMClient
from sat.apiclient.pcs import PCSClient
from sat.cli.bootsys.power import PCSPowerWaiter
from sat.config import get_config_value
from sat.cronjob import recreate_namespaced_stuck_cronjobs
from sat.hms_discovery import (HMSDiscoveryCronJob, HMSDiscoveryError,
HMSDiscoveryScheduledWaiter)
from sat.session import SATSession
Expand Down Expand Up @@ -74,32 +70,32 @@ def do_air_cooled_cabinets_power_off(args):
return

LOGGER.info(f'Powering off {len(node_xnames)} non-management nodes in air-cooled cabinets.')
capmc_client = CAPMCClient(SATSession())
pcs_client = PCSClient(SATSession())
try:
capmc_client.set_xnames_power_state(node_xnames, 'off', force=True)
pcs_client.set_xnames_power_state(node_xnames, 'off', force=True)
except APIError as err:
LOGGER.warning(f'Failed to power off all air-cooled non-management nodes: {err}')

LOGGER.info(f'Waiting for {len(node_xnames)} non-management nodes in air-cooled cabinets '
f'to reach powered off state.')
capmc_waiter = CAPMCPowerWaiter(node_xnames, 'off',
get_config_value('bootsys.capmc_timeout'))
timed_out_xnames = capmc_waiter.wait_for_completion()
pcs_waiter = PCSPowerWaiter(node_xnames, 'off',
get_config_value('bootsys.pcs_timeout'))
timed_out_xnames = pcs_waiter.wait_for_completion()

if timed_out_xnames:
LOGGER.error(f'The following non-management nodes failed to reach the powered off '
f'state after powering off with CAPMC: {timed_out_xnames}')
f'state after powering off with PCS: {timed_out_xnames}')
raise SystemExit(1)

LOGGER.info(f'All {len(node_xnames)} non-management nodes in air-cooled cabinets '
f'reached powered off state according to CAPMC.')
f'reached powered off state according to PCS.')


def get_xnames_for_power_action(hsm_client):
"""Get xnames of RouterModules, ComputeModules, and Chassis.
This helper function gets all the xnames used in a power action (turn on or
turn off) individually since CAPMC does not support recursively powering off
turn off) individually since PCS does not support recursively powering off
disabled components in Shasta v1.5. See CRAYSAT-920.
Returns:
Expand Down Expand Up @@ -131,27 +127,27 @@ def do_liquid_cooled_cabinets_power_off(args):

LOGGER.info(f'Powering off all liquid-cooled chassis, compute modules, and router modules. '
f'({len(xnames_to_power_off)} components total)')
capmc_client = CAPMCClient(SATSession())
pcs_client = PCSClient(SATSession())
try:
capmc_client.set_xnames_power_state(xnames_to_power_off, 'off')
pcs_client.set_xnames_power_state(xnames_to_power_off, 'off')
except APIError as err:
LOGGER.warning(f'Failed to power off all cabinets: {err}')
if hasattr(err, '__cause__'):
LOGGER.warning(f'Cause: {err.__cause__}')

LOGGER.info(f'Waiting for {len(xnames_to_power_off)} components to reach '
f'powered off state.')
capmc_waiter = CAPMCPowerWaiter(xnames_to_power_off, 'off',
get_config_value('bootsys.capmc_timeout'))
timed_out_xnames = capmc_waiter.wait_for_completion()
pcs_waiter = PCSPowerWaiter(xnames_to_power_off, 'off',
get_config_value('bootsys.pcs_timeout'))
timed_out_xnames = pcs_waiter.wait_for_completion()

if timed_out_xnames:
LOGGER.error(f'The following components failed to reach the powered off '
f'state after powering off with CAPMC: {timed_out_xnames}')
f'state after powering off with PCS: {timed_out_xnames}')
raise SystemExit(1)

LOGGER.info(f'All {len(xnames_to_power_off)} liquid-cooled chassis components reached powered off '
f'state according to CAPMC.')
f'state according to PCS.')


def do_cabinets_power_off(args):
Expand Down Expand Up @@ -181,9 +177,9 @@ def do_cabinets_power_off(args):
def do_cabinets_power_on(args):
"""Power on the liquid-cooled compute cabinets in the system.
Do not do this with a manual call to CAPMC. Instead, restart the
Do not do this with a manual call to PCS. Instead, restart the
hms-discovery cronjob in k8s, and let it do the power on for us. Then wait
for all the compute modules of type "Mountain" to be powered on in CAPMC.
for all the compute modules of type "Mountain" to be powered on in PCS.
Args:
args (argparse.Namespace): The parsed bootsys arguments.
Expand Down Expand Up @@ -221,11 +217,11 @@ def do_cabinets_power_on(args):
raise SystemExit(1)

# Once ComputeModules are powered on, it is possible to boot nodes with BOS.
# Suppress warnings about CAPMC state query errors because we expect the
# Suppress warnings about PCS state query errors because we expect the
# compute modules to be unreachable until they are powered on.
module_waiter = CAPMCPowerWaiter(xnames_to_power_on, 'on',
get_config_value('bootsys.discovery_timeout'),
suppress_warnings=True)
module_waiter = PCSPowerWaiter(xnames_to_power_on, 'on',
get_config_value('bootsys.discovery_timeout'),
suppress_warnings=True)
modules_timed_out = module_waiter.wait_for_completion()

if modules_timed_out:
Expand Down
4 changes: 2 additions & 2 deletions sat/cli/bootsys/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@


TIMEOUT_SPECS = [
TimeoutSpec('capmc', ['shutdown'], 120,
'components reach powered off state after they are shutdown with CAPMC.'),
TimeoutSpec('pcs', ['shutdown'], 120,
'components reach powered off state after they are shutdown with PCS.'),
TimeoutSpec('discovery', ['boot'], 600,
'compute modules reach the powered on state '
'after the HMS Discovery cronjob is resumed.'),
Expand Down
Loading

0 comments on commit 618f4a3

Please sign in to comment.