Skip to content

Commit

Permalink
Updating NCN shutdown timeout and hard power off handling
Browse files Browse the repository at this point in the history
IM:CRAYSAT-1513
Reviewer:Ryan
  • Loading branch information
Shivaprasad Ashok Metimath committed Oct 26, 2023
1 parent af2d2b4 commit 1d07322
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 52 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Fixed
- Updated `sat bootsys` to increase the default management NCN shutdown timeout
to 900 seconds.
- Updated `sat bootsys` to include a prompt for input before proceeding with
hard power off of management NCNs after timeout.

## [3.25.5] - 2023-10-12

### Security
Expand Down
2 changes: 1 addition & 1 deletion docs/man/sat-bootsys.8.rst
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ These options set the timeouts of various parts of the stages of the
**--ncn-shutdown-timeout** *NCN_SHUTDOWN_TIMEOUT*
Timeout, in seconds, to wait until management NCNs
have completed a graceful shutdown and have reached the
powered off state according to IPMI. Defaults to 300.
powered off state according to IPMI. Defaults to 900.
Overrides the option bootsys.ncn_shutdown_timeout in
the config file.

Expand Down
56 changes: 20 additions & 36 deletions sat/cli/bootsys/mgmt_power.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# MIT License
#
# (C) Copyright 2020-2021 Hewlett Packard Enterprise Development LP
# (C) Copyright 2020-2021,2023 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
Expand Down Expand Up @@ -36,10 +36,9 @@

from sat.cli.bootsys.ipmi_console import IPMIConsoleLogger, ConsoleLoggingError
from sat.cli.bootsys.util import get_and_verify_ncn_groups, get_ssh_client, FatalBootsysError
from sat.cli.bootsys.util import hard_power_off
from sat.waiting import GroupWaiter, WaitingFailure
from sat.config import get_config_value
from sat.util import BeginEndLogger, get_username_and_password_interactively, prompt_continue
from sat.util import BeginEndLogger, get_username_and_password_interactively, pester_choices, prompt_continue

LOGGER = logging.getLogger(__name__)
INF = inflect.engine()
Expand Down Expand Up @@ -218,8 +217,8 @@ def finish_shutdown(hosts, username, password, ncn_shutdown_timeout, ipmi_timeou
After start_shutdown is called, this checks that all the hosts
have reached an IPMI "off" power state. If the shutdown has timed
out on a given host, an IPMI power off command is sent to hard
power off the host.
out on a given host, a prompt is shown to the user to decide whether
to proceed with hard power off.
Args:
hosts ([str]): a list of hostnames to power off.
Expand All @@ -237,16 +236,23 @@ def finish_shutdown(hosts, username, password, ncn_shutdown_timeout, ipmi_timeou
pending_hosts = ipmi_waiter.wait_for_completion()

if pending_hosts:
LOGGER.warning('Forcibly powering off nodes: %s', ', '.join(pending_hosts))
LOGGER.warning('The following nodes did not complete a graceful ' 'shutdown within the timeout: %s', ', '.join(pending_hosts))

# Confirm all nodes have actually turned off.
failed_hosts = IPMIPowerStateWaiter(pending_hosts, 'off', ipmi_timeout, username, password,
send_command=True).wait_for_completion()
prompt_message = 'Do you want to forcibly power off the nodes that timedout?'
if pester_choices(prompt_message, ('yes', 'no')) == 'yes':
LOGGER.info('Proceeding with hard power off.')

if failed_hosts:
LOGGER.error('The following nodes failed to reach powered '
'off state: %s', ', '.join(failed_hosts))
sys.exit(1)
failed_hosts = IPMIPowerStateWaiter(pending_hosts, 'off', ipmi_timeout, username, password,
send_command=True).wait_for_completion()

if failed_hosts:
LOGGER.error('The following nodes failed to reach powered '
'off state: %s', ', '.join(failed_hosts))
sys.exit(1)
else:
LOGGER.info('User opted not to proceed with hard power off. Exiting.')
sys.exit(0)


def do_mgmt_shutdown_power(ssh_client, username, password, excluded_ncns, ncn_shutdown_timeout, ipmi_timeout):
Expand Down Expand Up @@ -279,37 +285,15 @@ def do_mgmt_shutdown_power(ssh_client, username, password, excluded_ncns, ncn_sh
with IPMIConsoleLogger(other_ncns, username, password):
LOGGER.info(f'Sending shutdown command to other NCNs: {", ".join(other_ncns)}')
start_shutdown(other_ncns, ssh_client)

LOGGER.info(f'Waiting up to {ncn_shutdown_timeout} seconds for other NCNs to '
f'reach powered off state according to ipmitool: {", ".join(other_ncns)}.')
ipmi_waiter = IPMIPowerStateWaiter(other_ncns, 'off', ncn_shutdown_timeout, username, password)
pending_hosts = ipmi_waiter.wait_for_completion()

if pending_hosts:
prompt_msg = ('Some NCNs have not reached the "off" state. Do you want to proceed '
'with forcibly powering off these NCNs? (yes/no): ')
if hard_power_off(prompt_msg):
LOGGER.warning('Forcibly powering off nodes: %s', ', '.join(pending_hosts))

# Confirm all nodes have actually turned off.
failed_hosts = IPMIPowerStateWaiter(pending_hosts, 'off', ipmi_timeout, username, password,
send_command=True).wait_for_completion()

if failed_hosts:
LOGGER.error('The following nodes failed to reach powered '
'off state: %s', ', '.join(failed_hosts))
raise SystemExit(1)
else:
LOGGER.info('User chose not to proceed with forcible shutdown. Exiting gracefully.')
raise SystemExit(0)
else:
LOGGER.info('Shutdown and power off of all other NCNs complete.')
finish_shutdown(other_ncns, username, password, ncn_shutdown_timeout, ipmi_timeout)
LOGGER.info('Shutdown of all other NCNs complete.')
except ConsoleLoggingError as err:
LOGGER.error(f'Aborting shutdown of NCNs due to failure to set up NCN console logging: {err}')
raise SystemExit(1)



def do_power_off_ncns(args):
"""Power off NCNs while monitoring consoles with ipmitool.
Expand Down
15 changes: 0 additions & 15 deletions sat/cli/bootsys/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,18 +229,3 @@ def get_ssh_client(host_keys=None):
ssh_client.set_missing_host_key_policy(WarningPolicy)

return ssh_client

def hard_power_off(prompt):
    """Ask the user a yes/no question, repeating until the answer is valid.

    The text ' (yes/no): ' is appended to the given prompt every time it is
    displayed. The answer is stripped of surrounding whitespace and
    lowercased before it is checked.

    Args:
        prompt (str): The prompt message to display.

    Returns:
        str: 'yes' or 'no' based on user input. Both values are non-empty
            strings and therefore truthy, so callers should compare the
            result against 'yes' rather than testing its truthiness.
    """
    valid_answers = ('yes', 'no')
    question = f'{prompt} (yes/no): '

    answer = input(question).strip().lower()
    while answer not in valid_answers:
        # Keep asking until one of the two accepted answers is entered.
        print('Invalid response. Please enter "yes" or "no".')
        answer = input(question).strip().lower()
    return answer

0 comments on commit 1d07322

Please sign in to comment.