Skip to content

Commit

Permalink
Increasing ncn shutdown timeout and updating hard power off handling
Browse files Browse the repository at this point in the history
IM:CRAYSAT-1513
Reviewer:Ryan

Updating ncn shutdown timeout and hard power off handling

IM:CRAYSAT-1513
Reviewer:Ryan
  • Loading branch information
Shivaprasad Ashok Metimath committed Oct 27, 2023
1 parent ae9ca0c commit 98ee57a
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 17 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Fixed
- Updated `sat bootsys` to increase the default management NCN shutdown timeout
to 900 seconds.
- Updated `sat bootsys` to include a prompt for input before proceeding with
hard power off of management NCNs after timeout.

## [3.25.5] - 2023-10-12

### Security
Expand Down
2 changes: 1 addition & 1 deletion docs/man/sat-bootsys.8.rst
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ These options set the timeouts of various parts of the stages of the
**--ncn-shutdown-timeout** *NCN_SHUTDOWN_TIMEOUT*
Timeout, in seconds, to wait until management NCNs
have completed a graceful shutdown and have reached the
powered off state according to IPMI. Defaults to 300.
powered off state according to IPMI. Defaults to 900.
Overrides the option bootsys.ncn_shutdown_timeout in
the config file.

Expand Down
37 changes: 22 additions & 15 deletions sat/cli/bootsys/mgmt_power.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# MIT License
#
# (C) Copyright 2020-2021 Hewlett Packard Enterprise Development LP
# (C) Copyright 2020-2021,2023 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
Expand Down Expand Up @@ -38,7 +38,7 @@
from sat.cli.bootsys.util import get_and_verify_ncn_groups, get_ssh_client, FatalBootsysError
from sat.waiting import GroupWaiter, WaitingFailure
from sat.config import get_config_value
from sat.util import BeginEndLogger, get_username_and_password_interactively, prompt_continue
from sat.util import BeginEndLogger, get_username_and_password_interactively, pester_choices, prompt_continue

LOGGER = logging.getLogger(__name__)
INF = inflect.engine()
Expand Down Expand Up @@ -217,8 +217,8 @@ def finish_shutdown(hosts, username, password, ncn_shutdown_timeout, ipmi_timeou
After start_shutdown is called, this checks that all the hosts
have reached an IPMI "off" power state. If the shutdown has timed
out on a given host, an IPMI power off command is sent to hard
power off the host.
out on a given host, a prompt is shown to the user to decide whether
to proceed with hard power off.
Args:
hosts ([str]): a list of hostnames to power off.
Expand All @@ -236,16 +236,24 @@ def finish_shutdown(hosts, username, password, ncn_shutdown_timeout, ipmi_timeou
pending_hosts = ipmi_waiter.wait_for_completion()

if pending_hosts:
LOGGER.warning('Forcibly powering off nodes: %s', ', '.join(pending_hosts))
LOGGER.warning('The following nodes did not complete a graceful '
'shutdown within the timeout: %s', ', '.join(pending_hosts))

# Confirm all nodes have actually turned off.
failed_hosts = IPMIPowerStateWaiter(pending_hosts, 'off', ipmi_timeout, username, password,
send_command=True).wait_for_completion()
prompt_message = 'Do you want to forcibly power off the nodes that timedout?'
if pester_choices(prompt_message, ('yes', 'no')) == 'yes':
LOGGER.info('Proceeding with hard power off.')

if failed_hosts:
LOGGER.error('The following nodes failed to reach powered '
'off state: %s', ', '.join(failed_hosts))
sys.exit(1)
failed_hosts = IPMIPowerStateWaiter(pending_hosts, 'off', ipmi_timeout, username, password,
send_command=True).wait_for_completion()

if failed_hosts:
LOGGER.error('The following nodes failed to reach powered '
'off state: %s', ', '.join(failed_hosts))
sys.exit(1)
else:
LOGGER.info('User opted not to proceed with hard power off. Exiting.')
sys.exit(0)


def do_mgmt_shutdown_power(ssh_client, username, password, excluded_ncns, ncn_shutdown_timeout, ipmi_timeout):
Expand Down Expand Up @@ -280,11 +288,10 @@ def do_mgmt_shutdown_power(ssh_client, username, password, excluded_ncns, ncn_sh
start_shutdown(other_ncns, ssh_client)
LOGGER.info(f'Waiting up to {ncn_shutdown_timeout} seconds for other NCNs to '
f'reach powered off state according to ipmitool: {", ".join(other_ncns)}.')
finish_shutdown(other_ncns, username, password,
ncn_shutdown_timeout, ipmi_timeout)
LOGGER.info('Shutdown and power off of all other NCNs complete.')
finish_shutdown(other_ncns, username, password, ncn_shutdown_timeout, ipmi_timeout)
LOGGER.info('Shutdown of all other NCNs complete.')
except ConsoleLoggingError as err:
LOGGER.error(f'Aborting shutdown of NCNs due failure to set up NCN console logging: {err}')
LOGGER.error(f'Aborting shutdown of NCNs due to failure to set up NCN console logging: {err}')
raise SystemExit(1)


Expand Down
2 changes: 1 addition & 1 deletion sat/cli/bootsys/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
'compute and application nodes have completed their BOS shutdown.'),
TimeoutSpec('bos-boot', ['boot', 'reboot'], 900,
'compute and application nodes have completed their BOS boot.'),
TimeoutSpec('ncn-shutdown', ['shutdown'], 300,
TimeoutSpec('ncn-shutdown', ['shutdown'], 900,
'management NCNs have completed a graceful shutdown and have reached '
'the powered off state according to IPMI.'),
]
Expand Down

0 comments on commit 98ee57a

Please sign in to comment.