From 98ee57a1b75f652c7fcddee9246447294916645a Mon Sep 17 00:00:00 2001 From: Shivaprasad Ashok Metimath Date: Mon, 16 Oct 2023 12:00:45 +0000 Subject: [PATCH] Increasing ncn shutdown timeout and hard power off handle IM:CRAYSAT-1513 Reviewer:Ryan Updating ncn shutdown timeout and hard power off handle IM:CRAYSAT-1513 Reviewer:Ryan --- CHANGELOG.md | 8 ++++++++ docs/man/sat-bootsys.8.rst | 2 +- sat/cli/bootsys/mgmt_power.py | 37 +++++++++++++++++++++-------------- sat/cli/bootsys/parser.py | 2 +- 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40b76c66..61e96a96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Fixed +- Updated `sat bootsys` to increase the default management NCN shutdown timeout + to 900 seconds. +- Updated `sat bootsys` to include a prompt for input before proceeding with + hard power off of management NCNs after timeout. + ## [3.25.5] - 2023-10-12 ### Security diff --git a/docs/man/sat-bootsys.8.rst b/docs/man/sat-bootsys.8.rst index d1f628c2..353d1d1f 100644 --- a/docs/man/sat-bootsys.8.rst +++ b/docs/man/sat-bootsys.8.rst @@ -247,7 +247,7 @@ These options set the timeouts of various parts of the stages of the **--ncn-shutdown-timeout** *NCN_SHUTDOWN_TIMEOUT* Timeout, in seconds, to wait until management NCNs have completed a graceful shutdown and have reached the - powered off state according to IPMI. Defaults to 300. + powered off state according to IPMI. Defaults to 900. Overrides the option bootsys.ncn_shutdown_timeout in the config file. 
diff --git a/sat/cli/bootsys/mgmt_power.py b/sat/cli/bootsys/mgmt_power.py index ba956f85..a76465a4 100644 --- a/sat/cli/bootsys/mgmt_power.py +++ b/sat/cli/bootsys/mgmt_power.py @@ -1,7 +1,7 @@ # # MIT License # -# (C) Copyright 2020-2021 Hewlett Packard Enterprise Development LP +# (C) Copyright 2020-2021,2023 Hewlett Packard Enterprise Development LP # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -38,7 +38,7 @@ from sat.cli.bootsys.util import get_and_verify_ncn_groups, get_ssh_client, FatalBootsysError from sat.waiting import GroupWaiter, WaitingFailure from sat.config import get_config_value -from sat.util import BeginEndLogger, get_username_and_password_interactively, prompt_continue +from sat.util import BeginEndLogger, get_username_and_password_interactively, pester_choices, prompt_continue LOGGER = logging.getLogger(__name__) INF = inflect.engine() @@ -217,8 +217,8 @@ def finish_shutdown(hosts, username, password, ncn_shutdown_timeout, ipmi_timeou After start_shutdown is called, this checks that all the hosts have reached an IPMI "off" power state. If the shutdown has timed - out on a given host, an IPMI power off command is sent to hard - power off the host. + out on a given host, a prompt is shown to the user to decide whether + to proceed with hard power off. Args: hosts ([str]): a list of hostnames to power off. @@ -236,16 +236,24 @@ def finish_shutdown(hosts, username, password, ncn_shutdown_timeout, ipmi_timeou pending_hosts = ipmi_waiter.wait_for_completion() if pending_hosts: - LOGGER.warning('Forcibly powering off nodes: %s', ', '.join(pending_hosts)) + LOGGER.warning('The following nodes did not complete a graceful ' + 'shutdown within the timeout: %s', ', '.join(pending_hosts)) # Confirm all nodes have actually turned off. 
- failed_hosts = IPMIPowerStateWaiter(pending_hosts, 'off', ipmi_timeout, username, password, - send_command=True).wait_for_completion() + prompt_message = 'Do you want to forcibly power off the nodes that timedout?' + if pester_choices(prompt_message, ('yes', 'no')) == 'yes': + LOGGER.info('Proceeding with hard power off.') - if failed_hosts: - LOGGER.error('The following nodes failed to reach powered ' - 'off state: %s', ', '.join(failed_hosts)) - sys.exit(1) + failed_hosts = IPMIPowerStateWaiter(pending_hosts, 'off', ipmi_timeout, username, password, + send_command=True).wait_for_completion() + + if failed_hosts: + LOGGER.error('The following nodes failed to reach powered ' + 'off state: %s', ', '.join(failed_hosts)) + sys.exit(1) + else: + LOGGER.info('User opted not to proceed with hard power off. Exiting.') + sys.exit(0) def do_mgmt_shutdown_power(ssh_client, username, password, excluded_ncns, ncn_shutdown_timeout, ipmi_timeout): @@ -280,11 +288,10 @@ def do_mgmt_shutdown_power(ssh_client, username, password, excluded_ncns, ncn_sh start_shutdown(other_ncns, ssh_client) LOGGER.info(f'Waiting up to {ncn_shutdown_timeout} seconds for other NCNs to ' f'reach powered off state according to ipmitool: {", ".join(other_ncns)}.') - finish_shutdown(other_ncns, username, password, - ncn_shutdown_timeout, ipmi_timeout) - LOGGER.info('Shutdown and power off of all other NCNs complete.') + finish_shutdown(other_ncns, username, password, ncn_shutdown_timeout, ipmi_timeout) + LOGGER.info('Shutdown of all other NCNs complete.') except ConsoleLoggingError as err: - LOGGER.error(f'Aborting shutdown of NCNs due failure to set up NCN console logging: {err}') + LOGGER.error(f'Aborting shutdown of NCNs due to failure to set up NCN console logging: {err}') raise SystemExit(1) diff --git a/sat/cli/bootsys/parser.py b/sat/cli/bootsys/parser.py index ae55994d..19e24f42 100644 --- a/sat/cli/bootsys/parser.py +++ b/sat/cli/bootsys/parser.py @@ -55,7 +55,7 @@ 'compute and application 
nodes have completed their BOS shutdown.'), TimeoutSpec('bos-boot', ['boot', 'reboot'], 900, 'compute and application nodes have completed their BOS boot.'), - TimeoutSpec('ncn-shutdown', ['shutdown'], 300, + TimeoutSpec('ncn-shutdown', ['shutdown'], 900, 'management NCNs have completed a graceful shutdown and have reached ' 'the powered off state according to IPMI.'), ]