From 1d07322258ed72945e34c978b1ec33addb91cdbb Mon Sep 17 00:00:00 2001 From: Shivaprasad Ashok Metimath Date: Mon, 16 Oct 2023 12:23:14 +0000 Subject: [PATCH] Updating ncn shutdown timeout and hard power off handle IM:CRAYSAT-1513 Reviewer:Ryan --- CHANGELOG.md | 8 +++++ docs/man/sat-bootsys.8.rst | 2 +- sat/cli/bootsys/mgmt_power.py | 56 +++++++++++++---------------------- sat/cli/bootsys/util.py | 15 ---------- 4 files changed, 29 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40b76c66..61e96a96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Fixed +- Updated `sat bootsys` to increase the default management NCN shutdown timeout + to 900 seconds. +- Updated `sat bootsys` to include a prompt for input before proceeding with + hard power off of management NCNs after timeout. + ## [3.25.5] - 2023-10-12 ### Security diff --git a/docs/man/sat-bootsys.8.rst b/docs/man/sat-bootsys.8.rst index d1f628c2..353d1d1f 100644 --- a/docs/man/sat-bootsys.8.rst +++ b/docs/man/sat-bootsys.8.rst @@ -247,7 +247,7 @@ These options set the timeouts of various parts of the stages of the **--ncn-shutdown-timeout** *NCN_SHUTDOWN_TIMEOUT* Timeout, in seconds, to wait until management NCNs have completed a graceful shutdown and have reached the - powered off state according to IPMI. Defaults to 300. + powered off state according to IPMI. Defaults to 900. Overrides the option bootsys.ncn_shutdown_timeout in the config file. 
diff --git a/sat/cli/bootsys/mgmt_power.py b/sat/cli/bootsys/mgmt_power.py index 01123ccf..e3a6c5dc 100644 --- a/sat/cli/bootsys/mgmt_power.py +++ b/sat/cli/bootsys/mgmt_power.py @@ -1,7 +1,7 @@ # # MIT License # -# (C) Copyright 2020-2021 Hewlett Packard Enterprise Development LP +# (C) Copyright 2020-2021,2023 Hewlett Packard Enterprise Development LP # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -36,10 +36,9 @@ from sat.cli.bootsys.ipmi_console import IPMIConsoleLogger, ConsoleLoggingError from sat.cli.bootsys.util import get_and_verify_ncn_groups, get_ssh_client, FatalBootsysError -from sat.cli.bootsys.util import hard_power_off from sat.waiting import GroupWaiter, WaitingFailure from sat.config import get_config_value -from sat.util import BeginEndLogger, get_username_and_password_interactively, prompt_continue +from sat.util import BeginEndLogger, get_username_and_password_interactively, pester_choices, prompt_continue LOGGER = logging.getLogger(__name__) INF = inflect.engine() @@ -218,8 +217,8 @@ def finish_shutdown(hosts, username, password, ncn_shutdown_timeout, ipmi_timeou After start_shutdown is called, this checks that all the hosts have reached an IPMI "off" power state. If the shutdown has timed - out on a given host, an IPMI power off command is sent to hard - power off the host. + out on a given host, a prompt is shown to the user to decide whether + to proceed with hard power off. Args: hosts ([str]): a list of hostnames to power off. 
@@ -237,16 +236,23 @@ def finish_shutdown(hosts, username, password, ncn_shutdown_timeout, ipmi_timeou pending_hosts = ipmi_waiter.wait_for_completion() if pending_hosts: - LOGGER.warning('Forcibly powering off nodes: %s', ', '.join(pending_hosts)) + LOGGER.warning('The following nodes did not complete a graceful ' 'shutdown within the timeout: %s', ', '.join(pending_hosts)) # Confirm all nodes have actually turned off. - failed_hosts = IPMIPowerStateWaiter(pending_hosts, 'off', ipmi_timeout, username, password, - send_command=True).wait_for_completion() + prompt_message = 'Do you want to forcibly power off the nodes that timedout?' + if pester_choices(prompt_message, ('yes', 'no')) == 'yes': + LOGGER.info('Proceeding with hard power off.') - if failed_hosts: - LOGGER.error('The following nodes failed to reach powered ' - 'off state: %s', ', '.join(failed_hosts)) - sys.exit(1) + failed_hosts = IPMIPowerStateWaiter(pending_hosts, 'off', ipmi_timeout, username, password, + send_command=True).wait_for_completion() + + if failed_hosts: + LOGGER.error('The following nodes failed to reach powered ' + 'off state: %s', ', '.join(failed_hosts)) + sys.exit(1) + else: + LOGGER.info('User opted not to proceed with hard power off. 
Exiting.') + sys.exit(0) def do_mgmt_shutdown_power(ssh_client, username, password, excluded_ncns, ncn_shutdown_timeout, ipmi_timeout): @@ -279,37 +285,15 @@ def do_mgmt_shutdown_power(ssh_client, username, password, excluded_ncns, ncn_sh with IPMIConsoleLogger(other_ncns, username, password): LOGGER.info(f'Sending shutdown command to other NCNs: {", ".join(other_ncns)}') start_shutdown(other_ncns, ssh_client) - LOGGER.info(f'Waiting up to {ncn_shutdown_timeout} seconds for other NCNs to ' f'reach powered off state according to ipmitool: {", ".join(other_ncns)}.') - ipmi_waiter = IPMIPowerStateWaiter(other_ncns, 'off', ncn_shutdown_timeout, username, password) - pending_hosts = ipmi_waiter.wait_for_completion() - - if pending_hosts: - prompt_msg = ('Some NCNs have not reached the "off" state. Do you want to proceed ' - 'with forcibly powering off these NCNs? (yes/no): ') - if hard_power_off(prompt_msg): - LOGGER.warning('Forcibly powering off nodes: %s', ', '.join(pending_hosts)) - - # Confirm all nodes have actually turned off. - failed_hosts = IPMIPowerStateWaiter(pending_hosts, 'off', ipmi_timeout, username, password, - send_command=True).wait_for_completion() - - if failed_hosts: - LOGGER.error('The following nodes failed to reach powered ' - 'off state: %s', ', '.join(failed_hosts)) - raise SystemExit(1) - else: - LOGGER.info('User chose not to proceed with forcible shutdown. Exiting gracefully.') - raise SystemExit(0) - else: - LOGGER.info('Shutdown and power off of all other NCNs complete.') + finish_shutdown(other_ncns, username, password, ncn_shutdown_timeout, ipmi_timeout) + LOGGER.info('Shutdown of all other NCNs complete.') except ConsoleLoggingError as err: LOGGER.error(f'Aborting shutdown of NCNs due to failure to set up NCN console logging: {err}') raise SystemExit(1) - def do_power_off_ncns(args): """Power off NCNs while monitoring consoles with ipmitool. 
diff --git a/sat/cli/bootsys/util.py b/sat/cli/bootsys/util.py index b2f5bef0..e5bad5a4 100644 --- a/sat/cli/bootsys/util.py +++ b/sat/cli/bootsys/util.py @@ -229,18 +229,3 @@ def get_ssh_client(host_keys=None): ssh_client.set_missing_host_key_policy(WarningPolicy) return ssh_client - -def hard_power_off(prompt): - """Prompt the user for a yes or no response. - - Args: - prompt (str): The prompt message to display. - - Returns: - str: 'yes' or 'no' based on user input. - """ - while True: - response = input(f'{prompt} (yes/no): ').strip().lower() - if response in {'yes', 'no'}: - return response - print('Invalid response. Please enter "yes" or "no".') \ No newline at end of file