Skip to content

Commit

Permalink
Increasing NCN shutdown timeout and adding hard power-off handling
Browse files Browse the repository at this point in the history
IM:CRAYSAT-1513
Reviewer:Ryan
  • Loading branch information
Shivaprasad Ashok Metimath committed Oct 16, 2023
1 parent ae9ca0c commit af2d2b4
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 5 deletions.
30 changes: 26 additions & 4 deletions sat/cli/bootsys/mgmt_power.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

from sat.cli.bootsys.ipmi_console import IPMIConsoleLogger, ConsoleLoggingError
from sat.cli.bootsys.util import get_and_verify_ncn_groups, get_ssh_client, FatalBootsysError
from sat.cli.bootsys.util import hard_power_off
from sat.waiting import GroupWaiter, WaitingFailure
from sat.config import get_config_value
from sat.util import BeginEndLogger, get_username_and_password_interactively, prompt_continue
Expand Down Expand Up @@ -278,16 +279,37 @@ def do_mgmt_shutdown_power(ssh_client, username, password, excluded_ncns, ncn_sh
with IPMIConsoleLogger(other_ncns, username, password):
LOGGER.info(f'Sending shutdown command to other NCNs: {", ".join(other_ncns)}')
start_shutdown(other_ncns, ssh_client)

LOGGER.info(f'Waiting up to {ncn_shutdown_timeout} seconds for other NCNs to '
f'reach powered off state according to ipmitool: {", ".join(other_ncns)}.')
finish_shutdown(other_ncns, username, password,
ncn_shutdown_timeout, ipmi_timeout)
LOGGER.info('Shutdown and power off of all other NCNs complete.')
ipmi_waiter = IPMIPowerStateWaiter(other_ncns, 'off', ncn_shutdown_timeout, username, password)
pending_hosts = ipmi_waiter.wait_for_completion()

if pending_hosts:
prompt_msg = ('Some NCNs have not reached the "off" state. Do you want to proceed '
'with forcibly powering off these NCNs? (yes/no): ')
if hard_power_off(prompt_msg):
LOGGER.warning('Forcibly powering off nodes: %s', ', '.join(pending_hosts))

# Confirm all nodes have actually turned off.
failed_hosts = IPMIPowerStateWaiter(pending_hosts, 'off', ipmi_timeout, username, password,
send_command=True).wait_for_completion()

if failed_hosts:
LOGGER.error('The following nodes failed to reach powered '
'off state: %s', ', '.join(failed_hosts))
raise SystemExit(1)
else:
LOGGER.info('User chose not to proceed with forcible shutdown. Exiting gracefully.')
raise SystemExit(0)
else:
LOGGER.info('Shutdown and power off of all other NCNs complete.')
except ConsoleLoggingError as err:
LOGGER.error(f'Aborting shutdown of NCNs due failure to set up NCN console logging: {err}')
LOGGER.error(f'Aborting shutdown of NCNs due to failure to set up NCN console logging: {err}')
raise SystemExit(1)



def do_power_off_ncns(args):
"""Power off NCNs while monitoring consoles with ipmitool.
Expand Down
2 changes: 1 addition & 1 deletion sat/cli/bootsys/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
'compute and application nodes have completed their BOS shutdown.'),
TimeoutSpec('bos-boot', ['boot', 'reboot'], 900,
'compute and application nodes have completed their BOS boot.'),
TimeoutSpec('ncn-shutdown', ['shutdown'], 300,
TimeoutSpec('ncn-shutdown', ['shutdown'], 900,
'management NCNs have completed a graceful shutdown and have reached '
'the powered off state according to IPMI.'),
]
Expand Down
15 changes: 15 additions & 0 deletions sat/cli/bootsys/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,3 +229,18 @@ def get_ssh_client(host_keys=None):
ssh_client.set_missing_host_key_policy(WarningPolicy)

return ssh_client

def hard_power_off(prompt):
    """Ask the user a yes/no question and report whether they answered yes.

    The question is repeated until the user enters "yes" or "no". The answer
    is compared case-insensitively and surrounding whitespace is ignored.

    Args:
        prompt (str): The prompt message to display. A ' (yes/no): ' suffix
            is appended to it; if the message already ends with '(yes/no):'
            the suffix is not duplicated.

    Returns:
        bool: True if the user answered "yes", False if the user answered
            "no".
    """
    choices_suffix = '(yes/no):'

    # Callers may already spell out the choices at the end of their message.
    # Drop that trailing copy so the user does not see "(yes/no):" twice.
    trimmed = prompt.rstrip()
    if trimmed.lower().endswith(choices_suffix):
        prompt = trimmed[:-len(choices_suffix)].rstrip()

    while True:
        response = input(f'{prompt} (yes/no): ').strip().lower()
        if response in {'yes', 'no'}:
            # Return a bool rather than the raw string: both 'yes' and 'no'
            # are non-empty (truthy) strings, so `if hard_power_off(...)`
            # would otherwise always take the "yes" branch.
            return response == 'yes'
        print('Invalid response. Please enter "yes" or "no".')

0 comments on commit af2d2b4

Please sign in to comment.