Skip to content

Commit

Permalink
Dev: remove upgradeutil and add crm cluster health hawk2 [--fix]
Browse files Browse the repository at this point in the history
  • Loading branch information
liangxin1300 authored Sep 24, 2024
2 parents ca41b17 + 8bd4efc commit 685cf0a
Show file tree
Hide file tree
Showing 10 changed files with 101 additions and 82 deletions.
9 changes: 0 additions & 9 deletions crmsh/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
from lxml import etree

from . import config, constants, ssh_key, sh
from . import upgradeutil
from . import utils
from . import xmlutil
from .cibconfig import cib_factory
Expand Down Expand Up @@ -1403,10 +1402,6 @@ def init_sbd():
_context.sbd_manager.sbd_init()


def init_upgradeutil():
    """
    Force-set the local upgrade sequence.

    Thin wrapper that delegates to upgradeutil.force_set_local_upgrade_seq()
    so the step can be invoked by name like the other init_* functions.
    """
    upgradeutil.force_set_local_upgrade_seq()


def init_ocfs2():
"""
OCFS2 configure process
Expand Down Expand Up @@ -2155,7 +2150,6 @@ def corosync_stage_finished():
"corosync": (corosync_stage_finished, False),
"remote_auth": (init_remote_auth, True),
"sbd": (lambda: True, False),
"upgradeutil": (init_upgradeutil, True),
"cluster": (is_online, False)
}

Expand Down Expand Up @@ -2227,7 +2221,6 @@ def bootstrap_init(context):
init_corosync()
init_remote_auth()
init_sbd()
init_upgradeutil()

lock_inst = lock.Lock()
try:
Expand Down Expand Up @@ -2298,7 +2291,6 @@ def bootstrap_join(context):

if _context.stage != "":
remote_user, cluster_node = _parse_user_at_host(_context.cluster_node, _context.current_user)
init_upgradeutil()
check_stage_dependency(_context.stage)
globals()["join_" + _context.stage](cluster_node, remote_user)
else:
Expand All @@ -2314,7 +2306,6 @@ def bootstrap_join(context):
_context.cluster_node = cluster_user_at_node
_context.initialize_user()

init_upgradeutil()
remote_user, cluster_node = _parse_user_at_host(_context.cluster_node, _context.current_user)
utils.ping_node(cluster_node)
join_ssh(cluster_node, remote_user)
Expand Down
30 changes: 30 additions & 0 deletions crmsh/healthcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import crmsh.constants
import crmsh.parallax
import crmsh.user_of_host
import crmsh.utils


Expand Down Expand Up @@ -164,6 +165,35 @@ def fix_cluster(self, nodes: typing.Iterable[str], ask: typing.Callable[[str], N
crmsh.bootstrap.change_user_shell('hacluster', node)


class PasswordlessPrimaryUserAuthenticationFeature(Feature):
    """
    Health-check feature for passwordless ssh between cluster nodes.

    The checks rely on crmsh.utils.user_of() being able to resolve a user
    for each node and on a trivial remote command succeeding through
    crmsh.parallax. This feature cannot repair itself: both fix methods
    only point the operator at the bootstrap ssh stages and raise
    FixFailure.
    """

    def check_quick(self) -> bool:
        """Return True if a user can be resolved for the local node."""
        try:
            crmsh.utils.user_of(crmsh.utils.this_node())
        except crmsh.user_of_host.UserNotFoundError:
            return False
        return True

    def check_local(self, nodes: typing.Iterable[str]) -> bool:
        """
        Return True if a user resolves for every node and running `true`
        on all of them through parallax does not raise ValueError.
        """
        # `nodes` is only promised to be an Iterable and is consumed twice
        # below (the loop, then parallax_call). Snapshot it so a one-shot
        # iterator is not handed to parallax already exhausted.
        nodes = list(nodes)
        try:
            for node in nodes:
                crmsh.utils.user_of(node)
        except crmsh.user_of_host.UserNotFoundError:
            return False
        try:
            crmsh.parallax.parallax_call(nodes, 'true')
        except ValueError:
            # A ValueError from the remote call is treated as "ssh not working".
            return False
        return True

    def fix_local(self, nodes: typing.Iterable[str], ask: typing.Callable[[str], None]) -> None:
        """
        Always fail: log how to set up passwordless ssh manually and raise
        FixFailure. `nodes` and `ask` are accepted but not used.
        """
        logger.warning('Passwordless ssh is not initialized. Use `crm cluster init ssh` and `crm cluster join ssh -c <init-node>` to set it up.')
        raise FixFailure

    def fix_cluster(self, nodes: typing.Iterable[str], ask: typing.Callable[[str], None]) -> None:
        """Same as fix_local(): there is no automatic cluster-wide fix."""
        return self.fix_local(nodes, ask)


def main_check_local(args) -> int:
try:
feature = Feature.get_feature_by_name(args.feature)()
Expand Down
2 changes: 0 additions & 2 deletions crmsh/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from . import constants
from . import clidisplay
from . import term
from . import upgradeutil
from . import utils
from . import userdir

Expand Down Expand Up @@ -367,7 +366,6 @@ def run():
if options.profile:
return profile_run(context, user_args)
else:
upgradeutil.upgrade_if_needed()
return main_input_loop(context, user_args)
except utils.NoSSHError as msg:
logger.error('%s', msg)
Expand Down
7 changes: 6 additions & 1 deletion crmsh/report/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import crmsh.sh
import crmsh.report.sh
import crmsh.user_of_host
from crmsh import utils as crmutils
from crmsh import utils as crmutils, userdir
from crmsh import constants as crmconstants
from crmsh import config, log, tmpfiles, ui_cluster
from crmsh.sh import ShellUtils
Expand Down Expand Up @@ -409,6 +409,11 @@ def find_ssh_user(context: Context) -> None:
logger.warning('%s', buf.getvalue())
else:
logger.warning("passwordless ssh to node %s does not work", n)
if not crmutils.can_ask():
logger.error('Cannot create a report non-interactively. Interactive authentication is required.')
if userdir.getuser() == 'hacluster':
logger.warning('Passwordless ssh does not work. Run "crm cluster health hawk2 --fix" to set it up.')
raise ValueError('Cannot create a report.')


def load_from_crmsh_config(context: Context) -> None:
Expand Down
18 changes: 16 additions & 2 deletions crmsh/scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
except ImportError:
import simplejson as json

from . import config, constants
from . import config, constants, user_of_host
from . import handles
from . import options
from . import userdir
Expand Down Expand Up @@ -1639,6 +1639,16 @@ def _copy_utils(dst):
raise ValueError(e)


def _check_parallax_remote_available(printer, hosts):
    """
    Probe the remote hosts by running `true` on them via _parallax_call
    with a 15 second timeout.

    A user_of_host.UserNotFoundError from the probe is re-raised as a
    ValueError (with the original exception context suppressed). When the
    current user is 'hacluster', the message also names the command that
    sets passwordless ssh up.
    """
    try:
        _parallax_call(printer, hosts, 'true', timeout_seconds=15)
    except user_of_host.UserNotFoundError:
        running_as_hacluster = userdir.getuser() == 'hacluster'
        if running_as_hacluster:
            message = 'Passwordless ssh does not work. Run "crm cluster health hawk2 --fix" to set it up.'
        else:
            message = 'Passwordless ssh does not work.'
        raise ValueError(message) from None


def _create_remote_workdirs(printer, hosts, path, timeout_seconds):
"Create workdirs on remote hosts"
ok = True
Expand Down Expand Up @@ -1779,6 +1789,7 @@ def prepare(self, has_remote_actions):
json.dump(self.data, open(self.statefile, 'w'))
_copy_utils(self.workdir)
if has_remote_actions:
_check_parallax_remote_available(self.printer, self.hosts)
_create_remote_workdirs(self.printer, self.hosts, self.workdir, self.timeout_seconds)
_copy_to_remote_dirs(self.printer, self.hosts, self.workdir, self.timeout_seconds)
# make sure all path references are relative to the script directory
Expand Down Expand Up @@ -2106,7 +2117,10 @@ def run(script, params, printer):
finally:
if not dry_run:
if not config.core.debug:
_run_cleanup(printer, has_remote_actions, local_node, hosts, workdir, int(params['timeout']))
try:
_run_cleanup(printer, has_remote_actions, local_node, hosts, workdir, int(params['timeout']))
except user_of_host.UserNotFoundError:
pass
elif has_remote_actions:
_print_debug(printer, local_node, hosts, workdir, int(params['timeout']))
else:
Expand Down
38 changes: 37 additions & 1 deletion crmsh/ui_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from argparse import ArgumentParser, RawDescriptionHelpFormatter

import crmsh.parallax
from . import command, sh
from . import command, sh, healthcheck
from . import utils
from . import scripts
from . import completers as compl
Expand Down Expand Up @@ -751,6 +751,42 @@ def do_health(self, context, *args):
'''
Extensive health check.
'''
if not args:
return self._do_health_legacy(context, *args)
parser = argparse.ArgumentParser()
parser.add_argument('component', choices=['hawk2'])
parser.add_argument('-f', '--fix', action='store_true')
parsed_args = parser.parse_args(args)
if parsed_args.component == 'hawk2':
nodes = utils.list_cluster_nodes()
if parsed_args.fix:
if not healthcheck.feature_full_check(healthcheck.PasswordlessPrimaryUserAuthenticationFeature(), nodes):
try:
healthcheck.feature_fix(
healthcheck.PasswordlessPrimaryUserAuthenticationFeature(),
nodes,
utils.ask,
)
except healthcheck.FixFailure:
logger.error('Cannot fix automatically.')
return False
try:
healthcheck.feature_fix(healthcheck.PasswordlessHaclusterAuthenticationFeature(), nodes, utils.ask)
logger.info("hawk2: passwordless ssh authentication: OK.")
return True
except healthcheck.FixFailure:
logger.error("hawk2: passwordless ssh authentication: FAIL.")
return False
else:
if healthcheck.feature_full_check(healthcheck.PasswordlessHaclusterAuthenticationFeature(), nodes):
logger.info("hawk2: passwordless ssh authentication: OK.")
return True
else:
logger.error("hawk2: passwordless ssh authentication: FAIL.")
logger.warning('Please run "crm cluster health hawk2 --fix"')
return False

def _do_health_legacy(self, context, *args):
params = self._args_implicit(context, args, 'nodes')
script = scripts.load_script('health')
if script is None:
Expand Down
2 changes: 1 addition & 1 deletion crmsh/user_of_host.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def user_pair_for_ssh(self, host: str) -> typing.Tuple[str, str]:
else:
ret = self._guess_user_for_ssh(host)
if ret is None:
raise UserNotFoundError
raise UserNotFoundError from None
else:
self._user_pair_cache[host] = ret
return ret
Expand Down
32 changes: 1 addition & 31 deletions test/features/bootstrap_bugs.feature
Original file line number Diff line number Diff line change
Expand Up @@ -231,20 +231,10 @@ Feature: Regression test for bootstrap bugs
Given Cluster service is "stopped" on "hanode1"
And Cluster service is "stopped" on "hanode2"
When Run "crm cluster init -y" on "hanode1"
And Run "rm -f /root/.ssh/id_rsa.pub" on "hanode1"
Then Cluster service is "started" on "hanode1"
When Run "crm cluster join -c hanode1 -y" on "hanode2"
Then Cluster service is "started" on "hanode2"
When Run "rm -f /root/.ssh/id_rsa.pub" on "hanode1"
When Run "rm -f /root/.ssh/id_rsa.pub" on "hanode2"
When Run "rm -f /var/lib/crmsh/upgrade_seq" on "hanode1"
When Run "rm -f /var/lib/crmsh/upgrade_seq" on "hanode2"
When Run "rm -rf /var/lib/heartbeat/cores/hacluster/.ssh" on "hanode1"
And Run "rm -rf /var/lib/heartbeat/cores/hacluster/.ssh" on "hanode2"
And Run "usermod -s /usr/sbin/nologin hacluster" on "hanode1"
And Run "usermod -s /usr/sbin/nologin hacluster" on "hanode2"
And Run "crm status" on "hanode1"
Then Check user shell for hacluster between "hanode1 hanode2"
Then Check passwordless for hacluster between "hanode1 hanode2"

@skip_non_root
@clean
Expand Down Expand Up @@ -277,26 +267,6 @@ Feature: Regression test for bootstrap bugs
And Expected "hacluster:haclient" in stdout
And Run "stat -c '%U:%G' ~hacluster/.ssh/authorized_keys" OK on "hanode2"
And Expected "hacluster:haclient" in stdout
# in an upgraded cluster in which ~hacluster/.ssh/authorized_keys exists
When Run "chown root:root ~hacluster/.ssh/authorized_keys && chmod 0600 ~hacluster/.ssh/authorized_keys" on "hanode1"
And Run "chown root:root ~hacluster/.ssh/authorized_keys && chmod 0600 ~hacluster/.ssh/authorized_keys" on "hanode2"
And Run "rm -f /var/lib/crmsh/upgrade_seq" on "hanode1"
And Run "rm -f /var/lib/crmsh/upgrade_seq" on "hanode2"
And Run "crm status" on "hanode1"
Then Run "stat -c '%U:%G' ~hacluster/.ssh/authorized_keys" OK on "hanode1"
And Expected "hacluster:haclient" in stdout
Then Run "stat -c '%U:%G' ~hacluster/.ssh/authorized_keys" OK on "hanode2"
And Expected "hacluster:haclient" in stdout
# in an upgraded cluster in which ~hacluster/.ssh/authorized_keys does not exist
When Run "rm -rf /var/lib/heartbeat/cores/hacluster/.ssh/" on "hanode1"
And Run "rm -rf /var/lib/heartbeat/cores/hacluster/.ssh/" on "hanode2"
And Run "rm -f /var/lib/crmsh/upgrade_seq" on "hanode1"
And Run "rm -f /var/lib/crmsh/upgrade_seq" on "hanode2"
And Run "crm status" on "hanode1"
Then Run "stat -c '%U:%G' ~hacluster/.ssh/authorized_keys" OK on "hanode1"
And Expected "hacluster:haclient" in stdout
Then Run "stat -c '%U:%G' ~hacluster/.ssh/authorized_keys" OK on "hanode2"
And Expected "hacluster:haclient" in stdout

@clean
Scenario: Ditch no-quorum-policy=ignore when joining
Expand Down
18 changes: 0 additions & 18 deletions test/features/bootstrap_init_join_remove.feature
Original file line number Diff line number Diff line change
Expand Up @@ -184,21 +184,3 @@ Feature: crmsh bootstrap process - init, join and remove
Then Cluster service is "started" on "hanode3"
And Online nodes are "hanode1 hanode2 hanode3"
And Check passwordless for hacluster between "hanode1 hanode2 hanode3"

Scenario: Check hacluster's user shell
Given Cluster service is "stopped" on "hanode3"
When Run "crm cluster join -c hanode1 -y" on "hanode3"
Then Cluster service is "started" on "hanode3"
And Online nodes are "hanode1 hanode2 hanode3"
When Run "rm -rf /var/lib/heartbeat/cores/hacluster/.ssh" on "hanode1"
And Run "rm -rf /var/lib/heartbeat/cores/hacluster/.ssh" on "hanode2"
And Run "rm -rf /var/lib/heartbeat/cores/hacluster/.ssh" on "hanode3"
And Run "usermod -s /usr/sbin/nologin hacluster" on "hanode1"
And Run "usermod -s /usr/sbin/nologin hacluster" on "hanode2"
And Run "usermod -s /usr/sbin/nologin hacluster" on "hanode3"
And Run "rm -f /var/lib/crmsh/upgrade_seq" on "hanode1"
And Run "rm -f /var/lib/crmsh/upgrade_seq" on "hanode2"
And Run "rm -f /var/lib/crmsh/upgrade_seq" on "hanode3"
And Run "crm status" on "hanode1"
Then Check user shell for hacluster between "hanode1 hanode2 hanode3"
Then Check passwordless for hacluster between "hanode1 hanode2 hanode3"
27 changes: 10 additions & 17 deletions test/features/healthcheck.feature
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,15 @@ Feature: healthcheck detect and fix problems in a crmsh deployment
And Online nodes are "hanode1 hanode2"
And Show cluster status on "hanode1"

@clean
Scenario: a new node joins when directory ~hacluster/.ssh is removed from cluster
When Run "rm -rf ~hacluster/.ssh" on "hanode1"
And Run "rm -rf ~hacluster/.ssh" on "hanode2"
And Run "crm cluster join -c hanode1 -y" on "hanode3"
Then Cluster service is "started" on "hanode3"
# FIXME: the new join implementation no longer triggers an exception, so the auto fix is not applied
# And File "~hacluster/.ssh/id_rsa" exists on "hanode1"
# And File "~hacluster/.ssh/id_rsa" exists on "hanode2"
# And File "~hacluster/.ssh/id_rsa" exists on "hanode3"

# skip non-root as behave_agent is not able to run commands interactively with non-root sudoer
@skip_non_root
@clean
Scenario: An upgrade_seq file in ~hacluster/crmsh/ will be migrated to /var/lib/crmsh (bsc#1213050)
When Run "mv /var/lib/crmsh ~hacluster/" on "hanode1"
Then File "~hacluster/crmsh/upgrade_seq" exists on "hanode1"
When Run "crm cluster status" on "hanode1"
Then File "/var/lib/crmsh/upgrade_seq" exists on "hanode1"
When Run "rm -rf ~hacluster/.ssh" on "hanode1"
And Try "crm cluster health hawk2" on "hanode1"
Then Expected "hawk2: passwordless ssh authentication: FAIL." in stderr
When Run "crm cluster health hawk2 --fix" on "hanode1"
Then Expected "hawk2: passwordless ssh authentication: OK." in stdout
When Run "rm -rf ~hacluster/.ssh /root/.config/crm" on "hanode1"
And Try "crm cluster health hawk2" on "hanode1"
Then Expected "hawk2: passwordless ssh authentication: FAIL." in stderr
When Try "crm cluster health hawk2 --fix" on "hanode1"
Then Expected "Cannot fix automatically" in stderr

0 comments on commit 685cf0a

Please sign in to comment.