Dev: implement ssh-agent support for bootstrap (jsc#PED-5774)
nicholasyang2022 committed Oct 9, 2023
1 parent feb9b1c commit ad3c90b
Showing 6 changed files with 355 additions and 99 deletions.
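Context for the change below: when the new use_ssh_agent context flag is set, bootstrap reuses a public key already loaded in the local ssh-agent (picked interactively from ssh_key.AgentClient().list()) instead of generating and swapping fresh key pairs. As a rough illustration of what listing agent keys amounts to, here is a minimal standalone sketch, not crmsh code; the helper name list_agent_public_keys and the use of "ssh-add -L" are assumptions made for illustration, while the commit itself goes through ssh_key.AgentClient.

# Minimal sketch (not crmsh code): list the public keys held by the local
# ssh-agent, roughly the role ssh_key.AgentClient.list() plays in this commit.
# The helper name and the use of "ssh-add -L" are assumptions.
import subprocess
import typing

def list_agent_public_keys() -> typing.List[str]:
    # "ssh-add -L" prints one OpenSSH public key per line for every identity
    # currently loaded in the agent; it exits non-zero when the agent is
    # unreachable or holds no identities.
    result = subprocess.run(
        ['ssh-add', '-L'], capture_output=True, text=True, check=False,
    )
    if result.returncode != 0:
        raise RuntimeError('no usable identities in ssh-agent')
    return [line for line in result.stdout.splitlines() if line.strip()]

if __name__ == '__main__':
    for i, key in enumerate(list_agent_public_keys()):
        print(i, key)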
161 changes: 108 additions & 53 deletions crmsh/bootstrap.py
@@ -143,7 +143,8 @@ def __init__(self):
COROSYNC_AUTH, "/var/lib/heartbeat/crm/*", "/var/lib/pacemaker/cib/*",
"/var/lib/corosync/*", "/var/lib/pacemaker/pengine/*", PCMK_REMOTE_AUTH,
"/var/lib/csync2/*", "~/.config/crm/*"]
self.ssh_key_file = None
self.use_ssh_agent = False
self.skip_ssh = False

@classmethod
def set_context(cls, options):
@@ -485,7 +486,7 @@ def is_online():
if not xmlutil.CrmMonXmlParser.is_node_online(cluster_node):
shutil.copy(COROSYNC_CONF_ORIG, corosync.conf())
sync_file(corosync.conf())
ServiceManager(sh.LocalOnlyClusterShell(sh.LocalShell())).stop_service("corosync")
ServiceManager(sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).stop_service("corosync")
print()
utils.fatal("Cannot see peer node \"{}\", please check the communication IP".format(cluster_node))
return True
@@ -833,11 +834,22 @@ def _parse_user_at_host(s: str, default_user: str) -> typing.Tuple[str, str]:

def init_ssh():
user_host_list = [_parse_user_at_host(x, _context.current_user) for x in _context.user_at_node_list]
init_ssh_impl(
_context.current_user,
ssh_key.KeyFile(_context.ssh_key_file) if _context.ssh_key_file is not None else None,
user_host_list,
)
if _context.use_ssh_agent:
try:
ssh_agent = ssh_key.AgentClient()
keys = ssh_agent.list()
key = keys[utils.ask_for_selection(
[key.public_key() for key in keys],
"Select a public key",
0,
_context.yes_to_all,
)]
except ssh_key.Error:
logger.error("Cannot get a public key from ssh-agent.")
raise
else:
key = None
init_ssh_impl(_context.current_user, key, user_host_list)
if user_host_list:
service_manager = ServiceManager()
for user, node in user_host_list:
@@ -852,8 +864,9 @@ def init_ssh_impl(local_user: str, ssh_public_key: typing.Optional[ssh_key.Key],
If user_node_list is not empty, those users and hosts will also be configured.
If ssh_public_key is specified, it will be added to authorized_keys; if not, a new key pair will be generated for each node.
"""
ServiceManager(sh.LocalOnlyClusterShell(sh.LocalShell())).start_service("sshd.service", enable=True)
shell = sh.SSHShell(sh.LocalShell(), local_user)
ServiceManager(sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).start_service("sshd.service", enable=True)
local_shell = sh.LocalShell()
shell = sh.SSHShell(local_shell, local_user)
authorized_key_manager = ssh_key.AuthorizedKeyManager(shell)
if ssh_public_key is not None:
# Use specified key. Do not generate new ones.
@@ -870,6 +883,22 @@ def init_ssh_impl(local_user: str, ssh_public_key: typing.Optional[ssh_key.Key],
authorized_key_manager.add(node, user, ssh_public_key)
else:
_init_ssh_on_remote_nodes(local_user, ssh_public_key, user_node_list)
user_by_host = utils.HostUserConfig()
for user, node in user_node_list:
user_by_host.add(user, node)
user_by_host.add(local_user, utils.this_node())
user_by_host.save_remote([node for user, node in user_node_list])
for user, node in user_node_list:
change_user_shell('hacluster', node)
# Starting from here, ClusterShell is available
shell = sh.ClusterShell(local_shell, UserOfHost.instance())
# FIXME: AuthorizedKeyManager cannot operate a user on remote host using root permission provided by ClusterShell
authorized_key_manager = ssh_key.AuthorizedKeyManager(shell)
_init_ssh_for_secondary_user_on_remote_nodes(
shell, authorized_key_manager,
[node for user, node in user_node_list],
'hacluster',
)


def _init_ssh_on_remote_nodes(
@@ -879,16 +908,13 @@ def _init_ssh_on_remote_nodes(
):
# Swap public ssh key between remote node and local
public_key_list = list()
hacluster_public_key_list = list()
for i, (remote_user, node) in enumerate(user_node_list):
utils.ssh_copy_id(local_user, remote_user, node)
# After this, login to remote_node is passwordless
public_key_list.append(swap_public_ssh_key(node, local_user, remote_user, local_user, remote_user, add=True))
hacluster_public_key_list.append(swap_public_ssh_key(node, 'hacluster', 'hacluster', local_user, remote_user, add=True))
if len(user_node_list) > 1:
shell = sh.LocalShell()
shell_script = _merge_authorized_keys(public_key_list)
hacluster_shell_script = _merge_authorized_keys(hacluster_public_key_list)
for i, (remote_user, node) in enumerate(user_node_list):
result = shell.su_subprocess_run(
local_user,
@@ -899,22 +925,25 @@
)
if result.returncode != 0:
utils.fatal('Failed to add public keys to {}@{}: {}'.format(remote_user, node, result.stdout))
result = shell.su_subprocess_run(
local_user,
'ssh {} {}@{} sudo -H -u {} /bin/sh'.format(constants.SSH_OPTION, remote_user, node, 'hacluster'),
input=hacluster_shell_script,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
)
if result.returncode != 0:
utils.fatal('Failed to add public keys to {}@{}: {}'.format(remote_user, node, result.stdout))
user_by_host = utils.HostUserConfig()
for user, node in user_node_list:
user_by_host.add(user, node)
user_by_host.add(local_user, utils.this_node())
user_by_host.save_remote([node for user, node in user_node_list])
for user, node in user_node_list:
change_user_shell('hacluster', node)


def _init_ssh_for_secondary_user_on_remote_nodes(
cluster_shell: sh.ClusterShell,
authorized_key_manager: ssh_key.AuthorizedKeyManager,
nodes: typing.Iterable[str],
user: str,
):
key_file_manager = ssh_key.KeyFileManager(cluster_shell)
local_keys = [ssh_key.KeyFile(path) for path in key_file_manager.list_public_key_for_user(None, user)]
assert local_keys
for node in nodes:
if not sh.SSHShell(cluster_shell.local_shell, user).can_run_as(node, user):
for key in local_keys:
authorized_key_manager.add(node, user, key)
remote_keys = key_file_manager.ensure_key_pair_exists_for_user(node, user)
for key in remote_keys:
authorized_key_manager.add(None, user, key)



def _merge_authorized_keys(keys: typing.List[str]) -> bytes:
@@ -1704,26 +1733,48 @@ def join_ssh(seed_host, seed_user):
"""
if not seed_host:
utils.fatal("No existing IP/hostname specified (use -c option)")

local_user = _context.current_user
ServiceManager(sh.LocalOnlyClusterShell(sh.LocalShell())).start_service("sshd.service", enable=True)
configure_ssh_key(local_user)
if 0 != utils.ssh_copy_id_no_raise(local_user, seed_user, seed_host):
msg = f"Failed to login to {seed_user}@{seed_host}. Please check the credentials."
sudoer = userdir.get_sudoer()
if sudoer and seed_user != sudoer:
args = ['sudo crm']
args += [x for x in sys.argv[1:]]
for i, arg in enumerate(args):
if arg == '-c' or arg == '--cluster-node' and i + 1 < len(args):
if '@' not in args[i+1]:
args[i + 1] = f'{sudoer}@{seed_host}'
msg += '\nOr, run "{}".'.format(' '.join(args))
raise ValueError(msg)
# After this, login to remote_node is passwordless
swap_public_ssh_key(seed_host, local_user, seed_user, local_user, seed_user, add=True)
configure_ssh_key('hacluster')
swap_public_ssh_key(seed_host, 'hacluster', 'hacluster', local_user, seed_user, add=True)

if _context.use_ssh_agent:
try:
ssh_agent = ssh_key.AgentClient()
keys = ssh_agent.list()
key = keys[utils.ask_for_selection(
[key.public_key() for key in keys],
"Select a public key",
0,
_context.yes_to_all,
)]
except ssh_key.Error:
logger.error("Cannot get a public key from ssh-agent.")
raise
else:
key = None
return join_ssh_impl(local_user, seed_host, seed_user, key, _context.skip_ssh)


def join_ssh_impl(local_user, seed_host, seed_user, key: typing.Optional[ssh_key.Key], skip_key_swap: bool):
ServiceManager(sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).start_service("sshd.service", enable=True)
if key is not None:
ssh_key.AuthorizedKeyManager(sh.SSHShell(sh.LocalShell(), local_user)).add(None, local_user, key)
elif not skip_key_swap:
configure_ssh_key(local_user)
if 0 != utils.ssh_copy_id_no_raise(local_user, seed_user, seed_host):
msg = f"Failed to login to {seed_user}@{seed_host}. Please check the credentials."
sudoer = userdir.get_sudoer()
if sudoer and seed_user != sudoer:
args = ['sudo crm']
args += [x for x in sys.argv[1:]]
for i, arg in enumerate(args):
if (arg == '-c' or arg == '--cluster-node') and i + 1 < len(args):
if '@' not in args[i+1]:
args[i + 1] = f'{sudoer}@{seed_host}'
msg += '\nOr, run "{}".'.format(' '.join(args))
raise ValueError(msg)
# After this, login to remote_node is passwordless
swap_public_ssh_key(seed_host, local_user, seed_user, local_user, seed_user, add=True)
configure_ssh_key('hacluster')
swap_public_ssh_key(seed_host, 'hacluster', 'hacluster', local_user, seed_user, add=True)

# This makes sure the seed host has its own SSH keys in its own
# authorized_keys file (again, to help with the case where the
Expand Down Expand Up @@ -2197,7 +2248,7 @@ def update_nodeid(nodeid, node=None):
if is_qdevice_configured:
start_qdevice_on_join_node(seed_host)
else:
ServiceManager(sh.LocalOnlyClusterShell(sh.LocalShell())).disable_service("corosync-qdevice.service")
ServiceManager(sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).disable_service("corosync-qdevice.service")


def adjust_priority_in_rsc_defaults(is_2node_wo_qdevice):
@@ -2246,7 +2297,7 @@ def start_qdevice_on_join_node(seed_host):
qnetd_addr = corosync.get_value("quorum.device.net.host")
qdevice_inst = qdevice.QDevice(qnetd_addr, cluster_node=seed_host)
qdevice_inst.certificate_process_on_join()
ServiceManager(sh.LocalOnlyClusterShell(sh.LocalShell())).start_service("corosync-qdevice.service", enable=True)
ServiceManager(sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).start_service("corosync-qdevice.service", enable=True)


def get_cluster_node_ip(node: str) -> str:
@@ -2402,7 +2453,7 @@ def bootstrap_init(context):
_context.cluster_node = args[1]

if stage and _context.cluster_is_running and \
not ServiceManager(shell=sh.LocalOnlyClusterShell(sh.LocalShell())).service_is_active(CSYNC2_SERVICE):
not ServiceManager(shell=sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).service_is_active(CSYNC2_SERVICE):
_context.skip_csync2 = True
_context.node_list_in_cluster = utils.list_cluster_nodes()
elif not _context.cluster_is_running:
@@ -2449,12 +2500,16 @@ def bootstrap_add(context):
options += '-i {} '.format(nic)
options = " {}".format(options.strip()) if options else ""

if context.use_ssh_agent:
options += ' --skip-ssh'

shell = sh.ClusterShell(sh.LocalShell(), UserOfHost.instance(), _context.use_ssh_agent)
for (user, node) in (_parse_user_at_host(x, _context.current_user) for x in _context.user_at_node_list):
print()
logger.info("Adding node {} to cluster".format(node))
cmd = 'crm cluster join -y {} -c {}@{}'.format(options, _context.current_user, utils.this_node())
logger.info("Running command on {}: {}".format(node, cmd))
out = sh.cluster_shell().get_stdout_or_raise_error(cmd, node)
out = shell.get_stdout_or_raise_error(cmd, node)
print(out)


@@ -2470,7 +2525,7 @@ def bootstrap_join(context):

check_tty()

corosync_active = ServiceManager(sh.LocalOnlyClusterShell(sh.LocalShell())).service_is_active("corosync.service")
corosync_active = ServiceManager(sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).service_is_active("corosync.service")
if corosync_active and _context.stage != "ssh":
utils.fatal("Abort: Cluster is currently active. Run this command on a node joining the cluster.")

@@ -2849,7 +2904,7 @@ def bootstrap_arbitrator(context):
utils.fatal("Failed to copy {} from {}".format(BOOTH_CFG, _context.cluster_node))
# TODO: verify that the arbitrator IP in the configuration is us?
logger.info("Enabling and starting the booth arbitrator service")
ServiceManager(sh.LocalOnlyClusterShell(sh.LocalShell())).start_service("booth@booth", enable=True)
ServiceManager(sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).start_service("booth@booth", enable=True)


def get_stonith_timeout_generally_expected():
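For reference, joining with an agent-provided key boils down to appending the selected public key to the invoking user's authorized_keys instead of creating and swapping a new key pair, which is roughly the effect of ssh_key.AuthorizedKeyManager(...).add(None, local_user, key) in join_ssh_impl above. A minimal standalone sketch follows; it is not crmsh code, and the helper name, paths, and permission handling are assumptions.

# Minimal sketch (not crmsh code): add a public key line to the current
# user's ~/.ssh/authorized_keys if it is not already present.  Helper name,
# paths, and permission handling are assumptions made for illustration.
import os

def add_to_local_authorized_keys(public_key: str) -> None:
    ssh_dir = os.path.expanduser('~/.ssh')
    os.makedirs(ssh_dir, mode=0o700, exist_ok=True)
    path = os.path.join(ssh_dir, 'authorized_keys')
    try:
        with open(path) as f:
            existing = {line.strip() for line in f}
    except FileNotFoundError:
        existing = set()
    if public_key.strip() not in existing:
        with open(path, 'a') as f:
            f.write(public_key.strip() + '\n')
        os.chmod(path, 0o600)

With the key authorized this way, the joining node does not need to generate keys of its own, which appears to be why bootstrap_add appends --skip-ssh to the remote "crm cluster join" invocation when use_ssh_agent is set.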