diff --git a/crmsh/ui_cluster.py b/crmsh/ui_cluster.py index 654f44ae2f..23340695f5 100644 --- a/crmsh/ui_cluster.py +++ b/crmsh/ui_cluster.py @@ -172,40 +172,73 @@ def do_start(self, context, *args): for node in node_list: logger.info("Cluster services started on {}".format(node)) + @staticmethod + def _node_ready_to_stop_cluster_service(node): + """ + Check if the specific node is ready to stop cluster service + + If both corosync.service and pacemaker.service is active, return True + If some services started, stop them first and return False + """ + corosync_active = utils.service_is_active("corosync.service", remote_addr=node) + sbd_active = utils.service_is_active("sbd.service", remote_addr=node) + pacemaker_active = utils.service_is_active("pacemaker.service", remote_addr=node) + + if not corosync_active: + if sbd_active: + utils.stop_service("corosync", remote_addr=node) + logger.info(f"The cluster stack stopped on {node}") + else: + logger.info(f"The cluster stack already stopped on {node}") + return False + + elif not pacemaker_active: + utils.stop_service("corosync", remote_addr=node) + logger.info("The cluster stack stopped on {}".format(node)) + return False + + return True + + @staticmethod + def _wait_for_dc(node=None): + """ + Wait for the cluster's DC to become available + """ + if not utils.service_is_active("pacemaker.service", remote_addr=node): + return + + dc_deadtime = utils.get_property("dc-deadtime", peer=node) or str(constants.DC_DEADTIME_DEFAULT) + dc_timeout = int(dc_deadtime.strip('s')) + 5 + try: + utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout, peer=node) + except TimeoutError: + logger.error("No DC found currently, please wait if the cluster is still starting") + raise utils.TerminateSubCommand + + @staticmethod + def _set_dlm(node=None): + """ + When dlm running and quorum is lost, before stop cluster service, should set + enable_quorum_fencing=0, enable_quorum_lockspace=0 for dlm config option + """ + if utils.is_dlm_running(node) and not utils.is_quorate(node): + logger.debug("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm") + utils.set_dlm_option(peer=node, enable_quorum_fencing=0, enable_quorum_lockspace=0) + @command.skill_level('administrator') def do_stop(self, context, *args): ''' Stops the cluster services on all nodes or specific node(s) ''' node_list = parse_option_for_nodes(context, *args) - for node in node_list[:]: - if not utils.service_is_active("corosync.service", remote_addr=node): - if utils.service_is_active("sbd.service", remote_addr=node): - utils.stop_service("corosync", remote_addr=node) - logger.info("Cluster services stopped on {}".format(node)) - else: - logger.info("Cluster services already stopped on {}".format(node)) - node_list.remove(node) - elif not utils.service_is_active("pacemaker.service", remote_addr=node): - utils.stop_service("corosync", remote_addr=node) - logger.info("Cluster services stopped on {}".format(node)) - node_list.remove(node) + node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)] if not node_list: return + logger.debug(f"stop node list: {node_list}") - dc_deadtime = utils.get_property("dc-deadtime") or constants.DC_DEADTIME_DEFAULT - dc_timeout = int(dc_deadtime.strip('s')) + 5 - try: - utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout) - except TimeoutError: - logger.error("No DC found currently, please wait if the cluster is still starting") - return False + self._wait_for_dc(node_list[0]) - # When dlm running and quorum is lost, before stop cluster service, should set - # enable_quorum_fencing=0, enable_quorum_lockspace=0 for dlm config option - if utils.is_dlm_running() and not utils.is_quorate(): - logger.debug("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm") - utils.set_dlm_option(enable_quorum_fencing=0, enable_quorum_lockspace=0) + self._set_dlm(node_list[0]) # Stop pacemaker since it can make sure cluster has quorum until stop corosync utils.stop_service("pacemaker", node_list=node_list) @@ -216,7 +249,7 @@ def do_stop(self, context, *args): utils.stop_service("corosync", node_list=node_list) for node in node_list: - logger.info("Cluster services stopped on {}".format(node)) + logger.info("The cluster stack stopped on {}".format(node)) @command.skill_level('administrator') def do_restart(self, context, *args): diff --git a/crmsh/utils.py b/crmsh/utils.py index f99782c006..d8dd56089d 100644 --- a/crmsh/utils.py +++ b/crmsh/utils.py @@ -853,14 +853,14 @@ def append_file(dest, src): return False -def get_dc(): +def get_dc(peer=None): cmd = "crmadmin -D -t 1" - rc, s, _ = get_stdout_stderr(add_sudo(cmd)) - if rc != 0: + out = get_stdout_or_raise_error(add_sudo(cmd), remote=peer, no_raise=True) + if not out: return None - if not s.startswith("Designated"): + if not out.startswith("Designated"): return None - return s.split()[-1] + return out.split()[-1] def wait4dc(what="", show_progress=True): @@ -2945,47 +2945,62 @@ def is_standby(node): return re.search(r'Node\s+{}:\s+standby'.format(node), out) is not None -def get_dlm_option_dict(): +def get_dlm_option_dict(peer=None): """ Get dlm config option dictionary """ - out = get_stdout_or_raise_error("dlm_tool dump_config") + out = get_stdout_or_raise_error("dlm_tool dump_config", remote=peer) return dict(re.findall("(\w+)=(\w+)", out)) -def set_dlm_option(**kargs): +def set_dlm_option(peer=None, **kargs): """ Set dlm option """ - dlm_option_dict = get_dlm_option_dict() + dlm_option_dict = get_dlm_option_dict(peer=peer) for option, value in kargs.items(): if option not in dlm_option_dict: - raise ValueError('"{}" is not dlm config option'.format(option)) + raise ValueError(f'"{option}" is not dlm config option') if dlm_option_dict[option] != value: - get_stdout_or_raise_error('dlm_tool set_config "{}={}"'.format(option, value)) + get_stdout_or_raise_error(f'dlm_tool set_config "{option}={value}"', remote=peer) -def is_dlm_running(): +def is_dlm_running(peer=None): """ Check if dlm ra controld is running """ - from . import xmlutil - return xmlutil.CrmMonXmlParser.is_resource_started(constants.DLM_CONTROLD_RA) + return is_resource_running(constants.DLM_CONTROLD_RA, peer=peer) -def is_dlm_configured(): +def has_resource_configured(ra_type, peer=None): + """ + Check if the RA configured + """ + out = get_stdout_or_raise_error("crm_mon -1rR", remote=peer) + return re.search(ra_type, out) is not None + + +def is_resource_running(ra_type, peer=None): + """ + Check if the RA running + """ + out = get_stdout_or_raise_error("crm_mon -1rR", remote=peer) + patt = f"\({ra_type}\):\s*Started" + return re.search(patt, out) is not None + + +def is_dlm_configured(peer=None): """ Check if dlm configured """ - from . import xmlutil - return xmlutil.CrmMonXmlParser.is_resource_configured(constants.DLM_CONTROLD_RA) + return has_resource_configured(constants.DLM_CONTROLD_RA, peer=peer) -def is_quorate(): +def is_quorate(peer=None): """ Check if cluster is quorated """ - out = get_stdout_or_raise_error("corosync-quorumtool -s", success_val_list=[0, 2]) + out = get_stdout_or_raise_error("corosync-quorumtool -s", remote=peer, success_val_list=[0, 2]) res = re.search(r'Quorate:\s+(.*)', out) if res: return res.group(1) == "Yes" @@ -3012,14 +3027,16 @@ def get_pcmk_delay_max(two_node_without_qdevice=False): return 0 -def get_property(name): +def get_property(name, property_type="crm_config", peer=None): """ Get cluster properties """ - cib_path = os.getenv('CIB_file', constants.CIB_RAW_FILE) - cmd = "CIB_file={} crm configure get_property {}".format(cib_path, name) - rc, stdout, _ = get_stdout_stderr(cmd) - return stdout if rc == 0 else None + if property_type == "crm_config": + cib_path = os.getenv('CIB_file', constants.CIB_RAW_FILE) + cmd = "CIB_file={} sudo --preserve-env=CIB_file crm configure get_property {}".format(cib_path, name) + else: + cmd = "sudo crm_attribute -t {} -n {} -Gq".format(property_type, name) + return get_stdout_or_raise_error(cmd, remote=peer, no_raise=True) def set_property(**kwargs): @@ -3145,7 +3162,7 @@ def read_from_file(infile): return to_ascii(data) -def check_function_with_timeout(check_function, wait_timeout=30, interval=1): +def check_function_with_timeout(check_function, wait_timeout=30, interval=1, *args, **kwargs): """ Run check_function in a loop Return when check_function is true @@ -3154,7 +3171,7 @@ def check_function_with_timeout(check_function, wait_timeout=30, interval=1): current_time = int(time.time()) timeout = current_time + wait_timeout while current_time <= timeout: - if check_function(): + if check_function(*args, **kwargs): return time.sleep(interval) current_time = int(time.time()) diff --git a/test/features/bootstrap_bugs.feature b/test/features/bootstrap_bugs.feature index 8775ce32d2..97b1085ab9 100644 --- a/test/features/bootstrap_bugs.feature +++ b/test/features/bootstrap_bugs.feature @@ -131,3 +131,25 @@ Feature: Regression test for bootstrap bugs Then Service "corosync" is "started" on "hanode1" When Run "crm cluster stop" on "hanode1" Then Service "corosync" is "stopped" on "hanode1" + + @clean + Scenario: Can't stop all nodes' cluster service when local node's service is down(bsc#1213889) + Given Cluster service is "stopped" on "hanode1" + And Cluster service is "stopped" on "hanode2" + When Run "crm cluster init -y" on "hanode1" + Then Cluster service is "started" on "hanode1" + When Run "crm cluster join -c hanode1 -y" on "hanode2" + Then Cluster service is "started" on "hanode2" + When Wait for DC + When Wait "10" seconds + Then Online nodes are "hanode1 hanode2" + # Add more operations + When Run "crm node standby hanode1" on "hanode1" + When Run "crm node online hanode1" on "hanode1" + When Run "crm node standby hanode2" on "hanode1" + When Run "crm node online hanode2" on "hanode1" + When Wait "10" seconds + When Run "crm cluster stop" on "hanode1" + And Run "crm cluster stop --all" on "hanode1" + Then Cluster service is "stopped" on "hanode1" + And Cluster service is "stopped" on "hanode2" diff --git a/test/testcases/confbasic-xml.exp b/test/testcases/confbasic-xml.exp index 20892dcdb6..b934d5bb3b 100644 --- a/test/testcases/confbasic-xml.exp +++ b/test/testcases/confbasic-xml.exp @@ -67,7 +67,7 @@ - + diff --git a/test/testcases/confbasic.exp b/test/testcases/confbasic.exp index 5fc2dff519..ad15b963ad 100644 --- a/test/testcases/confbasic.exp +++ b/test/testcases/confbasic.exp @@ -89,7 +89,7 @@ primitive d2 Delay \ params mondelay=45 \ op start timeout=60s interval=0s \ op stop timeout=60s interval=0s \ - op monitor timeout=30s interval=10s \ + op monitor timeout=40s interval=10s \ op monitor role=Started interval=60s timeout=30s primitive d3 ocf:pacemaker:Dummy \ op monitor timeout=20s interval=10s \ diff --git a/test/testcases/file.exp b/test/testcases/file.exp index dce48de52e..e78b5b8655 100644 --- a/test/testcases/file.exp +++ b/test/testcases/file.exp @@ -11,9 +11,9 @@ primitive p1 ocf:pacemaker:Dummy \ op stop timeout=20s interval=0s primitive p2 Delay \ params startdelay=2 mondelay=2 stopdelay=2 \ - op monitor timeout=30s interval=10s \ + op monitor timeout=40s interval=10s \ op start timeout=30s interval=0s \ - op stop timeout=30s interval=0s + op stop timeout=40s interval=0s primitive p3 ocf:pacemaker:Dummy \ op monitor timeout=20s interval=10s \ op start timeout=20s interval=0s \ @@ -56,9 +56,9 @@ primitive p0 ocf:pacemaker:Dummy \ op stop timeout=20s interval=0s primitive p2 Delay \ params startdelay=2 mondelay=2 stopdelay=2 \ - op monitor timeout=30s interval=10s \ + op monitor timeout=40s interval=10s \ op start timeout=30s interval=0s \ - op stop timeout=30s interval=0s + op stop timeout=40s interval=0s primitive p3 ocf:pacemaker:Dummy \ op monitor timeout=20s interval=10s \ op start timeout=20s interval=0s \ diff --git a/test/testcases/resource.exp b/test/testcases/resource.exp index 977b9b5c2f..114939413d 100644 --- a/test/testcases/resource.exp +++ b/test/testcases/resource.exp @@ -133,9 +133,9 @@ resource p0 is NOT running - + - + @@ -167,9 +167,9 @@ resource p0 is NOT running - + - + diff --git a/test/unittests/test_ui_cluster.py b/test/unittests/test_ui_cluster.py index 66463777be..17da24771a 100644 --- a/test/unittests/test_ui_cluster.py +++ b/test/unittests/test_ui_cluster.py @@ -79,51 +79,91 @@ def test_do_start(self, mock_parse_nodes, mock_active, mock_start, mock_qdevice_ mock_qdevice_configured.assert_called_once_with() mock_info.assert_called_once_with("Cluster services started on node1") - @mock.patch('logging.Logger.info') - @mock.patch('crmsh.utils.service_is_active') + @mock.patch('crmsh.ui_cluster.Cluster._wait_for_dc') + @mock.patch('crmsh.ui_cluster.Cluster._node_ready_to_stop_cluster_service') @mock.patch('crmsh.ui_cluster.parse_option_for_nodes') - def test_do_stop_already_stopped(self, mock_parse_nodes, mock_active, mock_info): + def test_do_stop_return(self, mock_parse_nodes, mock_node_ready_to_stop_cluster_service, mock_dc): + mock_parse_nodes.return_value = ["node1", "node2"] + mock_node_ready_to_stop_cluster_service.side_effect = [False, False] + context_inst = mock.Mock() - mock_parse_nodes.return_value = ["node1"] - mock_active.side_effect = [False, False] - self.ui_cluster_inst.do_stop(context_inst, "node1") - mock_active.assert_has_calls([ - mock.call("corosync.service", remote_addr="node1"), - mock.call("sbd.service", remote_addr="node1") - ]) - mock_info.assert_called_once_with("Cluster services already stopped on node1") + self.ui_cluster_inst.do_stop(context_inst, "node1", "node2") + + mock_parse_nodes.assert_called_once_with(context_inst, "node1", "node2") + mock_node_ready_to_stop_cluster_service.assert_has_calls([mock.call("node1"), mock.call("node2")]) + mock_dc.assert_not_called() @mock.patch('logging.Logger.debug') @mock.patch('logging.Logger.info') - @mock.patch('crmsh.utils.stop_service') - @mock.patch('crmsh.utils.set_dlm_option') - @mock.patch('crmsh.utils.is_quorate') - @mock.patch('crmsh.utils.is_dlm_running') - @mock.patch('crmsh.utils.get_dc') - @mock.patch('crmsh.utils.check_function_with_timeout') - @mock.patch('crmsh.utils.get_property') @mock.patch('crmsh.utils.service_is_active') + @mock.patch('crmsh.utils.stop_service') + @mock.patch('crmsh.ui_cluster.Cluster._set_dlm') + @mock.patch('crmsh.ui_cluster.Cluster._wait_for_dc') + @mock.patch('crmsh.ui_cluster.Cluster._node_ready_to_stop_cluster_service') @mock.patch('crmsh.ui_cluster.parse_option_for_nodes') - def test_do_stop(self, mock_parse_nodes, mock_active, mock_get_property, mock_check, mock_get_dc, mock_dlm_running, mock_is_quorate, mock_set_dlm, mock_stop, mock_info, mock_debug): + def test_do_stop(self, mock_parse_nodes, mock_node_ready_to_stop_cluster_service, mock_dc, + mock_set_dlm, mock_stop, mock_is_active, mock_info, mock_debug): + mock_parse_nodes.return_value = ["node1", "node2"] + mock_node_ready_to_stop_cluster_service.side_effect = [True, False] + mock_stop.side_effect = [["node1"], ["node1"], ["node1"]] + mock_is_active.return_value = True + context_inst = mock.Mock() - mock_parse_nodes.return_value = ["node1"] - mock_active.side_effect = [True, True, True] - mock_dlm_running.return_value = True - mock_is_quorate.return_value = False - mock_get_property.return_value = "20s" + self.ui_cluster_inst.do_stop(context_inst, "node1", "node2") - self.ui_cluster_inst.do_stop(context_inst, "node1") + mock_parse_nodes.assert_called_once_with(context_inst, "node1", "node2") + mock_node_ready_to_stop_cluster_service.assert_has_calls([mock.call("node1"), mock.call("node2")]) + mock_debug.assert_called_once_with("stop node list: ['node1']") + mock_dc.assert_called_once_with("node1") + mock_set_dlm.assert_called_once_with("node1") + mock_stop.assert_has_calls([ + mock.call("pacemaker", node_list=["node1"]), + mock.call("corosync-qdevice.service", node_list=["node1"]), + mock.call("corosync", node_list=["node1"]), + ]) + mock_info.assert_called_once_with("The cluster stack stopped on node1") - mock_active.assert_has_calls([ + @mock.patch('logging.Logger.info') + @mock.patch('crmsh.utils.stop_service') + @mock.patch('crmsh.utils.service_is_active') + def test_node_ready_to_stop_cluster_service_corosync(self, mock_is_active, mock_stop, mock_info): + mock_is_active.side_effect = [False, True, False] + res = self.ui_cluster_inst._node_ready_to_stop_cluster_service("node1") + assert res is False + mock_is_active.assert_has_calls([ mock.call("corosync.service", remote_addr="node1"), + mock.call("sbd.service", remote_addr="node1"), mock.call("pacemaker.service", remote_addr="node1"), - mock.call("corosync-qdevice.service") ]) - mock_stop.assert_has_calls([ - mock.call("pacemaker", node_list=["node1"]), - mock.call("corosync-qdevice.service", node_list=["node1"]), - mock.call("corosync", node_list=["node1"]) + mock_stop.assert_called_once_with("corosync", remote_addr="node1") + mock_info.assert_called_once_with("The cluster stack stopped on node1") + + @mock.patch('logging.Logger.info') + @mock.patch('crmsh.utils.stop_service') + @mock.patch('crmsh.utils.service_is_active') + def test_node_ready_to_stop_cluster_service_pacemaker(self, mock_is_active, mock_stop, mock_info): + mock_is_active.side_effect = [True, True, False] + res = self.ui_cluster_inst._node_ready_to_stop_cluster_service("node1") + assert res is False + mock_is_active.assert_has_calls([ + mock.call("corosync.service", remote_addr="node1"), + mock.call("sbd.service", remote_addr="node1"), + mock.call("pacemaker.service", remote_addr="node1"), + ]) + mock_stop.assert_called_once_with("corosync", remote_addr="node1") + mock_info.assert_called_once_with("The cluster stack stopped on node1") + + @mock.patch('logging.Logger.info') + @mock.patch('crmsh.utils.stop_service') + @mock.patch('crmsh.utils.service_is_active') + def test_node_ready_to_stop_cluster_service(self, mock_is_active, mock_stop, mock_info): + mock_is_active.side_effect = [True, True, True] + res = self.ui_cluster_inst._node_ready_to_stop_cluster_service("node1") + assert res is True + mock_is_active.assert_has_calls([ + mock.call("corosync.service", remote_addr="node1"), + mock.call("sbd.service", remote_addr="node1"), + mock.call("pacemaker.service", remote_addr="node1"), ]) - mock_info.assert_called_once_with("Cluster services stopped on node1") - mock_debug.assert_called_once_with("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm") - mock_check.assert_called_once_with(mock_get_dc, wait_timeout=25) + mock_info.assert_not_called() + mock_stop.assert_not_called() diff --git a/test/unittests/test_utils.py b/test/unittests/test_utils.py index 43065d3ad3..f30ac976c2 100644 --- a/test/unittests/test_utils.py +++ b/test/unittests/test_utils.py @@ -1500,7 +1500,7 @@ def test_get_dlm_option_dict(mock_run): "key1": "value1", "key2": "value2" } - mock_run.assert_called_once_with("dlm_tool dump_config") + mock_run.assert_called_once_with("dlm_tool dump_config", remote=None) @mock.patch('crmsh.utils.get_dlm_option_dict') @@ -1522,14 +1522,14 @@ def test_set_dlm_option(mock_get_dict, mock_run): "key2": "value2" } utils.set_dlm_option(key2="test") - mock_run.assert_called_once_with('dlm_tool set_config "key2=test"') + mock_run.assert_called_once_with('dlm_tool set_config "key2=test"', remote=None) -@mock.patch('crmsh.xmlutil.CrmMonXmlParser.is_resource_configured') +@mock.patch('crmsh.utils.has_resource_configured') def test_is_dlm_configured(mock_configured): mock_configured.return_value = True assert utils.is_dlm_configured() is True - mock_configured.assert_called_once_with(constants.DLM_CONTROLD_RA) + mock_configured.assert_called_once_with(constants.DLM_CONTROLD_RA, peer=None) @mock.patch('crmsh.utils.get_stdout_or_raise_error') @@ -1538,7 +1538,7 @@ def test_is_quorate_exception(mock_run): with pytest.raises(ValueError) as err: utils.is_quorate() assert str(err.value) == "Failed to get quorate status from corosync-quorumtool" - mock_run.assert_called_once_with("corosync-quorumtool -s", success_val_list=[0, 2]) + mock_run.assert_called_once_with("corosync-quorumtool -s", remote=None, success_val_list=[0, 2]) @mock.patch('crmsh.utils.get_stdout_or_raise_error') @@ -1548,7 +1548,7 @@ def test_is_quorate(mock_run): Quorate: Yes """ assert utils.is_quorate() is True - mock_run.assert_called_once_with("corosync-quorumtool -s", success_val_list=[0, 2]) + mock_run.assert_called_once_with("corosync-quorumtool -s", remote=None, success_val_list=[0, 2]) @mock.patch('crmsh.utils.etree.fromstring') @@ -1617,12 +1617,12 @@ def test_list_cluster_nodes(mock_run, mock_env, mock_isfile, mock_file2elem): @mock.patch('os.getenv') -@mock.patch('crmsh.utils.get_stdout_stderr') +@mock.patch('crmsh.utils.get_stdout_or_raise_error') def test_get_property(mock_run, mock_env): - mock_run.return_value = (0, "data", None) + mock_run.return_value = "data" mock_env.return_value = "cib.xml" assert utils.get_property("no-quorum-policy") == "data" - mock_run.assert_called_once_with("CIB_file=cib.xml crm configure get_property no-quorum-policy") + mock_run.assert_called_once_with("CIB_file=cib.xml sudo --preserve-env=CIB_file crm configure get_property no-quorum-policy", remote=None, no_raise=True) @mock.patch('crmsh.utils.get_stdout_or_raise_error')