Merge pull request ClusterLabs#1279 from liangxin1300/20230803_stop_cluster_all_crmsh44

[crmsh-4.4] Fix: ui_cluster: Improve the process of 'crm cluster stop' (bsc#1213889)
liangxin1300 authored Dec 12, 2023
2 parents c4f7b53 + e0c7d93 commit 3f8e358
Showing 9 changed files with 216 additions and 104 deletions.
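
Taken together, the change reworks the stop path so that 'crm cluster stop' first drops nodes whose cluster stack is already down, then waits for a DC by querying one of the remaining nodes, relaxes the dlm quorum options if quorum has been lost, and only then stops pacemaker followed by corosync. The following is a condensed sketch of that flow, assembled from the hunks below for readability; it is not the literal committed code, the function name stop_cluster_sketch is illustrative, and it assumes the crmsh.utils helpers shown in this diff are importable.

# Condensed sketch of the reworked stop sequence (simplified; see the real code in the diff).
from crmsh import utils, constants

def stop_cluster_sketch(node_list):
    # 1. Keep only nodes where the full stack (corosync + pacemaker) is still active.
    node_list = [n for n in node_list
                 if utils.service_is_active("corosync.service", remote_addr=n)
                 and utils.service_is_active("pacemaker.service", remote_addr=n)]
    if not node_list:
        return

    # 2. Wait for the DC, asking a node that is actually running pacemaker.
    peer = node_list[0]
    dc_deadtime = utils.get_property("dc-deadtime", peer=peer) or str(constants.DC_DEADTIME_DEFAULT)
    dc_timeout = int(dc_deadtime.strip('s')) + 5
    utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout, peer=peer)

    # 3. If dlm is running without quorum, relax its quorum options before stopping.
    if utils.is_dlm_running(peer) and not utils.is_quorate(peer):
        utils.set_dlm_option(peer=peer, enable_quorum_fencing=0, enable_quorum_lockspace=0)

    # 4. Stop pacemaker first, then corosync, on all remaining nodes.
    utils.stop_service("pacemaker", node_list=node_list)
    utils.stop_service("corosync", node_list=node_list)
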
83 changes: 58 additions & 25 deletions crmsh/ui_cluster.py
@@ -172,40 +172,73 @@ def do_start(self, context, *args):
for node in node_list:
logger.info("Cluster services started on {}".format(node))

@staticmethod
def _node_ready_to_stop_cluster_service(node):
"""
Check if the given node is ready to stop its cluster services
If both corosync.service and pacemaker.service are active, return True
If only some of the services are active, stop them first and return False
"""
corosync_active = utils.service_is_active("corosync.service", remote_addr=node)
sbd_active = utils.service_is_active("sbd.service", remote_addr=node)
pacemaker_active = utils.service_is_active("pacemaker.service", remote_addr=node)

if not corosync_active:
if sbd_active:
utils.stop_service("corosync", remote_addr=node)
logger.info(f"The cluster stack stopped on {node}")
else:
logger.info(f"The cluster stack already stopped on {node}")
return False

elif not pacemaker_active:
utils.stop_service("corosync", remote_addr=node)
logger.info("The cluster stack stopped on {}".format(node))
return False

return True

@staticmethod
def _wait_for_dc(node=None):
"""
Wait for the cluster's DC to become available
"""
if not utils.service_is_active("pacemaker.service", remote_addr=node):
return

dc_deadtime = utils.get_property("dc-deadtime", peer=node) or str(constants.DC_DEADTIME_DEFAULT)
dc_timeout = int(dc_deadtime.strip('s')) + 5
try:
utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout, peer=node)
except TimeoutError:
logger.error("No DC found currently, please wait if the cluster is still starting")
raise utils.TerminateSubCommand

@staticmethod
def _set_dlm(node=None):
"""
When dlm is running and quorum is lost, set enable_quorum_fencing=0 and
enable_quorum_lockspace=0 in the dlm config before stopping the cluster services
"""
if utils.is_dlm_running(node) and not utils.is_quorate(node):
logger.debug("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm")
utils.set_dlm_option(peer=node, enable_quorum_fencing=0, enable_quorum_lockspace=0)

@command.skill_level('administrator')
def do_stop(self, context, *args):
'''
Stops the cluster services on all nodes or specific node(s)
'''
node_list = parse_option_for_nodes(context, *args)
for node in node_list[:]:
if not utils.service_is_active("corosync.service", remote_addr=node):
if utils.service_is_active("sbd.service", remote_addr=node):
utils.stop_service("corosync", remote_addr=node)
logger.info("Cluster services stopped on {}".format(node))
else:
logger.info("Cluster services already stopped on {}".format(node))
node_list.remove(node)
elif not utils.service_is_active("pacemaker.service", remote_addr=node):
utils.stop_service("corosync", remote_addr=node)
logger.info("Cluster services stopped on {}".format(node))
node_list.remove(node)
node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
if not node_list:
return
logger.debug(f"stop node list: {node_list}")

dc_deadtime = utils.get_property("dc-deadtime") or constants.DC_DEADTIME_DEFAULT
dc_timeout = int(dc_deadtime.strip('s')) + 5
try:
utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout)
except TimeoutError:
logger.error("No DC found currently, please wait if the cluster is still starting")
return False
self._wait_for_dc(node_list[0])

# When dlm running and quorum is lost, before stop cluster service, should set
# enable_quorum_fencing=0, enable_quorum_lockspace=0 for dlm config option
if utils.is_dlm_running() and not utils.is_quorate():
logger.debug("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm")
utils.set_dlm_option(enable_quorum_fencing=0, enable_quorum_lockspace=0)
self._set_dlm(node_list[0])

# Stop pacemaker first, since it keeps the cluster quorate until corosync is stopped
utils.stop_service("pacemaker", node_list=node_list)
@@ -216,7 +249,7 @@ def do_stop(self, context, *args):
utils.stop_service("corosync", node_list=node_list)

for node in node_list:
logger.info("Cluster services stopped on {}".format(node))
logger.info("The cluster stack stopped on {}".format(node))

@command.skill_level('administrator')
def do_restart(self, context, *args):
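
For reference, the new per-node readiness check above boils down to a small decision table: full stack active means the node stays in the stop list; corosync inactive (with or without a leftover sbd) or pacemaker inactive means clean up what is left and drop the node. Below is a standalone toy version, with a plain dict standing in for the systemd queries, to make that table explicit; everything here is illustrative, not crmsh code.

# Toy decision table mirroring _node_ready_to_stop_cluster_service (illustrative only).
def ready_to_stop(active):
    """active maps a service name to whether it is running on the node."""
    if not active["corosync.service"]:
        # Stack already down (possibly only sbd left): nothing for the bulk stop to do.
        return False
    if not active["pacemaker.service"]:
        # Corosync without pacemaker: only corosync needs stopping, then drop the node.
        return False
    # Full stack running: keep the node for the coordinated stop.
    return True

for state in ({"corosync.service": True, "pacemaker.service": True},
              {"corosync.service": True, "pacemaker.service": False},
              {"corosync.service": False, "pacemaker.service": False}):
    print(state, "->", ready_to_stop(state))
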
69 changes: 43 additions & 26 deletions crmsh/utils.py
@@ -853,14 +853,14 @@ def append_file(dest, src):
return False


def get_dc():
def get_dc(peer=None):
cmd = "crmadmin -D -t 1"
rc, s, _ = get_stdout_stderr(add_sudo(cmd))
if rc != 0:
out = get_stdout_or_raise_error(add_sudo(cmd), remote=peer, no_raise=True)
if not out:
return None
if not s.startswith("Designated"):
if not out.startswith("Designated"):
return None
return s.split()[-1]
return out.split()[-1]


def wait4dc(what="", show_progress=True):
@@ -2945,47 +2945,62 @@ def is_standby(node):
return re.search(r'Node\s+{}:\s+standby'.format(node), out) is not None


def get_dlm_option_dict():
def get_dlm_option_dict(peer=None):
"""
Get dlm config option dictionary
"""
out = get_stdout_or_raise_error("dlm_tool dump_config")
out = get_stdout_or_raise_error("dlm_tool dump_config", remote=peer)
return dict(re.findall("(\w+)=(\w+)", out))


def set_dlm_option(**kargs):
def set_dlm_option(peer=None, **kargs):
"""
Set dlm option
"""
dlm_option_dict = get_dlm_option_dict()
dlm_option_dict = get_dlm_option_dict(peer=peer)
for option, value in kargs.items():
if option not in dlm_option_dict:
raise ValueError('"{}" is not dlm config option'.format(option))
raise ValueError(f'"{option}" is not dlm config option')
if dlm_option_dict[option] != value:
get_stdout_or_raise_error('dlm_tool set_config "{}={}"'.format(option, value))
get_stdout_or_raise_error(f'dlm_tool set_config "{option}={value}"', remote=peer)


def is_dlm_running():
def is_dlm_running(peer=None):
"""
Check if the dlm controld RA is running
"""
from . import xmlutil
return xmlutil.CrmMonXmlParser.is_resource_started(constants.DLM_CONTROLD_RA)
return is_resource_running(constants.DLM_CONTROLD_RA, peer=peer)


def is_dlm_configured():
def has_resource_configured(ra_type, peer=None):
"""
Check if the RA is configured
"""
out = get_stdout_or_raise_error("crm_mon -1rR", remote=peer)
return re.search(ra_type, out) is not None


def is_resource_running(ra_type, peer=None):
"""
Check if the RA is running
"""
out = get_stdout_or_raise_error("crm_mon -1rR", remote=peer)
patt = f"\({ra_type}\):\s*Started"
return re.search(patt, out) is not None


def is_dlm_configured(peer=None):
"""
Check if dlm is configured
"""
from . import xmlutil
return xmlutil.CrmMonXmlParser.is_resource_configured(constants.DLM_CONTROLD_RA)
return has_resource_configured(constants.DLM_CONTROLD_RA, peer=peer)


def is_quorate():
def is_quorate(peer=None):
"""
Check if the cluster is quorate
"""
out = get_stdout_or_raise_error("corosync-quorumtool -s", success_val_list=[0, 2])
out = get_stdout_or_raise_error("corosync-quorumtool -s", remote=peer, success_val_list=[0, 2])
res = re.search(r'Quorate:\s+(.*)', out)
if res:
return res.group(1) == "Yes"
@@ -3012,14 +3027,16 @@ def get_pcmk_delay_max(two_node_without_qdevice=False):
return 0


def get_property(name):
def get_property(name, property_type="crm_config", peer=None):
"""
Get cluster properties
"""
cib_path = os.getenv('CIB_file', constants.CIB_RAW_FILE)
cmd = "CIB_file={} crm configure get_property {}".format(cib_path, name)
rc, stdout, _ = get_stdout_stderr(cmd)
return stdout if rc == 0 else None
if property_type == "crm_config":
cib_path = os.getenv('CIB_file', constants.CIB_RAW_FILE)
cmd = "CIB_file={} sudo --preserve-env=CIB_file crm configure get_property {}".format(cib_path, name)
else:
cmd = "sudo crm_attribute -t {} -n {} -Gq".format(property_type, name)
return get_stdout_or_raise_error(cmd, remote=peer, no_raise=True)


def set_property(**kwargs):
@@ -3145,7 +3162,7 @@ def read_from_file(infile):
return to_ascii(data)


def check_function_with_timeout(check_function, wait_timeout=30, interval=1):
def check_function_with_timeout(check_function, wait_timeout=30, interval=1, *args, **kwargs):
"""
Run check_function in a loop
Return when check_function is true
@@ -3154,7 +3171,7 @@ def check_function_with_timeout(check_function, wait_timeout=30, interval=1):
current_time = int(time.time())
timeout = current_time + wait_timeout
while current_time <= timeout:
if check_function():
if check_function(*args, **kwargs):
return
time.sleep(interval)
current_time = int(time.time())
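
The signature change to check_function_with_timeout is what allows _wait_for_dc to forward peer= through to get_dc. Below is a minimal standalone illustration of the same polling pattern, with a dummy check function standing in for get_dc; the name fake_get_dc and the simplified body are illustrative, not crmsh code.

import time

def check_function_with_timeout(check_function, wait_timeout=30, interval=1, *args, **kwargs):
    # Poll check_function until it returns something truthy, or give up after wait_timeout.
    deadline = time.time() + wait_timeout
    while time.time() <= deadline:
        if check_function(*args, **kwargs):
            return
        time.sleep(interval)
    raise TimeoutError

def fake_get_dc(peer=None):
    # Stand-in for utils.get_dc: pretend the DC is visible from the queried peer.
    return "hanode1" if peer == "hanode1" else None

check_function_with_timeout(fake_get_dc, wait_timeout=5, interval=1, peer="hanode1")
print("DC found")
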
22 changes: 22 additions & 0 deletions test/features/bootstrap_bugs.feature
@@ -131,3 +131,25 @@ Feature: Regression test for bootstrap bugs
Then Service "corosync" is "started" on "hanode1"
When Run "crm cluster stop" on "hanode1"
Then Service "corosync" is "stopped" on "hanode1"

@clean
Scenario: Can't stop all nodes' cluster service when local node's service is down (bsc#1213889)
Given Cluster service is "stopped" on "hanode1"
And Cluster service is "stopped" on "hanode2"
When Run "crm cluster init -y" on "hanode1"
Then Cluster service is "started" on "hanode1"
When Run "crm cluster join -c hanode1 -y" on "hanode2"
Then Cluster service is "started" on "hanode2"
When Wait for DC
When Wait "10" seconds
Then Online nodes are "hanode1 hanode2"
# Add more operations
When Run "crm node standby hanode1" on "hanode1"
When Run "crm node online hanode1" on "hanode1"
When Run "crm node standby hanode2" on "hanode1"
When Run "crm node online hanode2" on "hanode1"
When Wait "10" seconds
When Run "crm cluster stop" on "hanode1"
And Run "crm cluster stop --all" on "hanode1"
Then Cluster service is "stopped" on "hanode1"
And Cluster service is "stopped" on "hanode2"
2 changes: 1 addition & 1 deletion test/testcases/confbasic-xml.exp
@@ -67,7 +67,7 @@
<operations>
<op name="start" timeout="60s" interval="0s" id="d2-start-0s"/>
<op name="stop" timeout="60s" interval="0s" id="d2-stop-0s"/>
<op name="monitor" timeout="30s" interval="10s" id="d2-monitor-10s"/>
<op name="monitor" timeout="40s" interval="10s" id="d2-monitor-10s"/>
<op name="monitor" role="Started" interval="60s" timeout="30s" id="d2-monitor-60s"/>
</operations>
</primitive>
2 changes: 1 addition & 1 deletion test/testcases/confbasic.exp
@@ -89,7 +89,7 @@ primitive d2 Delay \
params mondelay=45 \
op start timeout=60s interval=0s \
op stop timeout=60s interval=0s \
op monitor timeout=30s interval=10s \
op monitor timeout=40s interval=10s \
op monitor role=Started interval=60s timeout=30s
primitive d3 ocf:pacemaker:Dummy \
op monitor timeout=20s interval=10s \
8 changes: 4 additions & 4 deletions test/testcases/file.exp
@@ -11,9 +11,9 @@ primitive p1 ocf:pacemaker:Dummy \
op stop timeout=20s interval=0s
primitive p2 Delay \
params startdelay=2 mondelay=2 stopdelay=2 \
op monitor timeout=30s interval=10s \
op monitor timeout=40s interval=10s \
op start timeout=30s interval=0s \
op stop timeout=30s interval=0s
op stop timeout=40s interval=0s
primitive p3 ocf:pacemaker:Dummy \
op monitor timeout=20s interval=10s \
op start timeout=20s interval=0s \
@@ -56,9 +56,9 @@ primitive p0 ocf:pacemaker:Dummy \
op stop timeout=20s interval=0s
primitive p2 Delay \
params startdelay=2 mondelay=2 stopdelay=2 \
op monitor timeout=30s interval=10s \
op monitor timeout=40s interval=10s \
op start timeout=30s interval=0s \
op stop timeout=30s interval=0s
op stop timeout=40s interval=0s
primitive p3 ocf:pacemaker:Dummy \
op monitor timeout=20s interval=10s \
op start timeout=20s interval=0s \
8 changes: 4 additions & 4 deletions test/testcases/resource.exp
@@ -133,9 +133,9 @@ resource p0 is NOT running
<nvpair name="stopdelay" value="2" id="p2-instance_attributes-stopdelay"/>
</instance_attributes>
<operations>
<op name="monitor" timeout="30s" interval="10s" id="p2-monitor-10s"/>
<op name="monitor" timeout="40s" interval="10s" id="p2-monitor-10s"/>
<op name="start" timeout="30s" interval="0s" id="p2-start-0s"/>
<op name="stop" timeout="30s" interval="0s" id="p2-stop-0s"/>
<op name="stop" timeout="40s" interval="0s" id="p2-stop-0s"/>
</operations>
</primitive>
</clone>
@@ -167,9 +167,9 @@ resource p0 is NOT running
<nvpair name="stopdelay" value="2" id="p2-instance_attributes-stopdelay"/>
</instance_attributes>
<operations>
<op name="monitor" timeout="30s" interval="10s" id="p2-monitor-10s"/>
<op name="monitor" timeout="40s" interval="10s" id="p2-monitor-10s"/>
<op name="start" timeout="30s" interval="0s" id="p2-start-0s"/>
<op name="stop" timeout="30s" interval="0s" id="p2-stop-0s"/>
<op name="stop" timeout="40s" interval="0s" id="p2-stop-0s"/>
</operations>
</primitive>
</clone>