Merge pull request ClusterLabs#1279 from liangxin1300/20230803_stop_cluster_all_crmsh44

[crmsh-4.4] Fix: ui_cluster: Improve the process of 'crm cluster stop' (bsc#1213889)
liangxin1300 authored Dec 12, 2023
2 parents c4f7b53 + e0c7d93 commit 3f8e358
Showing 9 changed files with 216 additions and 104 deletions.
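
Taken together, the change reworks the stop path so that 'crm cluster stop' first drops nodes whose cluster stack is already down, then waits for a DC by querying one of the remaining nodes, relaxes the dlm quorum options if quorum has been lost, and only then stops pacemaker followed by corosync. The following is a condensed sketch of that flow, assembled from the hunks below for readability; it is not the literal committed code, the function name stop_cluster_sketch is illustrative, and it assumes the crmsh.utils helpers shown in this diff are importable.

# Condensed sketch of the reworked stop sequence (simplified; see the real code in the diff).
from crmsh import utils, constants

def stop_cluster_sketch(node_list):
    # 1. Keep only nodes where the full stack (corosync + pacemaker) is still active.
    node_list = [n for n in node_list
                 if utils.service_is_active("corosync.service", remote_addr=n)
                 and utils.service_is_active("pacemaker.service", remote_addr=n)]
    if not node_list:
        return

    # 2. Wait for the DC, asking a node that is actually running pacemaker.
    peer = node_list[0]
    dc_deadtime = utils.get_property("dc-deadtime", peer=peer) or str(constants.DC_DEADTIME_DEFAULT)
    dc_timeout = int(dc_deadtime.strip('s')) + 5
    utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout, peer=peer)

    # 3. If dlm is running without quorum, relax its quorum options before stopping.
    if utils.is_dlm_running(peer) and not utils.is_quorate(peer):
        utils.set_dlm_option(peer=peer, enable_quorum_fencing=0, enable_quorum_lockspace=0)

    # 4. Stop pacemaker first, then corosync, on all remaining nodes.
    utils.stop_service("pacemaker", node_list=node_list)
    utils.stop_service("corosync", node_list=node_list)
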
83 changes: 58 additions & 25 deletions crmsh/ui_cluster.py
@@ -172,40 +172,73 @@ def do_start(self, context, *args):
for node in node_list:
logger.info("Cluster services started on {}".format(node))

@staticmethod
def _node_ready_to_stop_cluster_service(node):
"""
Check if the given node is ready to stop its cluster services
If both corosync.service and pacemaker.service are active, return True
If only some of the services are active, stop them first and return False
"""
corosync_active = utils.service_is_active("corosync.service", remote_addr=node)
sbd_active = utils.service_is_active("sbd.service", remote_addr=node)
pacemaker_active = utils.service_is_active("pacemaker.service", remote_addr=node)

if not corosync_active:
if sbd_active:
utils.stop_service("corosync", remote_addr=node)
logger.info(f"The cluster stack stopped on {node}")
else:
logger.info(f"The cluster stack already stopped on {node}")
return False

elif not pacemaker_active:
utils.stop_service("corosync", remote_addr=node)
logger.info("The cluster stack stopped on {}".format(node))
return False

return True

@staticmethod
def _wait_for_dc(node=None):
"""
Wait for the cluster's DC to become available
"""
if not utils.service_is_active("pacemaker.service", remote_addr=node):
return

dc_deadtime = utils.get_property("dc-deadtime", peer=node) or str(constants.DC_DEADTIME_DEFAULT)
dc_timeout = int(dc_deadtime.strip('s')) + 5
try:
utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout, peer=node)
except TimeoutError:
logger.error("No DC found currently, please wait if the cluster is still starting")
raise utils.TerminateSubCommand

@staticmethod
def _set_dlm(node=None):
"""
When dlm is running and quorum is lost, set enable_quorum_fencing=0 and
enable_quorum_lockspace=0 in the dlm config before stopping the cluster services
"""
if utils.is_dlm_running(node) and not utils.is_quorate(node):
logger.debug("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm")
utils.set_dlm_option(peer=node, enable_quorum_fencing=0, enable_quorum_lockspace=0)

@command.skill_level('administrator')
def do_stop(self, context, *args):
'''
Stops the cluster services on all nodes or specific node(s)
'''
node_list = parse_option_for_nodes(context, *args)
for node in node_list[:]:
if not utils.service_is_active("corosync.service", remote_addr=node):
if utils.service_is_active("sbd.service", remote_addr=node):
utils.stop_service("corosync", remote_addr=node)
logger.info("Cluster services stopped on {}".format(node))
else:
logger.info("Cluster services already stopped on {}".format(node))
node_list.remove(node)
elif not utils.service_is_active("pacemaker.service", remote_addr=node):
utils.stop_service("corosync", remote_addr=node)
logger.info("Cluster services stopped on {}".format(node))
node_list.remove(node)
node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
if not node_list:
return
logger.debug(f"stop node list: {node_list}")

dc_deadtime = utils.get_property("dc-deadtime") or constants.DC_DEADTIME_DEFAULT
dc_timeout = int(dc_deadtime.strip('s')) + 5
try:
utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout)
except TimeoutError:
logger.error("No DC found currently, please wait if the cluster is still starting")
return False
self._wait_for_dc(node_list[0])

# When dlm running and quorum is lost, before stop cluster service, should set
# enable_quorum_fencing=0, enable_quorum_lockspace=0 for dlm config option
if utils.is_dlm_running() and not utils.is_quorate():
logger.debug("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm")
utils.set_dlm_option(enable_quorum_fencing=0, enable_quorum_lockspace=0)
self._set_dlm(node_list[0])

# Stop pacemaker first, since it keeps the cluster quorate until corosync is stopped
utils.stop_service("pacemaker", node_list=node_list)
@@ -216,7 +249,7 @@ def do_stop(self, context, *args):
utils.stop_service("corosync", node_list=node_list)

for node in node_list:
logger.info("Cluster services stopped on {}".format(node))
logger.info("The cluster stack stopped on {}".format(node))

@command.skill_level('administrator')
def do_restart(self, context, *args):
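
For reference, the new per-node readiness check above boils down to a small decision table: full stack active means the node stays in the stop list; corosync inactive (with or without a leftover sbd) or pacemaker inactive means clean up what is left and drop the node. Below is a standalone toy version, with a plain dict standing in for the systemd queries, to make that table explicit; everything here is illustrative, not crmsh code.

# Toy decision table mirroring _node_ready_to_stop_cluster_service (illustrative only).
def ready_to_stop(active):
    """active maps a service name to whether it is running on the node."""
    if not active["corosync.service"]:
        # Stack already down (possibly only sbd left): nothing for the bulk stop to do.
        return False
    if not active["pacemaker.service"]:
        # Corosync without pacemaker: only corosync needs stopping, then drop the node.
        return False
    # Full stack running: keep the node for the coordinated stop.
    return True

for state in ({"corosync.service": True, "pacemaker.service": True},
              {"corosync.service": True, "pacemaker.service": False},
              {"corosync.service": False, "pacemaker.service": False}):
    print(state, "->", ready_to_stop(state))
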
69 changes: 43 additions & 26 deletions crmsh/utils.py
@@ -853,14 +853,14 @@ def append_file(dest, src):
return False


def get_dc():
def get_dc(peer=None):
cmd = "crmadmin -D -t 1"
rc, s, _ = get_stdout_stderr(add_sudo(cmd))
if rc != 0:
out = get_stdout_or_raise_error(add_sudo(cmd), remote=peer, no_raise=True)
if not out:
return None
if not s.startswith("Designated"):
if not out.startswith("Designated"):
return None
return s.split()[-1]
return out.split()[-1]


def wait4dc(what="", show_progress=True):
@@ -2945,47 +2945,62 @@ def is_standby(node):
return re.search(r'Node\s+{}:\s+standby'.format(node), out) is not None


def get_dlm_option_dict():
def get_dlm_option_dict(peer=None):
"""
Get dlm config option dictionary
"""
out = get_stdout_or_raise_error("dlm_tool dump_config")
out = get_stdout_or_raise_error("dlm_tool dump_config", remote=peer)
return dict(re.findall("(\w+)=(\w+)", out))


def set_dlm_option(**kargs):
def set_dlm_option(peer=None, **kargs):
"""
Set dlm option
"""
dlm_option_dict = get_dlm_option_dict()
dlm_option_dict = get_dlm_option_dict(peer=peer)
for option, value in kargs.items():
if option not in dlm_option_dict:
raise ValueError('"{}" is not dlm config option'.format(option))
raise ValueError(f'"{option}" is not dlm config option')
if dlm_option_dict[option] != value:
get_stdout_or_raise_error('dlm_tool set_config "{}={}"'.format(option, value))
get_stdout_or_raise_error(f'dlm_tool set_config "{option}={value}"', remote=peer)


def is_dlm_running():
def is_dlm_running(peer=None):
"""
Check if the dlm controld RA is running
"""
from . import xmlutil
return xmlutil.CrmMonXmlParser.is_resource_started(constants.DLM_CONTROLD_RA)
return is_resource_running(constants.DLM_CONTROLD_RA, peer=peer)


def is_dlm_configured():
def has_resource_configured(ra_type, peer=None):
"""
Check if the RA is configured
"""
out = get_stdout_or_raise_error("crm_mon -1rR", remote=peer)
return re.search(ra_type, out) is not None


def is_resource_running(ra_type, peer=None):
"""
Check if the RA is running
"""
out = get_stdout_or_raise_error("crm_mon -1rR", remote=peer)
patt = f"\({ra_type}\):\s*Started"
return re.search(patt, out) is not None


def is_dlm_configured(peer=None):
"""
Check if dlm is configured
"""
from . import xmlutil
return xmlutil.CrmMonXmlParser.is_resource_configured(constants.DLM_CONTROLD_RA)
return has_resource_configured(constants.DLM_CONTROLD_RA, peer=peer)


def is_quorate():
def is_quorate(peer=None):
"""
Check if the cluster is quorate
"""
out = get_stdout_or_raise_error("corosync-quorumtool -s", success_val_list=[0, 2])
out = get_stdout_or_raise_error("corosync-quorumtool -s", remote=peer, success_val_list=[0, 2])
res = re.search(r'Quorate:\s+(.*)', out)
if res:
return res.group(1) == "Yes"
@@ -3012,14 +3027,16 @@ def get_pcmk_delay_max(two_node_without_qdevice=False):
return 0


def get_property(name):
def get_property(name, property_type="crm_config", peer=None):
"""
Get cluster properties
"""
cib_path = os.getenv('CIB_file', constants.CIB_RAW_FILE)
cmd = "CIB_file={} crm configure get_property {}".format(cib_path, name)
rc, stdout, _ = get_stdout_stderr(cmd)
return stdout if rc == 0 else None
if property_type == "crm_config":
cib_path = os.getenv('CIB_file', constants.CIB_RAW_FILE)
cmd = "CIB_file={} sudo --preserve-env=CIB_file crm configure get_property {}".format(cib_path, name)
else:
cmd = "sudo crm_attribute -t {} -n {} -Gq".format(property_type, name)
return get_stdout_or_raise_error(cmd, remote=peer, no_raise=True)


def set_property(**kwargs):
@@ -3145,7 +3162,7 @@ def read_from_file(infile):
return to_ascii(data)


def check_function_with_timeout(check_function, wait_timeout=30, interval=1):
def check_function_with_timeout(check_function, wait_timeout=30, interval=1, *args, **kwargs):
"""
Run check_function in a loop
Return when check_function is true
@@ -3154,7 +3171,7 @@ def check_function_with_timeout(check_function, wait_timeout=30, interval=1):
current_time = int(time.time())
timeout = current_time + wait_timeout
while current_time <= timeout:
if check_function():
if check_function(*args, **kwargs):
return
time.sleep(interval)
current_time = int(time.time())
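
The signature change to check_function_with_timeout is what allows _wait_for_dc to forward peer= through to get_dc. Below is a minimal standalone illustration of the same polling pattern, with a dummy check function standing in for get_dc; the name fake_get_dc and the simplified body are illustrative, not crmsh code.

import time

def check_function_with_timeout(check_function, wait_timeout=30, interval=1, *args, **kwargs):
    # Poll check_function until it returns something truthy, or give up after wait_timeout.
    deadline = time.time() + wait_timeout
    while time.time() <= deadline:
        if check_function(*args, **kwargs):
            return
        time.sleep(interval)
    raise TimeoutError

def fake_get_dc(peer=None):
    # Stand-in for utils.get_dc: pretend the DC is visible from the queried peer.
    return "hanode1" if peer == "hanode1" else None

check_function_with_timeout(fake_get_dc, wait_timeout=5, interval=1, peer="hanode1")
print("DC found")
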
22 changes: 22 additions & 0 deletions test/features/bootstrap_bugs.feature
@@ -131,3 +131,25 @@ Feature: Regression test for bootstrap bugs
Then Service "corosync" is "started" on "hanode1"
When Run "crm cluster stop" on "hanode1"
Then Service "corosync" is "stopped" on "hanode1"

@clean
Scenario: Can't stop all nodes' cluster service when local node's service is down (bsc#1213889)
Given Cluster service is "stopped" on "hanode1"
And Cluster service is "stopped" on "hanode2"
When Run "crm cluster init -y" on "hanode1"
Then Cluster service is "started" on "hanode1"
When Run "crm cluster join -c hanode1 -y" on "hanode2"
Then Cluster service is "started" on "hanode2"
When Wait for DC
When Wait "10" seconds
Then Online nodes are "hanode1 hanode2"
# Add more operations
When Run "crm node standby hanode1" on "hanode1"
When Run "crm node online hanode1" on "hanode1"
When Run "crm node standby hanode2" on "hanode1"
When Run "crm node online hanode2" on "hanode1"
When Wait "10" seconds
When Run "crm cluster stop" on "hanode1"
And Run "crm cluster stop --all" on "hanode1"
Then Cluster service is "stopped" on "hanode1"
And Cluster service is "stopped" on "hanode2"
2 changes: 1 addition & 1 deletion test/testcases/confbasic-xml.exp
@@ -67,7 +67,7 @@
<operations>
<op name="start" timeout="60s" interval="0s" id="d2-start-0s"/>
<op name="stop" timeout="60s" interval="0s" id="d2-stop-0s"/>
<op name="monitor" timeout="30s" interval="10s" id="d2-monitor-10s"/>
<op name="monitor" timeout="40s" interval="10s" id="d2-monitor-10s"/>
<op name="monitor" role="Started" interval="60s" timeout="30s" id="d2-monitor-60s"/>
</operations>
</primitive>
2 changes: 1 addition & 1 deletion test/testcases/confbasic.exp
@@ -89,7 +89,7 @@ primitive d2 Delay \
params mondelay=45 \
op start timeout=60s interval=0s \
op stop timeout=60s interval=0s \
op monitor timeout=30s interval=10s \
op monitor timeout=40s interval=10s \
op monitor role=Started interval=60s timeout=30s
primitive d3 ocf:pacemaker:Dummy \
op monitor timeout=20s interval=10s \
8 changes: 4 additions & 4 deletions test/testcases/file.exp
@@ -11,9 +11,9 @@ primitive p1 ocf:pacemaker:Dummy \
op stop timeout=20s interval=0s
primitive p2 Delay \
params startdelay=2 mondelay=2 stopdelay=2 \
op monitor timeout=30s interval=10s \
op monitor timeout=40s interval=10s \
op start timeout=30s interval=0s \
op stop timeout=30s interval=0s
op stop timeout=40s interval=0s
primitive p3 ocf:pacemaker:Dummy \
op monitor timeout=20s interval=10s \
op start timeout=20s interval=0s \
@@ -56,9 +56,9 @@ primitive p0 ocf:pacemaker:Dummy \
op stop timeout=20s interval=0s
primitive p2 Delay \
params startdelay=2 mondelay=2 stopdelay=2 \
op monitor timeout=30s interval=10s \
op monitor timeout=40s interval=10s \
op start timeout=30s interval=0s \
op stop timeout=30s interval=0s
op stop timeout=40s interval=0s
primitive p3 ocf:pacemaker:Dummy \
op monitor timeout=20s interval=10s \
op start timeout=20s interval=0s \
8 changes: 4 additions & 4 deletions test/testcases/resource.exp
@@ -133,9 +133,9 @@ resource p0 is NOT running
<nvpair name="stopdelay" value="2" id="p2-instance_attributes-stopdelay"/>
</instance_attributes>
<operations>
<op name="monitor" timeout="30s" interval="10s" id="p2-monitor-10s"/>
<op name="monitor" timeout="40s" interval="10s" id="p2-monitor-10s"/>
<op name="start" timeout="30s" interval="0s" id="p2-start-0s"/>
<op name="stop" timeout="30s" interval="0s" id="p2-stop-0s"/>
<op name="stop" timeout="40s" interval="0s" id="p2-stop-0s"/>
</operations>
</primitive>
</clone>
@@ -167,9 +167,9 @@ resource p0 is NOT running
<nvpair name="stopdelay" value="2" id="p2-instance_attributes-stopdelay"/>
</instance_attributes>
<operations>
<op name="monitor" timeout="30s" interval="10s" id="p2-monitor-10s"/>
<op name="monitor" timeout="40s" interval="10s" id="p2-monitor-10s"/>
<op name="start" timeout="30s" interval="0s" id="p2-start-0s"/>
<op name="stop" timeout="30s" interval="0s" id="p2-stop-0s"/>
<op name="stop" timeout="40s" interval="0s" id="p2-stop-0s"/>
</operations>
</primitive>
</clone>