Skip to content

Commit

Permalink
New way to determine hop node health
Browse files Browse the repository at this point in the history
The old way of detecting health depended on
Receptor Service advertisements. However, hop
nodes need not run any services, so they won't
show up in the Advertisements list.

Instead, we can detect the health of hop nodes based
on whether they show up in the Known Connection Costs
list.
  • Loading branch information
fosterseth committed Sep 5, 2023
1 parent 224e9e0 commit e210b91
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 9 deletions.
7 changes: 5 additions & 2 deletions awx/main/models/ha.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,12 +315,15 @@ def refresh_capacity_fields(self):

def save_health_data(self, version=None, cpu=0, memory=0, uuid=None, update_last_seen=False, errors=''):
update_fields = ['errors']
if self.node_type != 'hop':
if self.node_type != Instance.Types.HOP:
self.last_health_check = now()
update_fields.append('last_health_check')

if update_last_seen:
self.last_seen = self.last_health_check
if self.node_type == Instance.Types.HOP:
self.last_seen = now()
else:
self.last_seen = self.last_health_check
update_fields.append('last_seen')

if uuid is not None and self.uuid != uuid:
Expand Down
17 changes: 10 additions & 7 deletions awx/main/tasks/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,12 @@ def inspect_execution_and_hop_nodes(instance_list):

inspect_established_receptor_connections(mesh_status)

for instance in instance_list:
if instance.node_type == Instance.Types.HOP and mesh_status['KnownConnectionCosts'].get(instance.hostname):
if instance.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
logger.warning(f'Hop node {instance.hostname}, has joined the receptor mesh')
instance.save_health_data(errors='', update_last_seen=True)

nowtime = now()
workers = mesh_status['Advertisements']

Expand All @@ -530,6 +536,10 @@ def inspect_execution_and_hop_nodes(instance_list):
logger.warning(f"Unrecognized node advertising on mesh: {hostname}")
continue

# Only execution nodes should be dealt with by execution_node_health_check
if instance.node_type == Instance.Types.HOP:
continue

# Control-plane nodes are dealt with via local_health_check instead.
if instance.node_type in (Instance.Types.CONTROL, Instance.Types.HYBRID):
continue
Expand All @@ -540,13 +550,6 @@ def inspect_execution_and_hop_nodes(instance_list):
instance.last_seen = last_seen
instance.save(update_fields=['last_seen'])

# Only execution nodes should be dealt with by execution_node_health_check
if instance.node_type == Instance.Types.HOP:
if instance.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
logger.warning(f'Hop node {hostname}, has rejoined the receptor mesh')
instance.save_health_data(errors='')
continue

if instance.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
# if the instance *was* lost, but has appeared again,
# attempt to re-establish the initial capacity and version
Expand Down

0 comments on commit e210b91

Please sign in to comment.