Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[mongo] Add mongodb.rs_status service check that reports the state of all replset members #3121

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 34 additions & 12 deletions checks.d/mongo.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class MongoDb(AgentCheck):

# Service check
SERVICE_CHECK_NAME = 'mongodb.can_connect'
SERVICE_RS_STATUS_CHECK_NAME = 'mongodb.rs_status'

# Metrics
"""
Expand Down Expand Up @@ -388,18 +389,19 @@ class MongoDb(AgentCheck):
MongoDB replica set states, as documented at
https://docs.mongodb.org/manual/reference/replica-states/
"""
# The third value is the service check status to report when a replica set member is in this state.
REPLSET_MEMBER_STATES = {
0: ('STARTUP', 'Starting Up'),
1: ('PRIMARY', 'Primary'),
2: ('SECONDARY', 'Secondary'),
3: ('RECOVERING', 'Recovering'),
4: ('Fatal', 'Fatal'), # MongoDB docs don't list this state
5: ('STARTUP2', 'Starting up (forking threads)'),
6: ('UNKNOWN', 'Unknown to this replset member'),
7: ('ARBITER', 'Arbiter'),
8: ('DOWN', 'Down'),
9: ('ROLLBACK', 'Rollback'),
10: ('REMOVED', 'Removed'),
0: ('STARTUP', 'Starting Up', AgentCheck.WARNING),
1: ('PRIMARY', 'Primary', AgentCheck.OK),
2: ('SECONDARY', 'Secondary', AgentCheck.OK),
3: ('RECOVERING', 'Recovering', AgentCheck.WARNING),
4: ('Fatal', 'Fatal', AgentCheck.CRITICAL), # MongoDB docs don't list this state
5: ('STARTUP2', 'Starting up (forking threads)', AgentCheck.WARNING),
6: ('UNKNOWN', 'Unknown to this replset member', AgentCheck.CRITICAL),
7: ('ARBITER', 'Arbiter', AgentCheck.OK),
8: ('DOWN', 'Down', AgentCheck.CRITICAL),
9: ('ROLLBACK', 'Rollback', AgentCheck.CRITICAL),
10: ('REMOVED', 'Removed', AgentCheck.CRITICAL),
}

def __init__(self, name, init_config, agentConfig, instances=None):
Expand Down Expand Up @@ -778,13 +780,33 @@ def total_seconds(td):
u"replset_state:{0}".format(replset_state),
])

rs_status = AgentCheck.OK
rs_status_details = []

# Find nodes: master and current node (ourself)
for member in replSet.get('members'):

member_state = int(member.get('state'))
member_state_info = self.REPLSET_MEMBER_STATES[member_state]

# Check every replset member: record each non-OK state and keep the highest status seen
if member_state_info[2] > AgentCheck.OK:
rs_status_details.append("%s has state \"%s\"" % (member.get('name'),
member_state_info[1]))
if rs_status < member_state_info[2]:
rs_status = member_state_info[2]

if member.get('self'):
current = member
if int(member.get('state')) == 1:
if member_state == 1:
primary = member

self.service_check(
self.SERVICE_RS_STATUS_CHECK_NAME,
rs_status,
tags=service_check_tags,
message='\n'.join(rs_status_details))

# Compute a lag time
if current is not None and primary is not None:
if 'optimeDate' in primary and 'optimeDate' in current:
Expand Down