Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[consul] add maintenance metric to Consul catalog. #1267

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions consul/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# CHANGELOG - consul

1.3.1 / Unreleased
=================

* [IMPROVEMENT] Add maintenance metrics (services_maintenance and nodes_maintenance)

1.3.0 / 2018-01-10
==================

Expand Down
2 changes: 1 addition & 1 deletion consul/datadog_checks/consul/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@

ConsulCheck = consul.ConsulCheck

__version__ = "1.3.0"
__version__ = "1.3.1"

__all__ = ['consul']
22 changes: 19 additions & 3 deletions consul/datadog_checks/consul/consul.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,15 @@ class ConsulCheck(AgentCheck):
'passing': AgentCheck.OK,
'warning': AgentCheck.WARNING,
'critical': AgentCheck.CRITICAL,
'maintenance': AgentCheck.MAINTENANCE
}

STATUS_SEVERITY = {
AgentCheck.UNKNOWN: 0,
AgentCheck.OK: 1,
AgentCheck.WARNING: 2,
AgentCheck.CRITICAL: 3,
AgentCheck.MAINTENANCE: 4,
}

def __init__(self, name, init_config, agentConfig, instances=None):
Expand Down Expand Up @@ -322,12 +324,13 @@ def check(self, instance):
# `consul.catalog.nodes_passing` : # of Nodes with service status `passing` from those registered
# `consul.catalog.nodes_warning` : # of Nodes with service status `warning` from those registered
# `consul.catalog.nodes_critical` : # of Nodes with service status `critical` from those registered
# `consul.catalog.nodes_maintenance` : # of Nodes set in maintenance from those registered

service_tags = self._get_service_tags(service, services[service])

nodes_with_service = self.get_nodes_with_service(instance, service)

# {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0}
# {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0, 'maintenance': 0}
node_status = defaultdict(int)

for node in nodes_with_service:
Expand All @@ -345,8 +348,18 @@ def check(self, instance):
found_critical = False
found_warning = False
found_serf_health = False
found_maint_critical = False

for check in node['Checks']:

# If a node is in maintenance state, it means that, for some reason, we don't
# really expect it to be healthy. We don't really care about it, until the maintenance
# window is over. So, we just move on.
if check['CheckID'] == '_node_maintenance':
if check['Status'] == 'critical':
found_maint_critical = True
break

if check['CheckID'] == 'serfHealth':
found_serf_health = True

Expand All @@ -367,8 +380,11 @@ def check(self, instance):
# Keep looping in case there is a critical status

# Increment the counters based on what was found in Checks
# `critical` checks override `warning`s, and if neither are found, register the node as `passing`
if found_critical:
# `maintenance` checks override `critical`s, which override `warning`s. If none is found, register the node as `passing`
if found_maint_critical:
node_status['maintenance'] += 1
nodes_to_service_status[node_id]["maintenance"] += 1
elif found_critical:
node_status['critical'] += 1
nodes_to_service_status[node_id]["critical"] += 1
elif found_warning:
Expand Down
2 changes: 1 addition & 1 deletion consul/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"mac_os",
"windows"
],
"version": "1.3.0",
"version": "1.3.1",
"guid": "ec1e9fac-a339-49a3-b501-60656d2a5671",
"public_title": "Datadog-Consul Integration",
"categories":["containers", "orchestration", "configuration & deployment", "notification"],
Expand Down
2 changes: 2 additions & 0 deletions consul/metadata.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name
consul.catalog.nodes_critical,gauge,,node,,Number of nodes with service status `critical` from those registered,-1,consul,nodes crit
consul.catalog.nodes_maintenance,gauge,,node,,Number of nodes in maintenance from those registered,-1,consul,nodes maint
consul.catalog.nodes_passing,gauge,,node,,Number of nodes with service status `passing` from those registered,1,consul,nodes pass
consul.catalog.nodes_up,gauge,,node,,Number of nodes,0,consul,nodes up
consul.catalog.nodes_warning,gauge,,node,,Number of nodes with service status `warning` from those registered,-1,consul,nodes warn
consul.catalog.services_critical,gauge,,service,,Total critical services on nodes,-1,consul,svc crit
consul.catalog.services_maintenance,gauge,,service,,Total services in maintenance on nodes,-1,consul,svc maint
consul.catalog.services_passing,gauge,,service,,Total passing services on nodes,1,consul,svc pass
consul.catalog.services_up,gauge,,service,,Total services registered on nodes,0,consul,svc up
consul.catalog.services_warning,gauge,,service,,Total warning services on nodes,-1,consul,svc warn
Expand Down
78 changes: 78 additions & 0 deletions consul/test/test_consul.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,68 @@ def mock_get_nodes_with_service_critical(self, instance, service):
}
]

def mock_get_nodes_with_service_critical_in_maitenance(self, instance, service):

return [
{
"Checks": [
{
"CheckID": "_node_maintenance",
"Name": "Node Maintenance Mode",
"Node": "node-1",
"Notes": "",
"Output": "",
"ServiceID": service,
"ServiceName": "",
"Status": "critical",
},
{
"CheckID": "serfHealth",
"Name": "Serf Health Status",
"Node": "node-1",
"Notes": "",
"Output": "Agent alive and reachable",
"ServiceID": "",
"ServiceName": "",
"Status": "passing"
},
{
"CheckID": "service:{0}".format(service),
"Name": "service check {0}".format(service),
"Node": "node-1",
"Notes": "",
"Output": "Service {0} alive".format(service),
"ServiceID": service,
"ServiceName": "",
"Status": "warning"
},
{
"CheckID": "service:{0}".format(service),
"Name": "service check {0}".format(service),
"Node": "node-1",
"Notes": "",
"Output": "Service {0} alive".format(service),
"ServiceID": service,
"ServiceName": "",
"Status": "critical"
}
],
"Node": {
"Address": _get_random_ip(),
"Node": "node-1"
},
"Service": {
"Address": "",
"ID": service,
"Port": 80,
"Service": service,
"Tags": [
"az-us-east-1a"
]
}
}
]

def mock_get_coord_datacenters(self, instance):
return [{
"Datacenter": "dc1",
Expand Down Expand Up @@ -500,6 +562,22 @@ def test_get_nodes_with_service_critical(self):
self.assertMetric('consul.catalog.services_warning', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
self.assertMetric('consul.catalog.services_critical', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])

def test_get_nodes_with_service_critical_in_maintenance(self):
my_mocks = self._get_consul_mocks()
my_mocks['get_nodes_with_service'] = self.mock_get_nodes_with_service_critical_in_maitenance

self.run_check(MOCK_CONFIG, mocks=my_mocks)
self.assertMetric('consul.catalog.nodes_up', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
self.assertMetric('consul.catalog.nodes_passing', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
self.assertMetric('consul.catalog.nodes_warning', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
self.assertMetric('consul.catalog.nodes_critical', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
self.assertMetric('consul.catalog.nodes_maintenance', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
self.assertMetric('consul.catalog.services_up', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
self.assertMetric('consul.catalog.services_passing', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
self.assertMetric('consul.catalog.services_warning', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
self.assertMetric('consul.catalog.services_critical', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
self.assertMetric('consul.catalog.services_maintenance', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])

def test_service_checks(self):
my_mocks = self._get_consul_mocks()
my_mocks['consul_request'] = self.mock_get_health_check
Expand Down