Skip to content

Commit

Permalink
[consul] add maintenance metric to Consul catalog.
Browse files Browse the repository at this point in the history
Currently, if a node in Consul is set into maintenance state, it is reported
as a node in critical state to Datadog metrics. This makes it confusing to
determine by the metric whether it is a node failing with problems or an planned
intervention. A user made a PR to Datadog some time ago, but it was not merged due to
Datadog code organization changes and got forgotten (DataDog/dd-agent#2496).
I'm pushing the change forward. I've tested it using dd-agent
version 5.8.0.

This PR also depends on DataDog/dd-agent#3708
  • Loading branch information
Diogo Kiss committed Mar 19, 2018
1 parent 07d41b6 commit d69de86
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 5 deletions.
5 changes: 5 additions & 0 deletions consul/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# CHANGELOG - consul

1.3.1 / Unreleased
=================

* [IMPROVEMENT] Add maintenance metrics (services_maintenance and nodes_maintenance)

1.3.0 / 2018-01-10
==================

Expand Down
2 changes: 1 addition & 1 deletion consul/datadog_checks/consul/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@

ConsulCheck = consul.ConsulCheck

__version__ = "1.3.0"
__version__ = "1.3.1"

__all__ = ['consul']
22 changes: 19 additions & 3 deletions consul/datadog_checks/consul/consul.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,15 @@ class ConsulCheck(AgentCheck):
'passing': AgentCheck.OK,
'warning': AgentCheck.WARNING,
'critical': AgentCheck.CRITICAL,
'maintenance': AgentCheck.MAINTENANCE
}

STATUS_SEVERITY = {
AgentCheck.UNKNOWN: 0,
AgentCheck.OK: 1,
AgentCheck.WARNING: 2,
AgentCheck.CRITICAL: 3,
AgentCheck.MAINTENANCE: 4,
}

def __init__(self, name, init_config, agentConfig, instances=None):
Expand Down Expand Up @@ -322,12 +324,13 @@ def check(self, instance):
# `consul.catalog.nodes_passing` : # of Nodes with service status `passing` from those registered
# `consul.catalog.nodes_warning` : # of Nodes with service status `warning` from those registered
# `consul.catalog.nodes_critical` : # of Nodes with service status `critical` from those registered
# `consul.catalog.nodes_maintenance` : # of Nodes set in maintenance from those registered

service_tags = self._get_service_tags(service, services[service])

nodes_with_service = self.get_nodes_with_service(instance, service)

# {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0}
# {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0, 'maintenance': 0}
node_status = defaultdict(int)

for node in nodes_with_service:
Expand All @@ -345,8 +348,18 @@ def check(self, instance):
found_critical = False
found_warning = False
found_serf_health = False
found_maint_critical = False

for check in node['Checks']:

# If a node is in maintenance state, it means that, for some reason, we don't
# really expect it to be healthy. We don't really care about it, until the maintenance
# window is over. So, we just move on.
if check['CheckID'] == '_node_maintenance':
if check['Status'] == 'critical':
found_maint_critical = True
break

if check['CheckID'] == 'serfHealth':
found_serf_health = True

Expand All @@ -367,8 +380,11 @@ def check(self, instance):
# Keep looping in case there is a critical status

# Increment the counters based on what was found in Checks
# `critical` checks override `warning`s, and if neither are found, register the node as `passing`
if found_critical:
# `maintenance` checks override `critical`s, which override `warning`s. If none is found, register the node as `passing`
if found_maint_critical:
node_status['maintenance'] += 1
nodes_to_service_status[node_id]["maintenance"] += 1
elif found_critical:
node_status['critical'] += 1
nodes_to_service_status[node_id]["critical"] += 1
elif found_warning:
Expand Down
2 changes: 1 addition & 1 deletion consul/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"mac_os",
"windows"
],
"version": "1.3.0",
"version": "1.3.1",
"guid": "ec1e9fac-a339-49a3-b501-60656d2a5671",
"public_title": "Datadog-Consul Integration",
"categories":["containers", "orchestration", "configuration & deployment", "notification"],
Expand Down
2 changes: 2 additions & 0 deletions consul/metadata.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name
consul.catalog.nodes_critical,gauge,,node,,Number of nodes with service status `critical` from those registered,-1,consul,nodes crit
consul.catalog.nodes_maintenance,gauge,,node,,Number of nodes in maintenance from those registered,-1,consul,nodes maint
consul.catalog.nodes_passing,gauge,,node,,Number of nodes with service status `passing` from those registered,1,consul,nodes pass
consul.catalog.nodes_up,gauge,,node,,Number of nodes,0,consul,nodes up
consul.catalog.nodes_warning,gauge,,node,,Number of nodes with service status `warning` from those registered,-1,consul,nodes warn
consul.catalog.services_critical,gauge,,service,,Total critical services on nodes,-1,consul,svc crit
consul.catalog.services_maintenance,gauge,,service,,Total services in maintenance on nodes,-1,consul,svc maint
consul.catalog.services_passing,gauge,,service,,Total passing services on nodes,1,consul,svc pass
consul.catalog.services_up,gauge,,service,,Total services registered on nodes,0,consul,svc up
consul.catalog.services_warning,gauge,,service,,Total warning services on nodes,-1,consul,svc warn
Expand Down
78 changes: 78 additions & 0 deletions consul/test/test_consul.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,68 @@ def mock_get_nodes_with_service_critical(self, instance, service):
}
]

def mock_get_nodes_with_service_critical_in_maitenance(self, instance, service):

return [
{
"Checks": [
{
"CheckID": "_node_maintenance",
"Name": "Node Maintenance Mode",
"Node": "node-1",
"Notes": "",
"Output": "",
"ServiceID": service,
"ServiceName": "",
"Status": "critical",
},
{
"CheckID": "serfHealth",
"Name": "Serf Health Status",
"Node": "node-1",
"Notes": "",
"Output": "Agent alive and reachable",
"ServiceID": "",
"ServiceName": "",
"Status": "passing"
},
{
"CheckID": "service:{0}".format(service),
"Name": "service check {0}".format(service),
"Node": "node-1",
"Notes": "",
"Output": "Service {0} alive".format(service),
"ServiceID": service,
"ServiceName": "",
"Status": "warning"
},
{
"CheckID": "service:{0}".format(service),
"Name": "service check {0}".format(service),
"Node": "node-1",
"Notes": "",
"Output": "Service {0} alive".format(service),
"ServiceID": service,
"ServiceName": "",
"Status": "critical"
}
],
"Node": {
"Address": _get_random_ip(),
"Node": "node-1"
},
"Service": {
"Address": "",
"ID": service,
"Port": 80,
"Service": service,
"Tags": [
"az-us-east-1a"
]
}
}
]

def mock_get_coord_datacenters(self, instance):
return [{
"Datacenter": "dc1",
Expand Down Expand Up @@ -500,6 +562,22 @@ def test_get_nodes_with_service_critical(self):
self.assertMetric('consul.catalog.services_warning', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
self.assertMetric('consul.catalog.services_critical', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])

def test_get_nodes_with_service_critical_in_maintenance(self):
my_mocks = self._get_consul_mocks()
my_mocks['get_nodes_with_service'] = self.mock_get_nodes_with_service_critical_in_maitenance

self.run_check(MOCK_CONFIG, mocks=my_mocks)
self.assertMetric('consul.catalog.nodes_up', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
self.assertMetric('consul.catalog.nodes_passing', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
self.assertMetric('consul.catalog.nodes_warning', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
self.assertMetric('consul.catalog.nodes_critical', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
self.assertMetric('consul.catalog.nodes_maintenance', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
self.assertMetric('consul.catalog.services_up', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
self.assertMetric('consul.catalog.services_passing', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
self.assertMetric('consul.catalog.services_warning', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
self.assertMetric('consul.catalog.services_critical', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
self.assertMetric('consul.catalog.services_maintenance', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])

def test_service_checks(self):
my_mocks = self._get_consul_mocks()
my_mocks['consul_request'] = self.mock_get_health_check
Expand Down

0 comments on commit d69de86

Please sign in to comment.