From 93f344fb9f5b43965605386e916e6a38870693e7 Mon Sep 17 00:00:00 2001 From: Diogo Kiss Date: Fri, 16 Mar 2018 15:56:17 +0100 Subject: [PATCH] [consul] add maintenance metric to Consul catalog. Currently, if a node in Consul is set into maintenance state, it is reported as a node in critical state to Datadog metrics. This makes it confusing to determine by the metric whether it is a node failing with problems or a planned intervention. A user made a PR to Datadog some time ago, but it was not merged due to Datadog code organization changes and got forgotten (DataDog/dd-agent#2496). I'm pushing the change forward. I've tested it using dd-agent version 5.8.0. This PR also depends on https://github.com/DataDog/dd-agent/pull/3708 --- consul/datadog_checks/consul/consul.py | 22 +++++++- consul/manifest.json | 2 +- consul/test/test_consul.py | 78 ++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 4 deletions(-) diff --git a/consul/datadog_checks/consul/consul.py b/consul/datadog_checks/consul/consul.py index fad679ebb72304..7eccfe31149fc8 100644 --- a/consul/datadog_checks/consul/consul.py +++ b/consul/datadog_checks/consul/consul.py @@ -63,6 +63,7 @@ class ConsulCheck(AgentCheck): 'passing': AgentCheck.OK, 'warning': AgentCheck.WARNING, 'critical': AgentCheck.CRITICAL, + 'maintenance': AgentCheck.MAINTENANCE } STATUS_SEVERITY = { @@ -70,6 +71,7 @@ AgentCheck.OK: 1, AgentCheck.WARNING: 2, AgentCheck.CRITICAL: 3, + AgentCheck.MAINTENANCE: 4, } def __init__(self, name, init_config, agentConfig, instances=None): @@ -322,12 +324,13 @@ def check(self, instance): # `consul.catalog.nodes_passing` : # of Nodes with service status `passing` from those registered # `consul.catalog.nodes_warning` : # of Nodes with service status `warning` from those registered # `consul.catalog.nodes_critical` : # of Nodes with service status `critical` from those registered + # `consul.catalog.nodes_maintenance` : # of Nodes set in maintenance from those 
registered service_tags = self._get_service_tags(service, services[service]) nodes_with_service = self.get_nodes_with_service(instance, service) - # {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0} + # {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0, 'maintenance': 0} node_status = defaultdict(int) for node in nodes_with_service: @@ -345,8 +348,18 @@ def check(self, instance): found_critical = False found_warning = False found_serf_health = False + found_maint_critical = False for check in node['Checks']: + + # If a node is in maintenance state, it means that, for some reason, we don't + # really expect it to be healthy. We don't really care about it, until the maintenance + # window is over. So, we just move on. + if check['CheckID'] == '_node_maintenance': + if check['Status'] == 'critical': + found_maint_critical = True + break + if check['CheckID'] == 'serfHealth': found_serf_health = True @@ -367,8 +380,11 @@ def check(self, instance): # Keep looping in case there is a critical status # Increment the counters based on what was found in Checks - # `critical` checks override `warning`s, and if neither are found, register the node as `passing` - if found_critical: + # `maintenance` checks override `critical`s, which override `warning`s. 
If none is found, register the node as `passing` + if found_maint_critical: + node_status['maintenance'] += 1 + nodes_to_service_status[node_id]["maintenance"] += 1 + elif found_critical: node_status['critical'] += 1 nodes_to_service_status[node_id]["critical"] += 1 elif found_warning: diff --git a/consul/manifest.json b/consul/manifest.json index d942a33824a48e..29141e2e1eac0f 100644 --- a/consul/manifest.json +++ b/consul/manifest.json @@ -11,7 +11,7 @@ "mac_os", "windows" ], - "version": "1.3.0", + "version": "1.3.1", "guid": "ec1e9fac-a339-49a3-b501-60656d2a5671", "public_title": "Datadog-Consul Integration", "categories":["containers", "orchestration", "configuration & deployment", "notification"], diff --git a/consul/test/test_consul.py b/consul/test/test_consul.py index 492d10d15a468e..0984b7034e8b18 100644 --- a/consul/test/test_consul.py +++ b/consul/test/test_consul.py @@ -287,6 +287,68 @@ def mock_get_nodes_with_service_critical(self, instance, service): } ] + def mock_get_nodes_with_service_critical_in_maitenance(self, instance, service): + + return [ + { + "Checks": [ + { + "CheckID": "_node_maintenance", + "Name": "Node Maintenance Mode", + "Node": "node-1", + "Notes": "", + "Output": "", + "ServiceID": service, + "ServiceName": "", + "Status": "critical", + }, + { + "CheckID": "serfHealth", + "Name": "Serf Health Status", + "Node": "node-1", + "Notes": "", + "Output": "Agent alive and reachable", + "ServiceID": "", + "ServiceName": "", + "Status": "passing" + }, + { + "CheckID": "service:{0}".format(service), + "Name": "service check {0}".format(service), + "Node": "node-1", + "Notes": "", + "Output": "Service {0} alive".format(service), + "ServiceID": service, + "ServiceName": "", + "Status": "warning" + }, + { + "CheckID": "service:{0}".format(service), + "Name": "service check {0}".format(service), + "Node": "node-1", + "Notes": "", + "Output": "Service {0} alive".format(service), + "ServiceID": service, + "ServiceName": "", + "Status": "critical" 
+ } + ], + "Node": { + "Address": _get_random_ip(), + "Node": "node-1" + }, + "Service": { + "Address": "", + "ID": service, + "Port": 80, + "Service": service, + "Tags": [ + "az-us-east-1a" + ] + } + } + ] + def mock_get_coord_datacenters(self, instance): return [{ "Datacenter": "dc1", @@ -500,6 +562,22 @@ def test_get_nodes_with_service_critical(self): self.assertMetric('consul.catalog.services_warning', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) self.assertMetric('consul.catalog.services_critical', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + def test_get_nodes_with_service_critical_in_maintenance(self): + my_mocks = self._get_consul_mocks() + my_mocks['get_nodes_with_service'] = self.mock_get_nodes_with_service_critical_in_maitenance + + self.run_check(MOCK_CONFIG, mocks=my_mocks) + self.assertMetric('consul.catalog.nodes_up', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a']) + self.assertMetric('consul.catalog.nodes_passing', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a']) + self.assertMetric('consul.catalog.nodes_warning', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a']) + self.assertMetric('consul.catalog.nodes_critical', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a']) + self.assertMetric('consul.catalog.nodes_maintenance', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a']) + self.assertMetric('consul.catalog.services_up', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_passing', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_warning', value=0, 
tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_critical', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_maintenance', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + def test_service_checks(self): my_mocks = self._get_consul_mocks() my_mocks['consul_request'] = self.mock_get_health_check