From 32be53d4ab50a6f2b7d6f04e9eb372099a111c6d Mon Sep 17 00:00:00 2001 From: Diogo Kiss Date: Fri, 16 Mar 2018 15:56:17 +0100 Subject: [PATCH] [consul] add maintenance metric to Consul catalog. Currently, if a node in Consul is set into maintenance state, it is reported as a node in critical state to Datadog metrics. This makes it confusing to determine by the metric whether it is a node failing with problems or a planned intervention. A user made a PR to Datadog some time ago, but it was not merged due to Datadog code organization changes and got forgotten (DataDog/dd-agent#2496). I'm pushing the change forward. I've tested it using dd-agent version 5.8.0. This PR also depends on https://github.com/DataDog/dd-agent/pull/3708 --- consul/CHANGELOG.md | 5 ++ consul/datadog_checks/consul/__init__.py | 2 +- consul/datadog_checks/consul/consul.py | 22 ++++++- consul/manifest.json | 2 +- consul/metadata.csv | 2 + consul/test/test_consul.py | 78 ++++++++++++++++++++++++ 6 files changed, 106 insertions(+), 5 deletions(-) diff --git a/consul/CHANGELOG.md b/consul/CHANGELOG.md index 56a30108e811a6..17bbec0e8c4ecf 100644 --- a/consul/CHANGELOG.md +++ b/consul/CHANGELOG.md @@ -1,5 +1,10 @@ # CHANGELOG - consul +1.3.1 / Unreleased +================= + +* [IMPROVEMENT] Add maintenance metrics (services_maintenance and nodes_maintenance) + 1.3.0 / 2018-01-10 ================== diff --git a/consul/datadog_checks/consul/__init__.py b/consul/datadog_checks/consul/__init__.py index 2e35d54188f47d..6e1ce5bbe74927 100644 --- a/consul/datadog_checks/consul/__init__.py +++ b/consul/datadog_checks/consul/__init__.py @@ -2,6 +2,6 @@ ConsulCheck = consul.ConsulCheck -__version__ = "1.3.0" +__version__ = "1.3.1" __all__ = ['consul'] diff --git a/consul/datadog_checks/consul/consul.py b/consul/datadog_checks/consul/consul.py index fad679ebb72304..7eccfe31149fc8 100644 --- a/consul/datadog_checks/consul/consul.py +++ b/consul/datadog_checks/consul/consul.py @@ -63,6 +63,7 @@ class
ConsulCheck(AgentCheck): 'passing': AgentCheck.OK, 'warning': AgentCheck.WARNING, 'critical': AgentCheck.CRITICAL, + 'maintenance': AgentCheck.MAINTENANCE } STATUS_SEVERITY = { @@ -70,6 +71,7 @@ class ConsulCheck(AgentCheck): AgentCheck.OK: 1, AgentCheck.WARNING: 2, AgentCheck.CRITICAL: 3, + AgentCheck.MAINTENANCE: 4, } def __init__(self, name, init_config, agentConfig, instances=None): @@ -322,12 +324,13 @@ def check(self, instance): # `consul.catalog.nodes_passing` : # of Nodes with service status `passing` from those registered # `consul.catalog.nodes_warning` : # of Nodes with service status `warning` from those registered # `consul.catalog.nodes_critical` : # of Nodes with service status `critical` from those registered + # `consul.catalog.nodes_maintenance` : # of Nodes set in maintenance from those registered service_tags = self._get_service_tags(service, services[service]) nodes_with_service = self.get_nodes_with_service(instance, service) - # {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0} + # {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0, 'maintenance': 0} node_status = defaultdict(int) for node in nodes_with_service: @@ -345,8 +348,18 @@ def check(self, instance): found_critical = False found_warning = False found_serf_health = False + found_maint_critical = False for check in node['Checks']: + + # If a node is in maintenance state, it means that, for some reason, we don't + # really expect it to be healthy. We don't really care about it, until the maintenance + # window is over. So, we just move on. 
+ if check['CheckID'] == '_node_maintenance': + if check['Status'] == 'critical': + found_maint_critical = True + break + if check['CheckID'] == 'serfHealth': found_serf_health = True @@ -367,8 +380,11 @@ def check(self, instance): # Keep looping in case there is a critical status # Increment the counters based on what was found in Checks - # `critical` checks override `warning`s, and if neither are found, register the node as `passing` - if found_critical: + # `maintenance` checks override `critical`s, which override `warning`s. If none is found, register the node as `passing` + if found_maint_critical: + node_status['maintenance'] += 1 + nodes_to_service_status[node_id]["maintenance"] += 1 + elif found_critical: node_status['critical'] += 1 nodes_to_service_status[node_id]["critical"] += 1 elif found_warning: diff --git a/consul/manifest.json b/consul/manifest.json index d942a33824a48e..29141e2e1eac0f 100644 --- a/consul/manifest.json +++ b/consul/manifest.json @@ -11,7 +11,7 @@ "mac_os", "windows" ], - "version": "1.3.0", + "version": "1.3.1", "guid": "ec1e9fac-a339-49a3-b501-60656d2a5671", "public_title": "Datadog-Consul Integration", "categories":["containers", "orchestration", "configuration & deployment", "notification"], diff --git a/consul/metadata.csv b/consul/metadata.csv index d5036e31ce3312..d98bd7ae325ea6 100644 --- a/consul/metadata.csv +++ b/consul/metadata.csv @@ -1,9 +1,11 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name consul.catalog.nodes_critical,gauge,,node,,Number of nodes with service status `critical` from those registered,-1,consul,nodes crit +consul.catalog.nodes_maintenance,gauge,,node,,Number of nodes in maintenance from those registered,-1,consul,nodes maint consul.catalog.nodes_passing,gauge,,node,,Number of nodes with service status `passing` from those registered,1,consul,nodes pass consul.catalog.nodes_up,gauge,,node,,Number of nodes,0,consul,nodes up 
consul.catalog.nodes_warning,gauge,,node,,Number of nodes with service status `warning` from those registered,-1,consul,nodes warn consul.catalog.services_critical,gauge,,service,,Total critical services on nodes,-1,consul,svc crit +consul.catalog.services_maintenance,gauge,,service,,Total services in maintenance on nodes,-1,consul,svc maint consul.catalog.services_passing,gauge,,service,,Total passing services on nodes,1,consul,svc pass consul.catalog.services_up,gauge,,service,,Total services registered on nodes,0,consul,svc up consul.catalog.services_warning,gauge,,service,,Total warning services on nodes,-1,consul,svc warn diff --git a/consul/test/test_consul.py b/consul/test/test_consul.py index 492d10d15a468e..0984b7034e8b18 100644 --- a/consul/test/test_consul.py +++ b/consul/test/test_consul.py @@ -287,6 +287,68 @@ def mock_get_nodes_with_service_critical(self, instance, service): } ] + def mock_get_nodes_with_service_critical_in_maintenance(self, instance, service): + + return [ + { + "Checks": [ + { + "CheckID": "_node_maintenance", + "Name": "Node Maintenance Mode", + "Node": "node-1", + "Notes": "", + "Output": "", + "ServiceID": service, + "ServiceName": "", + "Status": "critical", + }, + { + "CheckID": "serfHealth", + "Name": "Serf Health Status", + "Node": "node-1", + "Notes": "", + "Output": "Agent alive and reachable", + "ServiceID": "", + "ServiceName": "", + "Status": "passing" + }, + { + "CheckID": "service:{0}".format(service), + "Name": "service check {0}".format(service), + "Node": "node-1", + "Notes": "", + "Output": "Service {0} alive".format(service), + "ServiceID": service, + "ServiceName": "", + "Status": "warning" + }, + { + "CheckID": "service:{0}".format(service), + "Name": "service check {0}".format(service), + "Node": "node-1", + "Notes": "", + "Output": "Service {0} alive".format(service), + "ServiceID": service, + "ServiceName": "", + "Status": "critical" + } + ], + "Node": { + "Address": _get_random_ip(), + "Node": "node-1" + }, + "Service": { + "Address": "", + "ID": service, + "Port": 80, + "Service": service, + "Tags": [ + "az-us-east-1a" + ] + } + } + ] + def mock_get_coord_datacenters(self, instance): return [{ "Datacenter": "dc1", @@ -500,6 +562,22 @@ def test_get_nodes_with_service_critical(self): self.assertMetric('consul.catalog.services_warning', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) self.assertMetric('consul.catalog.services_critical', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + def test_get_nodes_with_service_critical_in_maintenance(self): + my_mocks = self._get_consul_mocks() + my_mocks['get_nodes_with_service'] = self.mock_get_nodes_with_service_critical_in_maintenance + + self.run_check(MOCK_CONFIG, mocks=my_mocks) + self.assertMetric('consul.catalog.nodes_up', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a']) + self.assertMetric('consul.catalog.nodes_passing', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a']) + self.assertMetric('consul.catalog.nodes_warning', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a']) + self.assertMetric('consul.catalog.nodes_critical', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a']) + self.assertMetric('consul.catalog.nodes_maintenance', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a']) + self.assertMetric('consul.catalog.services_up', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_passing', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_warning', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) +
self.assertMetric('consul.catalog.services_critical', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_maintenance', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + def test_service_checks(self): my_mocks = self._get_consul_mocks() my_mocks['consul_request'] = self.mock_get_health_check