diff --git a/helm-charts/tca/templates/main/servicemonitor.yaml b/helm-charts/tca/templates/main/servicemonitor.yaml index efa25b998..4cf1b1b5a 100644 --- a/helm-charts/tca/templates/main/servicemonitor.yaml +++ b/helm-charts/tca/templates/main/servicemonitor.yaml @@ -11,4 +11,7 @@ spec: - port: http path: /prometheus/metrics interval: 30s + - port: http + path: api/nodes/nodestate/metrics/ + interval: 1m {{- end }} \ No newline at end of file diff --git a/server/projects/main/apps/nodemgr/api_urls/v1.py b/server/projects/main/apps/nodemgr/api_urls/v1.py index c1372fc51..05862bbd8 100644 --- a/server/projects/main/apps/nodemgr/api_urls/v1.py +++ b/server/projects/main/apps/nodemgr/api_urls/v1.py @@ -20,4 +20,5 @@ path("/heartbeat/", v1.NodeHeartBeatApiView.as_view(), name="apiv1_node_heart_beat"), path("/status/", v1.NodeStatusApiView.as_view(), name="apiv1_node_status"), path("register/", v1.NodeRegisterApiView.as_view(), name="apiv1_node_register"), + path("nodestate/metrics/", v1.NodeStateExporterApiView.as_view(), name="apiv1_node_state_metrics"), ] diff --git a/server/projects/main/apps/nodemgr/apis/v1.py b/server/projects/main/apps/nodemgr/apis/v1.py index ede018dca..741ebde36 100644 --- a/server/projects/main/apps/nodemgr/apis/v1.py +++ b/server/projects/main/apps/nodemgr/apis/v1.py @@ -22,6 +22,8 @@ from rest_framework.permissions import IsAdminUser from rest_framework.response import Response from rest_framework.views import APIView +from prometheus_client import CollectorRegistry, Gauge, generate_latest +from django.http import HttpResponse # 项目内 import from apps.authen.backends import TCANodeTokenBackend @@ -120,3 +122,40 @@ def post(self, request): raise ParseError("未指定团队org_sid,无法注册节点") data = NodeManager.register_node(request, slz.validated_data) return Response(data) + + +class NodeStateExporterApiView(APIView): + """节点状态查询接口 + + ### GET + 应用场景:查询所有节点池中的全部节点数、活跃节点数以及空闲节点数,为Prometheus节点状态监控提供metrics。 + """ + authentication_classes = [] + 
# No permission classes are applied to this view.
# NOTE(review): presumably left open so Prometheus can scrape without credentials - confirm.
permission_classes = []

def get(self, request, *args, **kwargs):
    """Render per-tag node counts in the Prometheus text exposition format.

    For every ``ExecTag`` row, three gauges labelled with the tag name are set:

    * ``nodes_total``  - nodes attached to the tag
    * ``nodes_active`` - of those, nodes with ``enabled=1``
    * ``nodes_free``   - of those, nodes with ``enabled=1`` and ``state=0``

    Args:
        request: Incoming request; not inspected.
        *args: Unused positional arguments from the URL resolver.
        **kwargs: Unused keyword arguments from the URL resolver.

    Returns:
        HttpResponse: status 200 carrying the rendered metrics, or status 500
        with a one-line plain-text body if collecting or rendering fails.
    """
    # Imported here so the method is self-contained with respect to the
    # module-level import list.
    from prometheus_client import CONTENT_TYPE_LATEST

    # A registry created per request: the gauges below are registered on it
    # rather than on the process-wide default registry.
    registry = CollectorRegistry()
    label_names = ['tag']

    nodes_total = Gauge('nodes_total', 'Total number of nodes', label_names, registry=registry)
    nodes_active = Gauge('nodes_active', 'Number of active nodes', label_names, registry=registry)
    nodes_free = Gauge('nodes_free', 'Number of free nodes', label_names, registry=registry)

    try:
        # Each tag is treated as one node pool.
        for tag in models.ExecTag.objects.all():
            # Build the tag filter once and narrow it for each metric.
            # NOTE(review): enabled=1 / state=0 are presumably the "active" and
            # "free" values of the Node model - confirm against its enums.
            pool_nodes = models.Node.objects.filter(exec_tags=tag.id)
            enabled_nodes = pool_nodes.filter(enabled=1)

            nodes_total.labels(tag.name).set(pool_nodes.count())
            nodes_active.labels(tag.name).set(enabled_nodes.count())
            nodes_free.labels(tag.name).set(enabled_nodes.filter(state=0).count())

        # Advertise the exposition-format content type instead of bare text/plain.
        return HttpResponse(generate_latest(registry), status=200, content_type=CONTENT_TYPE_LATEST)
    except Exception as e:
        # Top-level boundary for the scrape: log with traceback and report a 500
        # rather than letting the exception propagate.
        logger.exception(e)
        return HttpResponse('# HELP Error occured', status=500, content_type="text/plain")