Skip to content

Commit

Permalink
Merge pull request #1156 from yyyangw/feature/prometheus_metrics_node
Browse files Browse the repository at this point in the history
增加“节点池中空闲节点情况”相关的prometheus监控数据metrics
  • Loading branch information
Lingghh authored Aug 6, 2024
2 parents a188866 + cbf6a68 commit 2fbad7e
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 0 deletions.
3 changes: 3 additions & 0 deletions helm-charts/tca/templates/main/servicemonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,7 @@ spec:
- port: http
path: /prometheus/metrics
interval: 30s
- port: http
path: api/nodes/nodestate/metrics/
interval: 1m
{{- end }}
1 change: 1 addition & 0 deletions server/projects/main/apps/nodemgr/api_urls/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@
path("<int:node_id>/heartbeat/", v1.NodeHeartBeatApiView.as_view(), name="apiv1_node_heart_beat"),
path("<int:node_id>/status/", v1.NodeStatusApiView.as_view(), name="apiv1_node_status"),
path("register/", v1.NodeRegisterApiView.as_view(), name="apiv1_node_register"),
path("nodestate/metrics/", v1.NodeStateExporterApiView.as_view(), name="apiv1_node_state_metrics"),
]
39 changes: 39 additions & 0 deletions server/projects/main/apps/nodemgr/apis/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from rest_framework.permissions import IsAdminUser
from rest_framework.response import Response
from rest_framework.views import APIView
from prometheus_client import CollectorRegistry, Gauge, generate_latest
from django.http import HttpResponse

# 项目内 import
from apps.authen.backends import TCANodeTokenBackend
Expand Down Expand Up @@ -120,3 +122,40 @@ def post(self, request):
raise ParseError("未指定团队org_sid,无法注册节点")
data = NodeManager.register_node(request, slz.validated_data)
return Response(data)


class NodeStateExporterApiView(APIView):
    """Node state metrics endpoint.

    ### GET
    Use case: report, for every node pool (exec tag), the total number of
    nodes, the number of enabled nodes and the number of free nodes, rendered
    in the Prometheus text exposition format for node state monitoring.
    """

    # No authentication or permission checks are applied to this view.
    authentication_classes = []
    permission_classes = []

    def get(self, request, *args, **kwargs):
        """Render per-tag node gauges as a Prometheus metrics payload.

        Returns:
            HttpResponse: status 200 with the serialized registry on success;
            status 500 with a plain-text comment line if anything raises
            while collecting or serializing the metrics.
        """
        # Local imports keep this block self-contained; both packages are
        # already dependencies of this module.
        from django.db.models import Count, Q
        from prometheus_client import CONTENT_TYPE_LATEST

        # A fresh registry per request: only the gauges built below are
        # exported, and nothing leaks between scrapes.
        registry = CollectorRegistry()
        label_names = ["tag"]

        nodes_total = Gauge("nodes_total", "Total number of nodes", label_names, registry=registry)
        nodes_active = Gauge("nodes_active", "Number of active nodes", label_names, registry=registry)
        nodes_free = Gauge("nodes_free", "Number of free nodes", label_names, registry=registry)

        try:
            # One grouped query instead of three COUNT queries per tag.
            # order_by() clears any default model ordering so that it cannot
            # leak into the GROUP BY and split the per-tag groups.
            grouped = (
                models.Node.objects.filter(exec_tags__isnull=False)
                .order_by()
                .values("exec_tags")
                .annotate(
                    total=Count("pk"),
                    active=Count("pk", filter=Q(enabled=1)),
                    # "free" means enabled and with state == 0.
                    free=Count("pk", filter=Q(enabled=1, state=0)),
                )
            )
            counts_by_tag = {row["exec_tags"]: row for row in grouped}
            no_nodes = {"total": 0, "active": 0, "free": 0}

            # Iterate over every tag so that pools without any node are still
            # exported with zero values.
            for tag in models.ExecTag.objects.all():
                counts = counts_by_tag.get(tag.id, no_nodes)
                nodes_total.labels(tag.name).set(counts["total"])
                nodes_active.labels(tag.name).set(counts["active"])
                nodes_free.labels(tag.name).set(counts["free"])

            # CONTENT_TYPE_LATEST advertises the exposition format version and
            # charset that generate_latest() actually produces.
            return HttpResponse(generate_latest(registry), status=200, content_type=CONTENT_TYPE_LATEST)
        except Exception as e:
            # Boundary handler: log the traceback and still answer the scraper
            # with a plain-text body rather than a framework error page.
            logger.exception(e)
            return HttpResponse('# HELP Error occured', status=500, content_type="text/plain")

0 comments on commit 2fbad7e

Please sign in to comment.