Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

增加“节点池中空闲节点情况”相关的prometheus监控数据metrics #1156

Merged
merged 1 commit into from
Aug 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions helm-charts/tca/templates/main/servicemonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,7 @@ spec:
- port: http
path: /prometheus/metrics
interval: 30s
- port: http
path: api/nodes/nodestate/metrics/
interval: 1m
{{- end }}
1 change: 1 addition & 0 deletions server/projects/main/apps/nodemgr/api_urls/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@
path("<int:node_id>/heartbeat/", v1.NodeHeartBeatApiView.as_view(), name="apiv1_node_heart_beat"),
path("<int:node_id>/status/", v1.NodeStatusApiView.as_view(), name="apiv1_node_status"),
path("register/", v1.NodeRegisterApiView.as_view(), name="apiv1_node_register"),
path("nodestate/metrics/", v1.NodeStateExporterApiView.as_view(), name="apiv1_node_state_metrics"),
]
39 changes: 39 additions & 0 deletions server/projects/main/apps/nodemgr/apis/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from rest_framework.permissions import IsAdminUser
from rest_framework.response import Response
from rest_framework.views import APIView
from prometheus_client import CollectorRegistry, Gauge, generate_latest
from django.http import HttpResponse

# 项目内 import
from apps.authen.backends import TCANodeTokenBackend
Expand Down Expand Up @@ -120,3 +122,40 @@ def post(self, request):
raise ParseError("未指定团队org_sid,无法注册节点")
data = NodeManager.register_node(request, slz.validated_data)
return Response(data)


class NodeStateExporterApiView(APIView):
    """Node state metrics endpoint for Prometheus.

    ### GET
    Use case: for every node pool (one pool per exec tag) report the total
    number of nodes, the number of active (enabled) nodes and the number of
    free nodes, rendered in the Prometheus text exposition format.
    """
    # NOTE(review): authentication and permission checks are both disabled, so
    # this endpoint answers unauthenticated requests - presumably so that the
    # Prometheus scraper can reach it without credentials. Confirm that it is
    # only exposed on a trusted network.
    authentication_classes = []
    permission_classes = []

    def get(self, request, *args, **kwargs):
        """Return the per-tag node gauges as a Prometheus scrape response.

        Returns:
            HttpResponse: status 200 with the rendered metrics on success;
            status 500 with a short plain-text body if collecting or
            rendering the metrics raises.
        """
        # Imported locally so this view does not depend on edits to the
        # module-level import block.
        from django.db.models import Count, Q
        from prometheus_client import CONTENT_TYPE_LATEST

        # A registry private to this request: the gauges below are registered
        # on it instead of on the process-wide default registry.
        registry = CollectorRegistry()
        label_names = ['tag']

        # Metric definitions, one sample per tag name.
        nodes_total = Gauge('nodes_total', 'Total number of nodes', label_names, registry=registry)
        nodes_active = Gauge('nodes_active', 'Number of active nodes', label_names, registry=registry)
        nodes_free = Gauge('nodes_free', 'Number of free nodes', label_names, registry=registry)

        try:
            # Every exec tag is treated as one node pool.
            for tag in models.ExecTag.objects.all():
                # One conditional aggregate per tag replaces three separate
                # COUNT queries. "active" means enabled == 1, and "free"
                # additionally requires state == 0.
                counts = models.Node.objects.filter(exec_tags=tag.id).aggregate(
                    total=Count('pk'),
                    active=Count('pk', filter=Q(enabled=1)),
                    free=Count('pk', filter=Q(enabled=1, state=0)),
                )

                nodes_total.labels(tag.name).set(counts['total'])
                nodes_active.labels(tag.name).set(counts['active'])
                nodes_free.labels(tag.name).set(counts['free'])

            # CONTENT_TYPE_LATEST is the content type prometheus_client
            # publishes for the output of generate_latest().
            return HttpResponse(generate_latest(registry), status=200, content_type=CONTENT_TYPE_LATEST)
        except Exception as e:
            # Boundary handler: log the full traceback and answer with a 500
            # so the failure shows up as a failed scrape instead of
            # propagating out of the view.
            logger.exception(e)
            return HttpResponse('# HELP Error occured', status=500, content_type="text/plain")
Loading