Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: 检查Agent、bkmonitorbeat异常状态并发送邮件告知运维 (closed #2512) #2537

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions apps/node_man/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,8 @@ def _get_member__alias_map(cls) -> Dict[Enum, str]:
QUERY_MODULE_ID_THRESHOLD = 15
UPDATE_CMDB_CLOUD_AREA_LIMIT = 50
VERSION_PATTERN = re.compile(r"[vV]?(\d+\.){1,5}\d+(-rc\d)?$")
# Chunk size used when querying the process status table in batches
PROC_CHUNK_SIZE = 30_000
# 语义化版本正则,参考:https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string
SEMANTIC_VERSION_PATTERN = re.compile(
r"^v?(?P<major>0|[1-9]\d*)\.(?P<minor>0|[1-9]\d*)\.(?P<patch>0|[1-9]\d*)"
Expand Down
2 changes: 2 additions & 0 deletions apps/node_man/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ class KeyEnum(Enum):
QUERY_PROC_STATUS_HOST_LENS = "QUERY_PROC_STATUS_HOST_LENS"
# 业务最大插件版本
PLUGIN_VERSION_CONFIG = "PLUGIN_VERSION_CONFIG"
# Blacklist of businesses that mails must not be sent to
SEND_MAIL_BIZ_BLACKLIST = "SEND_MAIL_BIZ_BLACKLIST"

key = models.CharField(_("键"), max_length=255, db_index=True, primary_key=True)
v_json = JSONField(_("值"))
Expand Down
3 changes: 3 additions & 0 deletions apps/node_man/periodic_tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,6 @@

if getattr(settings, "CONFIG_POLICY_BY_TENCENT_VPC", False):
from .configuration_policy import configuration_policy # noqa

# Register the maintainer-mail periodic task only when the Taihu mail gateway is
# fully configured. The settings checked here must match the ones the Taihu
# client itself requires (sender, token and API root); checking a setting name
# that is never defined would silently keep the task from ever being registered.
if all(getattr(settings, attr, False) for attr in ["TAIHU_MAIL_SENDER", "TAIHU_TOKEN", "TAIHU_API_ROOT"]):
    from .send_mail_to_maintainer import send_mail_to_maintainer_periodic_task  # noqa
99 changes: 99 additions & 0 deletions apps/node_man/periodic_tasks/send_mail_to_maintainer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
"""
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-节点管理(BlueKing-BK-NODEMAN) available.
Copyright (C) 2017-2022 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at https://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""
import collections
from typing import Any, Dict, List, Set

from celery.schedules import crontab
from celery.task import periodic_task
from django.conf import settings
from django.db.models import QuerySet

from apps.node_man import constants, models
from common.api import CCApi
from common.api.modules.taihu_apis import taihu_client
from common.log import logger


def _count_terminated_hosts_by_biz(proc_name: str) -> collections.Counter:
    """Count hosts whose process ``proc_name`` is TERMINATED, grouped by business ID.

    :param proc_name: process name stored in ``ProcessStatus.name``
    :return: Counter mapping ``bk_biz_id`` -> number of hosts with a terminated process
    """
    # Slicing a queryset without a stable ordering may return overlapping or
    # missing rows between pages, so order explicitly. ``distinct()`` makes sure
    # a host with several matching status rows is counted once even when those
    # rows would otherwise land in different chunks.
    terminated_host_ids: QuerySet = (
        models.ProcessStatus.objects.filter(status=constants.ProcStateType.TERMINATED, name=proc_name)
        .values_list("bk_host_id", flat=True)
        .order_by("bk_host_id")
        .distinct()
    )

    counter = collections.Counter()
    for offset in range(0, terminated_host_ids.count(), constants.PROC_CHUNK_SIZE):
        host_ids: Set[int] = set(terminated_host_ids[offset : offset + constants.PROC_CHUNK_SIZE])
        # Counter.update consumes the iterable directly; one entry per host row.
        counter.update(models.Host.objects.filter(bk_host_id__in=host_ids).values_list("bk_biz_id", flat=True))
    return counter


def send_mail_to_maintainer(task_id):
    """Mail each business's maintainers the number of abnormal Agents / bkmonitorbeat plugins.

    Businesses without maintainers, businesses unknown to CMDB and businesses
    listed in the ``SEND_MAIL_BIZ_BLACKLIST`` global setting are skipped. A
    failure to send one mail is logged and does not stop the remaining mails.

    :param task_id: ID of the triggering task, only used for logging
    :return: None
    """
    logger.info(f"start send_mail_to_maintainer, task_id -> {task_id}")

    query_kwargs = {"fields": ["bk_biz_id", "bk_biz_name", "bk_biz_maintainer"]}
    try:
        biz_infos: List[Dict[str, Any]] = CCApi.search_business(query_kwargs)["info"]
        # Map business ID -> business info, dropping businesses with no maintainer
        biz_id_biz_info_map: Dict[int, Dict[str, Any]] = {
            biz_info["bk_biz_id"]: biz_info for biz_info in biz_infos if biz_info["bk_biz_maintainer"]
        }
    except Exception as e:
        # Best effort: without business info there is nobody to notify.
        logger.exception(f"get business info error: {str(e)}")
        return

    # Per-business counts of hosts with an abnormal Agent / bkmonitorbeat
    agent_counter = _count_terminated_hosts_by_biz(models.ProcessStatus.GSE_AGENT_PROCESS_NAME)
    plugin_counter = _count_terminated_hosts_by_biz("bkmonitorbeat")

    final_handle_biz = set(agent_counter) | set(plugin_counter)
    biz_blacklist = models.GlobalSettings.get_config(
        key=models.GlobalSettings.KeyEnum.SEND_MAIL_BIZ_BLACKLIST.value, default=[]
    )
    for bk_biz_id in final_handle_biz:
        biz_info = biz_id_biz_info_map.get(bk_biz_id)
        # Skip businesses without maintainer info and blacklisted businesses
        if not biz_info or bk_biz_id in biz_blacklist:
            continue
        biz_name = biz_info["bk_biz_name"]
        biz_maintainer = biz_info["bk_biz_maintainer"]
        try:
            taihu_client.send_mail(
                to=biz_maintainer,
                title="业务-{}-ID-{}:Agent-bkmonitorbeat状态异常通知".format(biz_name, bk_biz_id),
                content="Agent异常数量: {}, bkmonitorbeat异常数量: {}, 详情点击<a href={} target='_blank'>节点管理</a>".format(
                    agent_counter[bk_biz_id], plugin_counter[bk_biz_id], settings.BK_NODEMAN_URL
                ),
            )
        except Exception as e:
            # One failing recipient must not block the other businesses.
            logger.exception(f"bk_biz_id -> {bk_biz_id} send mail to maintainer error: {str(e)}")
            continue

    logger.info(f"send mail to maintainer success, task_id -> {task_id}")


@periodic_task(
    run_every=crontab(minute="0", hour="9", day_of_week="*", day_of_month="*", month_of_year="*"),
    queue="default",
    options={"queue": "default"},
)
def send_mail_to_maintainer_periodic_task():
    """Periodic entry point: send the abnormal-status mail to business maintainers."""
    # The ID of the current task request is passed on purely for log correlation.
    send_mail_to_maintainer(send_mail_to_maintainer_periodic_task.request.id)
69 changes: 69 additions & 0 deletions common/api/modules/taihu_apis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# -*- coding: utf-8 -*-
"""
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-节点管理(BlueKing-BK-NODEMAN) available.
Copyright (C) 2017-2022 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at https://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""
import hashlib
import time
from uuid import uuid4

import requests
from django.conf import settings


class TaiHuApis(object):
    """Small HTTP client for the Taihu gateway APIs (currently only mail sending).

    Every request carries the ``x-rio-*`` authentication headers built by
    :attr:`request_headers`.
    """

    def __init__(self):
        # NOTE: the attribute is spelled ``passid`` (sic); it is sent as ``x-rio-paasid``.
        self.passid = settings.APP_CODE
        self.sender = settings.TAIHU_MAIL_SENDER
        self.token = settings.TAIHU_TOKEN
        self.url_root = settings.TAIHU_API_ROOT
        self.session = requests.Session()

    @property
    def random_timestamp(self) -> str:
        """Current Unix time in whole seconds, as a string."""
        return str(int(time.time()))

    @property
    def request_headers(self) -> dict:
        """Build the signed request headers.

        Signature algorithm:
        x-rio-signature = sha256(x-rio-timestamp + Token + x-rio-nonce + x-rio-timestamp).upper()
        """
        # Read each property exactly once so the signed values are the ones sent.
        timestamp = self.random_timestamp
        nonce = self.random_nonce
        string = timestamp + self.token + nonce + timestamp
        signature = hashlib.sha256(string.encode()).hexdigest().upper()
        return {
            "x-rio-paasid": self.passid,
            "x-rio-nonce": nonce,
            "x-rio-timestamp": timestamp,
            "x-rio-signature": signature,
        }

    @property
    def random_nonce(self) -> str:
        """A fresh random nonce (UUID4 string)."""
        return str(uuid4())

    def send_mail(self, to: str, title: str, content: str, timeout: float = 10):
        """Send a mail through the gateway.

        :param to: recipient(s), passed through as the ``To`` field
        :param title: mail subject
        :param content: mail body
        :param timeout: seconds to wait for the gateway before giving up
        :raises requests.RequestException: on connection errors, timeouts or a
            non-2xx HTTP response, so callers can tell a failed send from a
            successful one
        """
        data = {
            "From": self.sender,
            "To": to,
            "Title": title,
            "Content": content,
        }
        response = self.session.post(
            # Tolerate an API root configured with a trailing slash.
            url=self.url_root.rstrip("/") + "/ebus/tof4_msg/api/v1/Message/SendMailInfo",
            headers=self.request_headers,
            json=data,
            # Without a timeout a stalled gateway would block the worker indefinitely.
            timeout=timeout,
        )
        # Surface HTTP-level failures instead of silently dropping them.
        response.raise_for_status()


# NOTE: when adding a new Taihu API, make sure the token and API root are configured
# in the environment. If any required setting is missing, the name is bound to the
# built-in ``object`` type as a placeholder instead of a real client.
taihu_client = (
    TaiHuApis()
    if all(getattr(settings, name, False) for name in ("TAIHU_MAIL_SENDER", "TAIHU_TOKEN", "TAIHU_API_ROOT"))
    else object
)
6 changes: 6 additions & 0 deletions config/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,8 @@ def get_standard_redis_mode(cls, config_redis_mode: str, default: Optional[str]
BK_NODEMAN_API_ADDR = os.getenv("BK_NODEMAN_API_ADDR", "")
BK_NODEMAN_NGINX_DOWNLOAD_PORT = os.getenv("BK_NODEMAN_NGINX_DOWNLOAD_PORT") or 17980
BK_NODEMAN_NGINX_PROXY_PASS_PORT = os.getenv("BK_NODEMAN_NGINX_PROXY_PASS_PORT") or 17981
# Access URL of NodeMan (empty string when the environment variable is unset)
BK_NODEMAN_URL = os.environ.get("BK_NODEMAN_URL", "")

# 使用标准运维开通策略相关变量
BKAPP_REQUEST_EE_SOPS_APP_CODE = os.getenv("BKAPP_REQUEST_EE_SOPS_APP_CODE")
Expand Down Expand Up @@ -824,6 +826,10 @@ def get_standard_redis_mode(cls, config_redis_mode: str, default: Optional[str]

# 腾讯云endpoint
TXY_ENDPOINT = env.TXY_ENDPOINT
# Taihu: mail sender, token and API root (each is None when its environment variable is unset)
TAIHU_MAIL_SENDER = os.environ.get("TAIHU_MAIL_SENDER")
TAIHU_TOKEN = os.environ.get("TAIHU_TOKEN")
TAIHU_API_ROOT = os.environ.get("TAIHU_API_ROOT")

# ==============================================================================
# 可观测
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,4 +137,7 @@ data:
TXY_ENDPOINT: "{{ .Values.config.TXYEndpoint }}"
TXY_SECRETID: "{{ .Values.config.TXYSecretId }}"
TXY_SECRETKEY: "{{ .Values.config.TXYSecretKey }}"
BKAPP_UNASSIGNED_CLOUD_ID: "{{ .Values.config.bkAppUnassignedCloudId}}"
BKAPP_UNASSIGNED_CLOUD_ID: "{{ .Values.config.bkAppUnassignedCloudId }}"
TAIHU_MAIL_SENDER: "{{ .Values.config.TaiHuMailSender }}"
TAIHU_TOKEN: "{{ .Values.config.TaiHuToken }}"
TAIHU_API_ROOT: "{{ .Values.config.TaiHuApiRoot }}"