Skip to content

Commit

Permalink
BIGTOP-4304: Support Prometheus configuration (apache#127)
Browse files Browse the repository at this point in the history
  • Loading branch information
lhpqaq authored Dec 25, 2024
1 parent 928f6ce commit a6d80e4
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 8 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
<?xml version="1.0"?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ https://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->

<configuration>
<!--
Name of the generated rules file. PrometheusParams.rules() reads this value,
PrometheusSetup.config() writes the rendered rules to {confDir}/{rules_file_name},
and the prometheus.yml template lists it under rule_files when it is set.
-->
<property>
<name>rules_file_name</name>
<value>prometheus_rules.yml</value>
<description>Rules file name</description>
</property>
<!--
Body of the rules file. PrometheusSetup.config() passes this text through
LinuxFileUtils.toFileByTemplate together with the global params map, so it is
treated as a template rather than copied verbatim.
NOTE(review): the {{ $labels.instance }} placeholders look like Prometheus alert
templating that is meant to reach the rules file unchanged; confirm the template
engine leaves them untouched.
-->
<property>
<name>content</name>
<description>This is the freemarker template for rules file</description>
<value><![CDATA[
groups:
# Recording rules group: Used to calculate and save new aggregated metrics
- name: example_recording_rules
interval: 1m # The frequency at which the rules are evaluated
rules:
# Recording rule: Calculate the average CPU usage over the last 5 minutes for each job
- record: job:cpu_usage:avg
expr: avg(rate(node_cpu_seconds_total{mode="user"}[5m])) by (job)
# This creates a new metric `job:cpu_usage:avg` representing the average CPU usage per job
# Alerting rules group: Used to trigger alerts based on conditions
- name: example_alerting_rules
interval: 1m # The frequency at which the alerting rules are evaluated
rules:
# Alerting rule: Trigger an alert if the average CPU usage is over 90% for the last 5 minutes
- alert: HighCpuUsage
expr: avg(rate(node_cpu_seconds_total{mode="user"}[5m])) by (instance) > 0.9
# This expression checks if the average CPU usage over the last 5 minutes for each instance is greater than 90%
for: 5m # The condition must hold true for 5 minutes before the alert is triggered
labels:
severity: critical # Set the severity of the alert as 'critical'
annotations:
summary: "CPU usage on instance {{ $labels.instance }} is over 90% for the last 5 minutes"
# Summary of the alert that will appear when it triggers
description: "The CPU usage on instance {{ $labels.instance }} has been over 90% for the past 5 minutes."
# Detailed description of the alert that will provide more context
]]>
</value>
<!-- Declares the value type as "longtext". -->
<attrs>
<type>longtext</type>
</attrs>
</property>
</configuration>
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
-->

<configuration>
<property>
<name>port</name>
<description>Port on which Prometheus server listens</description>
<value>9090</value>
</property>
<property>
<name>content</name>
<description>This is the freemarker template for prometheus.yml file</description>
Expand All @@ -31,17 +36,26 @@ global:
external_labels:
monitor: 'codelab-monitor'
# Rule files specifies a list of globs. Rules and alerts are read from
# all matching files.
rule_files:
<#if rules_file_name??>
- ${rules_file_name}
</#if>
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: 'prometheus'
# Override the global default and scrape targets from this job every 5 seconds.
scrape_interval: 5s
<#list scrape_jobs as job>
- job_name: '${job.name}'
<#if job.metrics_path??>
metrics_path: "${job.metrics_path}"
</#if>
file_sd_configs:
- files: ['${job.targets_file}']
static_configs:
- targets: ['localhost:9090']
</#list>
]]>
</value>
<attrs>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
package org.apache.bigtop.manager.stack.infra.v1_0_0.prometheus;

import org.apache.bigtop.manager.common.message.entity.payload.CommandPayload;
import org.apache.bigtop.manager.stack.core.annotations.GlobalParams;
import org.apache.bigtop.manager.stack.core.spi.param.Params;
import org.apache.bigtop.manager.stack.core.utils.LocalSettings;
import org.apache.bigtop.manager.stack.infra.param.InfraParams;

import com.google.auto.service.AutoService;
Expand All @@ -28,23 +30,101 @@
import lombok.extern.slf4j.Slf4j;

import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Parameters for the Prometheus service.
 *
 * <p>Collects the values needed to render {@code prometheus.yml}, the rules file and the
 * per-job target files: the listen port, the raw template contents, and one scrape job
 * description per monitored endpoint (Prometheus itself and the agents).
 *
 * <p>Each scrape job is a map with the keys {@code name}, {@code targets_file},
 * {@code targets_list} and, optionally, {@code metrics_path}.
 */
@Getter
@Slf4j
@AutoService(Params.class)
@NoArgsConstructor
public class PrometheusParams extends InfraParams {

    protected final String PROMETHEUS_SELF_JOB_NAME = "prometheus";
    protected final String BM_AGENT_JOB_NAME = "bm-agent";
    protected final String BM_AGENT_PORT = "8081";

    private Map<String, Object> prometheusScrapeJob;
    private Map<String, Object> agentScrapeJob;
    private List<Map<String, Object>> scrapeJobs;
    private String prometheusPort;
    private String prometheusContent;
    private String prometheusRulesFilename;
    private String prometheusRulesFileContent;

    /**
     * Builds the parameters for a command and publishes the scrape jobs and the rules file
     * name into the global params map under {@code scrape_jobs} and {@code rules_file_name}.
     *
     * <p>NOTE(review): this reads {@code prometheusScrapeJob}, {@code agentScrapeJob} and
     * {@code prometheusRulesFilename}, which are only assigned by the {@code @GlobalParams}
     * methods below. Presumably those are invoked while {@code super(commandPayload)} runs;
     * confirm, otherwise the list would contain {@code null} entries.
     *
     * @param commandPayload payload of the command being executed
     */
    public PrometheusParams(CommandPayload commandPayload) {
        super(commandPayload);
        scrapeJobs = new ArrayList<>();
        scrapeJobs.add(prometheusScrapeJob);
        scrapeJobs.add(agentScrapeJob);
        globalParamsMap.put("scrape_jobs", scrapeJobs);
        globalParamsMap.put("rules_file_name", prometheusRulesFilename);
    }

    /**
     * @return the data directory, {@code {serviceHome}/data}
     */
    public String dataDir() {
        return MessageFormat.format("{0}/data", serviceHome());
    }

    /**
     * @param jobName name of the scrape job
     * @return path of the targets file of that job, {@code {confDir}/{jobName}_targets.json}
     */
    public String targetsConfigFile(String jobName) {
        return MessageFormat.format("{0}/{1}_targets.json", confDir(), jobName);
    }

    @Override
    public String getServiceName() {
        return "prometheus";
    }

    /**
     * Builds the agent scrape targets, one {@code host:port} entry per host of the
     * {@code all} group, using {@link #BM_AGENT_PORT} as the port.
     *
     * @return a new list of targets; empty when the {@code all} group is missing
     */
    protected List<String> getAllHost() {
        List<String> hosts = new ArrayList<>();
        List<String> ips = LocalSettings.hosts().get("all");
        // The lookup yields null when no "all" entry exists; an empty target list is
        // returned in that case instead of failing with a NullPointerException.
        if (ips == null) {
            return hosts;
        }
        for (String ip : ips) {
            hosts.add(MessageFormat.format("{0}:{1}", ip, BM_AGENT_PORT));
        }
        return hosts;
    }

    /**
     * Reads the {@code prometheus} configuration, records the configured port and builds
     * the scrape job through which Prometheus monitors itself on {@code localhost:{port}}.
     *
     * @return the {@code prometheus} configuration map
     */
    @GlobalParams
    public Map<String, Object> prometheusJob() {
        Map<String, Object> configuration = LocalSettings.configurations(getServiceName(), "prometheus");
        prometheusPort = (String) configuration.get("port");
        Map<String, Object> job = new HashMap<>();
        job.put("name", PROMETHEUS_SELF_JOB_NAME);
        job.put("targets_file", targetsConfigFile(PROMETHEUS_SELF_JOB_NAME));
        job.put("targets_list", List.of(MessageFormat.format("localhost:{0}", prometheusPort)));
        prometheusScrapeJob = job;
        return configuration;
    }

    /**
     * Builds the scrape job for the agents, which are scraped on the
     * {@code /actuator/prometheus} metrics path of every host returned by
     * {@link #getAllHost()}.
     *
     * @return the {@code prometheus} configuration map
     */
    @GlobalParams
    public Map<String, Object> agentJob() {
        Map<String, Object> job = new HashMap<>();
        job.put("name", BM_AGENT_JOB_NAME);
        job.put("targets_file", targetsConfigFile(BM_AGENT_JOB_NAME));
        job.put("targets_list", getAllHost());
        job.put("metrics_path", "/actuator/prometheus");
        agentScrapeJob = job;
        return LocalSettings.configurations(getServiceName(), "prometheus");
    }

    /**
     * Reads the {@code prometheus} configuration and keeps its {@code content} entry, the
     * template of {@code prometheus.yml}.
     *
     * @return the {@code prometheus} configuration map
     */
    @GlobalParams
    public Map<String, Object> configs() {
        Map<String, Object> configuration = LocalSettings.configurations(getServiceName(), "prometheus");

        prometheusContent = (String) configuration.get("content");
        return configuration;
    }

    /**
     * Reads the {@code prometheus-rule} configuration and keeps the rules file name and the
     * rules file template.
     *
     * @return the {@code prometheus-rule} configuration map
     */
    @GlobalParams
    public Map<String, Object> rules() {
        Map<String, Object> configuration = LocalSettings.configurations(getServiceName(), "prometheus-rule");

        prometheusRulesFilename = (String) configuration.get("rules_file_name");
        prometheusRulesFileContent = (String) configuration.get("content");
        return configuration;
    }

    /**
     * @return the listen address, {@code 0.0.0.0:{port}}, using the port recorded by
     *     {@link #prometheusJob()}
     */
    public String listenAddress() {
        return MessageFormat.format("0.0.0.0:{0}", prometheusPort);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ public ShellResult start(Params params) {
configure(params);
PrometheusParams prometheusParams = (PrometheusParams) params;
String cmd = MessageFormat.format(
"nohup {0}/prometheus --config.file={0}/prometheus.yml --storage.tsdb.path={0}/data > {0}/nohup.out 2>&1 &",
prometheusParams.serviceHome());
"nohup {0}/prometheus --config.file={1}/prometheus.yml --web.listen-address={2} --storage.tsdb.path={0}/data > {0}/nohup.out 2>&1 &",
prometheusParams.serviceHome(), prometheusParams.confDir(), prometheusParams.listenAddress());
try {
ShellResult shellResult = LinuxOSUtils.sudoExecCmd(cmd, prometheusParams.user());
if (shellResult.getExitCode() != 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,61 @@

import org.apache.bigtop.manager.common.constants.Constants;
import org.apache.bigtop.manager.common.shell.ShellResult;
import org.apache.bigtop.manager.stack.core.enums.ConfigType;
import org.apache.bigtop.manager.stack.core.spi.param.Params;
import org.apache.bigtop.manager.stack.core.utils.linux.LinuxFileUtils;

import lombok.AccessLevel;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import java.text.MessageFormat;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Writes the on-disk configuration of the Prometheus service.
 */
@Slf4j
@NoArgsConstructor(access = AccessLevel.PRIVATE)
public class PrometheusSetup {

    /**
     * Creates the data and configuration directories, renders {@code prometheus.yml} and
     * the rules file from their templates, and writes one JSON targets file per scrape job.
     *
     * @param params parameters of the service; must be a {@link PrometheusParams}
     * @return a success result once all files have been written
     */
    public static ShellResult config(Params params) {
        PrometheusParams prometheusParams = (PrometheusParams) params;
        String owner = prometheusParams.user();
        String ownerGroup = prometheusParams.group();

        // Directories first: the files below are written into the configuration directory.
        LinuxFileUtils.createDirectories(
                prometheusParams.dataDir(), owner, ownerGroup, Constants.PERMISSION_755, true);
        LinuxFileUtils.createDirectories(
                prometheusParams.confDir(), owner, ownerGroup, Constants.PERMISSION_755, true);

        // Main configuration file.
        String mainConfigPath = MessageFormat.format("{0}/prometheus.yml", prometheusParams.confDir());
        LinuxFileUtils.toFileByTemplate(
                prometheusParams.getPrometheusContent(),
                mainConfigPath,
                owner,
                ownerGroup,
                Constants.PERMISSION_644,
                prometheusParams.getGlobalParamsMap());

        // Rules file, named after the configured rules file name.
        String rulesPath = MessageFormat.format(
                "{0}/{1}", prometheusParams.confDir(), prometheusParams.getPrometheusRulesFilename());
        LinuxFileUtils.toFileByTemplate(
                prometheusParams.getPrometheusRulesFileContent(),
                rulesPath,
                owner,
                ownerGroup,
                Constants.PERMISSION_644,
                prometheusParams.getGlobalParamsMap());

        // One targets file per scrape job, holding a single {"targets": [...]} entry.
        for (Map<String, Object> scrapeJob : prometheusParams.getScrapeJobs()) {
            @SuppressWarnings("unchecked")
            List<String> targetList = (List<String>) scrapeJob.get("targets_list");
            String targetsPath = (String) scrapeJob.get("targets_file");

            Map<String, List<String>> targetGroup = new HashMap<>();
            targetGroup.put("targets", targetList);

            LinuxFileUtils.toFile(
                    ConfigType.JSON,
                    targetsPath,
                    owner,
                    ownerGroup,
                    Constants.PERMISSION_644,
                    List.of(targetGroup));
        }

        return ShellResult.success("Prometheus Configure success!");
    }
}

0 comments on commit a6d80e4

Please sign in to comment.