Skip to content

Commit

Permalink
feat: add emergency events to health dashboard (#3238)
Browse files Browse the repository at this point in the history
* feat: add emergency events to health dashboard

* feat: add emergency events to health dashboard

* feat: address review comments

* feat: address review comments
  • Loading branch information
rahulguptajss authored Oct 30, 2024
1 parent 8b62df9 commit dbd23ae
Show file tree
Hide file tree
Showing 5 changed files with 265 additions and 121 deletions.
106 changes: 105 additions & 1 deletion cmd/collectors/rest/plugins/health/health.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package health

import (
"errors"
"fmt"
"github.com/netapp/harvest/v2/cmd/collectors"
"github.com/netapp/harvest/v2/cmd/poller/plugin"
Expand All @@ -14,6 +15,7 @@ import (
"github.com/tidwall/gjson"
"log/slog"
"strconv"
"strings"
"time"
)

Expand All @@ -33,6 +35,7 @@ const (
volumeRansomwareHealthMatrix = "health_volume_ransomware"
volumeMoveHealthMatrix = "health_volume_move"
licenseHealthMatrix = "health_license"
emsHealthMatrix = "health_ems"
severityLabel = "severity"
defaultDataPollDuration = 3 * time.Minute
)
Expand All @@ -44,6 +47,7 @@ type Health struct {
lastFilterTime int64
previousData map[string]*matrix.Matrix
resolutionData map[string]*matrix.Matrix
emsSeverity []string
}

func New(p *plugin.AbstractPlugin) plugin.Plugin {
Expand All @@ -66,6 +70,20 @@ func (h *Health) Init() error {
return err
}

ems := h.Params.GetChildS("ems")

// Set default severity to "emergency"
h.emsSeverity = []string{"emergency"}
if ems != nil {
severity := ems.GetChildS("severity")
if severity != nil {
severities := severity.GetAllChildContentS()
if len(severities) > 0 {
h.emsSeverity = severities
}
}
}

timeout, _ := time.ParseDuration(rest.DefaultTimeout)
if h.client, err = rest.New(conf.ZapiPoller(h.ParentParams), timeout, h.Auth); err != nil {
return err
Expand Down Expand Up @@ -147,6 +165,14 @@ func (h *Health) Run(dataMap map[string]*matrix.Matrix) ([]*matrix.Matrix, *util
h.resolutionData[k].SetGlobalLabels(data.GetGlobalLabels())
}

// Initialize emsMatrix separately as it doesn't need to be stored or processed for resolution
emsMat := matrix.New(h.Parent+emsHealthMatrix, emsHealthMatrix, emsHealthMatrix)
emsMat.SetGlobalLabels(data.GetGlobalLabels())
if err := h.initMatrix(emsHealthMatrix, "", map[string]*matrix.Matrix{emsHealthMatrix: emsMat}); err != nil {
h.SLogger.Warn("error while initializing emsHealthMatrix", slogx.Err(err))
return nil, nil, err
}

diskAlertCount := h.collectDiskAlerts()
shelfAlertCount := h.collectShelfAlerts()
supportAlertCount := h.collectSupportAlerts()
Expand All @@ -158,6 +184,7 @@ func (h *Health) Run(dataMap map[string]*matrix.Matrix) ([]*matrix.Matrix, *util
volumeRansomwareAlertCount := h.collectVolumeRansomwareAlerts()
volumeMoveAlertCount := h.collectVolumeMoveAlerts()
licenseAlertCount := h.collectLicenseAlerts()
emsAlertCount := h.collectEmsAlerts(emsMat)

resolutionInstancesCount := h.generateResolutionMetrics()

Expand All @@ -170,6 +197,8 @@ func (h *Health) Run(dataMap map[string]*matrix.Matrix) ([]*matrix.Matrix, *util
for _, value := range h.resolutionData {
result = append(result, value)
}

result = append(result, emsMat)
h.SLogger.Info(
"Collected",
slog.Int("numLicenseAlerts", licenseAlertCount),
Expand All @@ -183,12 +212,13 @@ func (h *Health) Run(dataMap map[string]*matrix.Matrix) ([]*matrix.Matrix, *util
slog.Int("numSupportAlerts", supportAlertCount),
slog.Int("numShelfAlerts", shelfAlertCount),
slog.Int("numDiskAlerts", diskAlertCount),
slog.Int("numEmsAlerts", emsAlertCount),
slog.Int("numResolutionInstanceCount", resolutionInstancesCount),
)

//nolint:gosec
h.client.Metadata.PluginInstances = uint64(diskAlertCount + shelfAlertCount + supportAlertCount + nodeAlertCount + HAAlertCount + networkEthernetPortAlertCount + networkFcpPortAlertCount +
networkInterfaceAlertCount + volumeRansomwareAlertCount + volumeMoveAlertCount + licenseAlertCount + resolutionInstancesCount)
networkInterfaceAlertCount + volumeRansomwareAlertCount + volumeMoveAlertCount + licenseAlertCount + emsAlertCount + resolutionInstancesCount)

return result, h.client.Metadata, nil
}
Expand Down Expand Up @@ -635,6 +665,51 @@ func (h *Health) collectDiskAlerts() int {
return diskAlertCount
}

// collectEmsAlerts fetches EMS events through getEmsAlerts and records them
// in emsMat.
//
// The instance key is the event's message name, so one instance is created
// per distinct message name. The node, message, source, and severity labels
// are taken from the first record seen for that name; each further record
// with the same name only increments that instance's alert metric by one.
//
// It returns the number of instances created (distinct message names), not
// the number of events. It returns 0 when the events cannot be fetched.
func (h *Health) collectEmsAlerts(emsMat *matrix.Matrix) int {
	emsAlertCount := 0

	records, err := h.getEmsAlerts()
	if err != nil {
		// APINotFound REST errors are only logged at debug level; every other
		// failure is logged as an error.
		if errs.IsRestErr(err, errs.APINotFound) {
			h.SLogger.Debug("API not found", slogx.Err(err))
		} else {
			h.SLogger.Error("Failed to collect ems data", slogx.Err(err))
		}
		return 0
	}

	for _, record := range records {
		node := record.Get("node.name").String()
		severity := record.Get("message.severity").String()
		message := record.Get("message.name").String()
		source := record.Get("source").String()

		instance := emsMat.GetInstance(message)
		if instance == nil {
			// First event with this message name: create the instance and
			// start its alert count at 1.
			instance, err = emsMat.NewInstance(message)
			if err != nil {
				// Include the error so the reason for the failure is not lost.
				h.SLogger.Warn(
					"error while creating instance",
					slogx.Err(err),
					slog.String("key", message),
				)
				continue
			}
			instance.SetLabel("node", node)
			instance.SetLabel("message", message)
			instance.SetLabel("source", source)
			instance.SetLabel(severityLabel, severity)
			h.setAlertMetric(emsMat, instance, 1)
			emsAlertCount++
			continue
		}

		// Repeated message name: increment the existing alert count by 1.
		currentCount, metricErr := h.getAlertMetric(emsMat, instance)
		if metricErr != nil {
			h.SLogger.Error("Failed to get alert metric", slogx.Err(metricErr))
			continue
		}
		h.setAlertMetric(emsMat, instance, currentCount+1)
	}

	return emsAlertCount
}

func (h *Health) getDisks() ([]gjson.Result, error) {
fields := []string{"name", "container_type"}
query := "api/storage/disks"
Expand Down Expand Up @@ -761,6 +836,26 @@ func (h *Health) getEthernetPorts() ([]gjson.Result, error) {
return collectors.InvokeRestCall(h.client, href, h.SLogger)
}

// getEmsAlerts requests EMS events from api/support/ems/events.
//
// Two filters are applied: the event time must be at or after the time
// reported by collectors.GetClusterTime minus 24 hours (sent as Unix seconds),
// and the message severity must match one of h.emsSeverity (joined with "|").
// Only the node, message, and source fields are requested.
//
// It returns the records produced by collectors.InvokeRestCall, or the error
// from GetClusterTime when the cluster time cannot be obtained.
func (h *Health) getEmsAlerts() ([]gjson.Result, error) {
	now, err := collectors.GetClusterTime(h.client, nil, h.SLogger)
	if err != nil {
		return nil, err
	}

	// Start of the 24-hour look-back window, relative to the cluster's clock.
	since := now.Add(-24 * time.Hour).Unix()

	filters := []string{
		fmt.Sprintf("time=>=%d", since),
		"message.severity=" + strings.Join(h.emsSeverity, "|"),
	}

	href := rest.NewHrefBuilder().
		APIPath("api/support/ems/events").
		Fields([]string{"node,message,source"}).
		MaxRecords(collectors.DefaultBatchSize).
		Filter(filters).
		Build()

	return collectors.InvokeRestCall(h.client, href, h.SLogger)
}

func (h *Health) getSupportAlerts(filter []string) ([]gjson.Result, error) {
query := "api/private/support/alerts"
href := rest.NewHrefBuilder().
Expand Down Expand Up @@ -813,6 +908,15 @@ func (h *Health) setAlertMetric(mat *matrix.Matrix, instance *matrix.Instance, v
}
}

// getAlertMetric returns the current value of the "alerts" metric of mat for
// the given instance. It returns an error when mat has no "alerts" metric.
func (h *Health) getAlertMetric(mat *matrix.Matrix, instance *matrix.Instance) (float64, error) {
	alerts := mat.GetMetric("alerts")
	if alerts == nil {
		return 0, errors.New("alert metric doesn't exist")
	}

	// The second result of GetValueFloat64 is intentionally discarded
	// (presumably a "value present" flag — TODO confirm).
	value, _ := alerts.GetValueFloat64(instance)
	return value, nil
}

func (h *Health) generateResolutionMetrics() int {
resolutionInstancesCount := 0
for prevKey, prevMat := range h.previousData {
Expand Down
9 changes: 9 additions & 0 deletions cmd/tools/generate/counter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1328,6 +1328,15 @@ counters:
ONTAPCounter: Harvest generated
Template: conf/rest/9.6.0/health.yaml

- Name: health_ems_alerts
Description: The health_ems_alerts metric monitors EMS (Event Management System) events, providing a count of events based on their severity and other attributes. This metric includes labels such as node, message, source, and severity (e.g., emergency, alert, error).
By default, it monitors alerts with emergency severity.
APIs:
- API: REST
Endpoint: NA
ONTAPCounter: Harvest generated
Template: conf/rest/9.6.0/health.yaml

- Name: qos_policy_adaptive_absolute_min_iops
Description: Specifies the absolute minimum IOPS that is used as an override when the expected_iops is less than this value.
APIs:
Expand Down
10 changes: 9 additions & 1 deletion conf/rest/9.6.0/health.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ counters:
- ^name

plugins:
- Health
- Health:
# Description:
# This configuration enables the Health plugin to monitor EMS alerts based on specified severities.
# - severity: A list of severities to monitor. Possible values are emergency, alert, error, notice, informational, debug
ems:
severity:
- emergency
# - alert
# - error

export_data: false
9 changes: 9 additions & 0 deletions docs/ontap-metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -3288,6 +3288,15 @@ Provides any issues related to Disks health check if disks are broken or unassig
| REST | `NA` | `Harvest generated` | conf/rest/9.6.0/health.yaml |


### health_ems_alerts

The health_ems_alerts metric monitors EMS (Event Management System) events, providing a count of events based on their severity and other attributes. This metric includes labels such as node, message, source, and severity (e.g., emergency, alert, error). By default, it monitors alerts with emergency severity.

| API | Endpoint | Metric | Template |
|--------|----------|--------|---------|
| REST | `NA` | `Harvest generated` | conf/rest/9.6.0/health.yaml |


### health_ha_alerts

Provides any issues related to HA health check. Value of 1 means issue is happening and 0 means that issue is resolved.
Expand Down
Loading

0 comments on commit dbd23ae

Please sign in to comment.