Skip to content

Commit

Permalink
Add endpoint for machine issues. (#471)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gerrit91 authored Oct 11, 2023
1 parent 6f952f6 commit 72b0fc0
Show file tree
Hide file tree
Showing 28 changed files with 2,056 additions and 203 deletions.
122 changes: 122 additions & 0 deletions cmd/metal-api/internal/issues/asn-uniqueness.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package issues

import (
"fmt"
"sort"
"strings"

"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"
)

const (
// TypeASNUniqueness is the issue type identifier "asn-not-unique".
TypeASNUniqueness Type = "asn-not-unique"
)

type (
// issueASNUniqueness is the issue whose Spec reports TypeASNUniqueness.
issueASNUniqueness struct {
// details is the text returned by Details; Evaluate fills it in when
// it finds at least one overlapping ASN.
details string
}
)

// Spec returns the static description of this issue: its type, severity,
// human-readable description and the URL of the troubleshooting reference.
func (i *issueASNUniqueness) Spec() *spec {
	s := spec{
		Type:        TypeASNUniqueness,
		Severity:    SeverityMinor,
		Description: "The ASN is not unique (only impact on firewalls)",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#asn-not-unique",
	}

	return &s
}

// Evaluate reports whether the firewall m uses an ASN that is also configured
// on another machine in c.Machines that is allocated as a firewall.
//
// Machines that are not allocated with the firewall role never raise this
// issue, and networks with an ASN of 0 are ignored on both sides. When at
// least one overlap is found, one line per affected ASN is stored for Details
// (lines sorted lexicographically) and true is returned. Every other firewall
// is listed at most once per ASN, even if it carries that ASN on several of
// its networks. ec is not used by this check.
func (i *issueASNUniqueness) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool {
	isFirewall := func(m metal.Machine) bool {
		return m.Allocation != nil && m.Allocation.Role == metal.RoleFirewall
	}

	if !isFirewall(m) {
		return false
	}

	// asnOwner identifies one (ASN, machine) pairing so that a machine is
	// recorded only once per ASN.
	type asnOwner struct {
		asn uint32
		id  string
	}

	var (
		// sharedWith maps every non-zero ASN of m to the IDs of the other
		// firewalls that use the same ASN, in the order they were found.
		sharedWith = map[uint32][]string{}
		seen       = map[asnOwner]struct{}{}
	)

	// Register the ASNs of m; only these keys can collect overlaps below.
	for _, n := range m.Allocation.MachineNetworks {
		if n.ASN != 0 {
			sharedWith[n.ASN] = nil
		}
	}

	for _, other := range c.Machines {
		if other.ID == m.ID || !isFirewall(other) {
			continue
		}

		for _, n := range other.Allocation.MachineNetworks {
			// ASN 0 is never registered above, so it is skipped here as well.
			if _, tracked := sharedWith[n.ASN]; !tracked {
				continue
			}

			key := asnOwner{asn: n.ASN, id: other.ID}
			if _, dup := seen[key]; dup {
				// The same machine carries this ASN on more than one network.
				continue
			}
			seen[key] = struct{}{}

			sharedWith[n.ASN] = append(sharedWith[n.ASN], other.ID)
		}
	}

	var overlaps []string
	for asn, ids := range sharedWith {
		if len(ids) == 0 {
			continue
		}

		overlaps = append(overlaps, fmt.Sprintf("- ASN (%d) not unique, shared with %s", asn, ids))
	}

	if len(overlaps) == 0 {
		return false
	}

	// Map iteration order is random; sorting the finished lines makes the
	// details deterministic.
	sort.Strings(overlaps)

	i.details = strings.Join(overlaps, "\n")

	return true
}

// Details returns the overlap description stored by Evaluate, or the empty
// string if Evaluate has not stored one.
func (i *issueASNUniqueness) Details() string {
return i.details
}
47 changes: 47 additions & 0 deletions cmd/metal-api/internal/issues/bmc-info-outdated.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package issues

import (
"fmt"
"time"

"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"
)

const (
// TypeBMCInfoOutdated is the issue type identifier "bmc-info-outdated".
TypeBMCInfoOutdated Type = "bmc-info-outdated"
)

type (
// issueBMCInfoOutdated is the issue whose Spec reports TypeBMCInfoOutdated.
issueBMCInfoOutdated struct {
// details is the text returned by Details; Evaluate fills it in when
// the issue applies.
details string
}
)

// Details returns the explanation stored by Evaluate, or the empty string if
// Evaluate has not stored one.
func (i *issueBMCInfoOutdated) Details() string {
return i.details
}

// Evaluate reports whether the IPMI information of m is missing or stale.
//
// The issue applies when m.IPMI.LastUpdated is the zero time, or when more
// than 20 minutes have passed since that timestamp. In both cases a short
// explanation is stored for Details. ec and c are not used by this check.
func (i *issueBMCInfoOutdated) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool {
	reported := m.IPMI.LastUpdated
	if reported.IsZero() {
		i.details = "machine ipmi has never been set"
		return true
	}

	age := time.Since(reported)
	if age <= 20*time.Minute {
		return false
	}

	i.details = fmt.Sprintf("last updated %s ago", age.String())

	return true
}

// Spec returns the static description of this issue: its type, severity,
// human-readable description and the URL of the troubleshooting reference.
func (*issueBMCInfoOutdated) Spec() *spec {
	s := spec{
		Type:        TypeBMCInfoOutdated,
		Severity:    SeverityMajor,
		Description: "BMC has not been updated from either metal-hammer or metal-bmc",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-info-outdated",
	}

	return &s
}
28 changes: 28 additions & 0 deletions cmd/metal-api/internal/issues/bmc-without-ip.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package issues

import "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"

const (
// TypeBMCWithoutIP is the issue type identifier "bmc-without-ip".
TypeBMCWithoutIP Type = "bmc-without-ip"
)

type (
// issueBMCWithoutIP is the issue whose Spec reports TypeBMCWithoutIP.
// It carries no state.
issueBMCWithoutIP struct{}
)

// Spec returns the static description of this issue: its type, severity,
// human-readable description and the URL of the troubleshooting reference.
func (i *issueBMCWithoutIP) Spec() *spec {
	s := spec{
		Type:        TypeBMCWithoutIP,
		Severity:    SeverityMajor,
		Description: "BMC has no ip address",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-without-ip",
	}

	return &s
}

// Evaluate reports whether the IPMI address of m is empty. ec and c are not
// used by this check.
func (i *issueBMCWithoutIP) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool {
	hasAddress := len(m.IPMI.Address) > 0

	return !hasAddress
}

// Details always returns the empty string; this issue has no additional
// details.
func (i *issueBMCWithoutIP) Details() string {
return ""
}
28 changes: 28 additions & 0 deletions cmd/metal-api/internal/issues/bmc-without-mac.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package issues

import "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"

const (
// TypeBMCWithoutMAC is the issue type identifier "bmc-without-mac".
TypeBMCWithoutMAC Type = "bmc-without-mac"
)

type (
// issueBMCWithoutMAC is the issue whose Spec reports TypeBMCWithoutMAC.
// It carries no state.
issueBMCWithoutMAC struct{}
)

// Spec returns the static description of this issue: its type, severity,
// human-readable description and the URL of the troubleshooting reference.
func (i *issueBMCWithoutMAC) Spec() *spec {
	s := spec{
		Type:        TypeBMCWithoutMAC,
		Severity:    SeverityMajor,
		Description: "BMC has no mac address",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-without-mac",
	}

	return &s
}

// Evaluate reports whether the IPMI MAC address of m is empty. ec and c are
// not used by this check.
func (i *issueBMCWithoutMAC) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool {
	hasMAC := len(m.IPMI.MacAddress) > 0

	return !hasMAC
}

// Details always returns the empty string; this issue has no additional
// details.
func (i *issueBMCWithoutMAC) Details() string {
return ""
}
38 changes: 38 additions & 0 deletions cmd/metal-api/internal/issues/crash-loop.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package issues

import (
"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"
"github.com/metal-stack/metal-lib/pkg/pointer"
)

const (
// TypeCrashLoop is the issue type identifier "crashloop".
TypeCrashLoop Type = "crashloop"
)

type (
// issueCrashLoop is the issue whose Spec reports TypeCrashLoop.
// It carries no state.
issueCrashLoop struct{}
)

// Spec returns the static description of this issue: its type, severity,
// human-readable description and the URL of the troubleshooting reference.
func (i *issueCrashLoop) Spec() *spec {
	s := spec{
		Type:        TypeCrashLoop,
		Severity:    SeverityMajor,
		Description: "machine is in a provisioning crash loop (⭕)",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#crashloop",
	}

	return &s
}

// Evaluate reports whether the event container ec is flagged as being in a
// crash loop, unless the first event in ec.Events is a waiting event.
// m and c are not used by this check.
func (i *issueCrashLoop) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool {
	if !ec.CrashLoop {
		return false
	}

	// Machines which are waiting are not considered to have issues, even if
	// the crash loop flag is set.
	first := pointer.FirstOrZero(ec.Events)

	return first.Event != metal.ProvisioningEventWaiting
}

// Details always returns the empty string; this issue has no additional
// details.
func (i *issueCrashLoop) Details() string {
return ""
}
41 changes: 41 additions & 0 deletions cmd/metal-api/internal/issues/failed-machine-reclaim.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package issues

import (
"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"
"github.com/metal-stack/metal-lib/pkg/pointer"
)

const (
// TypeFailedMachineReclaim is the issue type identifier
// "failed-machine-reclaim".
TypeFailedMachineReclaim Type = "failed-machine-reclaim"
)

type (
// issueFailedMachineReclaim is the issue whose Spec reports
// TypeFailedMachineReclaim. It carries no state.
issueFailedMachineReclaim struct{}
)

// Spec returns the static description of this issue: its type, severity,
// human-readable description and the URL of the troubleshooting reference.
func (i *issueFailedMachineReclaim) Spec() *spec {
	s := spec{
		Type:        TypeFailedMachineReclaim,
		Severity:    SeverityCritical,
		Description: "machine phones home but not allocated",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#failed-machine-reclaim",
	}

	return &s
}

// Evaluate reports whether m is affected by a failed machine reclaim.
//
// The issue applies when ec is flagged with FailedMachineReclaim, or when m
// has no allocation while the first event in ec.Events is a phoned-home event.
// c is not used by this check.
func (i *issueFailedMachineReclaim) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool {
	switch {
	case ec.FailedMachineReclaim:
		return true
	case m.Allocation == nil && pointer.FirstOrZero(ec.Events).Event == metal.ProvisioningEventPhonedHome:
		// compatibility: before the provisioning FSM was renewed, this state
		// could only be detected this way, so the condition is kept.
		return true
	default:
		return false
	}
}

// Details always returns the empty string; this issue has no additional
// details.
func (i *issueFailedMachineReclaim) Details() string {
return ""
}
Loading

0 comments on commit 72b0fc0

Please sign in to comment.