Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Add reference to HostUpdatePolicy in Servicing. #1969

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions apis/metal3.io/v1alpha1/baremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,10 @@ const (
// OperationalStatusDetached is the status value when the host is
// marked unmanaged via the detached annotation.
OperationalStatusDetached OperationalStatus = "detached"

// OperationalStatusServicing is the status value when the host is
// undergoing servicing (e.g. checking firmware settings).
OperationalStatusServicing OperationalStatus = "servicing"
)

// OperationalStatusAllowed represents the allowed values of OperationalStatus.
Expand Down Expand Up @@ -179,6 +183,9 @@ const (
// DetachError is an error condition occurring when the
// controller is unable to detatch the host from the provisioner.
DetachError ErrorType = "detach error"
// ServicingError is an error condition occurring when
// service steps failed.
ServicingError ErrorType = "servicing error"
)

// ErrorTypeAllowed represents the allowed values of ErrorType.
Expand Down Expand Up @@ -800,12 +807,12 @@ type BareMetalHostStatus struct {
// after modifying this file

// OperationalStatus holds the status of the host
// +kubebuilder:validation:Enum="";OK;discovered;error;delayed;detached
// +kubebuilder:validation:Enum="";OK;discovered;error;delayed;detached;servicing
OperationalStatus OperationalStatus `json:"operationalStatus"`

// ErrorType indicates the type of failure encountered when the
// OperationalStatus is OperationalStatusError
// +kubebuilder:validation:Enum=provisioned registration error;registration error;inspection error;preparation error;provisioning error;power management error
// +kubebuilder:validation:Enum=provisioned registration error;registration error;inspection error;preparation error;provisioning error;power management error;servicing error
ErrorType ErrorType `json:"errorType,omitempty"`

// LastUpdated identifies when this status was last observed.
Expand Down
2 changes: 2 additions & 0 deletions config/base/crds/bases/metal3.io_baremetalhosts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,7 @@ spec:
- preparation error
- provisioning error
- power management error
- servicing error
type: string
goodCredentials:
description: The last credentials we were able to validate as working.
Expand Down Expand Up @@ -832,6 +833,7 @@ spec:
- error
- delayed
- detached
- servicing
type: string
poweredOn:
description: Whether or not the host is currently powered on. This
Expand Down
2 changes: 2 additions & 0 deletions config/render/capm3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,7 @@ spec:
- preparation error
- provisioning error
- power management error
- servicing error
type: string
goodCredentials:
description: The last credentials we were able to validate as working.
Expand Down Expand Up @@ -832,6 +833,7 @@ spec:
- error
- delayed
- detached
- servicing
type: string
poweredOn:
description: Whether or not the host is currently powered on. This
Expand Down
137 changes: 118 additions & 19 deletions controllers/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,8 @@ func recordActionFailure(info *reconcileInfo, errorType metal3api.ErrorType, err
metal3api.InspectionError: "InspectionError",
metal3api.ProvisioningError: "ProvisioningError",
metal3api.PowerManagementError: "PowerManagementError",
metal3api.PreparationError: "PreparationError",
metal3api.ServicingError: "ServicingError",
}[errorType]

counter := actionFailureCounters.WithLabelValues(eventType)
Expand Down Expand Up @@ -460,9 +462,9 @@ func hasInspectAnnotation(host *metal3api.BareMetalHost) bool {
return false
}

// clearError removes any existing error message.
func clearError(host *metal3api.BareMetalHost) (dirty bool) {
dirty = host.SetOperationalStatus(metal3api.OperationalStatusOK)
// clearErrorWithStatus removes any existing error message and sets operational status.
func clearErrorWithStatus(host *metal3api.BareMetalHost, status metal3api.OperationalStatus) (dirty bool) {
dirty = host.SetOperationalStatus(status)
var emptyErrType metal3api.ErrorType
if host.Status.ErrorType != emptyErrType {
host.Status.ErrorType = emptyErrType
Expand All @@ -475,6 +477,11 @@ func clearError(host *metal3api.BareMetalHost) (dirty bool) {
return dirty
}

// clearError removes any existing error message.
func clearError(host *metal3api.BareMetalHost) (dirty bool) {
return clearErrorWithStatus(host, metal3api.OperationalStatusOK)
}

// setErrorMessage updates the ErrorMessage in the host Status struct
// and increases the ErrorCount.
func setErrorMessage(host *metal3api.BareMetalHost, errType metal3api.ErrorType, message string) {
Expand Down Expand Up @@ -882,7 +889,6 @@ func (r *BareMetalHostReconciler) registerHost(prov provisioner.Provisioner, inf

// Create the hostFirmwareSettings resource with same host name/namespace if it doesn't exist
// Create the hostFirmwareComponents resource with same host name/namespace if it doesn't exist
// Set owner reference on hostUpdatePolicy resource if not set
if info.host.Name != "" {
if !info.host.DeletionTimestamp.IsZero() {
info.log.Info(fmt.Sprintf("will not attempt to create new hostFirmwareSettings and hostFirmwareComponents in %s", info.host.Namespace))
Expand All @@ -895,10 +901,6 @@ func (r *BareMetalHostReconciler) registerHost(prov provisioner.Provisioner, inf
info.log.Info("failed creating hostfirmwarecomponents")
return actionError{errors.Wrap(err, "failed creating hostFirmwareComponents")}
}
if err = r.hostUpdatePolicySetOwnerReference(info); err != nil {
info.log.Info("failed setting owner reference on hostupdatepolicy")
return actionError{errors.Wrap(err, "failed setting owner reference on hostUpdatePolicy")}
}
}
}
// Reaching this point means the credentials are valid and worked,
Expand Down Expand Up @@ -1355,6 +1357,86 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio
return actionComplete{}
}

func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner, info *reconcileInfo, hup *metal3api.HostUpdatePolicy) (result actionResult) {
servicingData := provisioner.ServicingData{}

// (NOTE)janders: since Servicing is an opt-in feature that requires HostUpdatePolicy to be created and set to onReboot
// set below booleans to false by default and change to true based on policy settings

var fwDirty bool
var hfsDirty bool
var liveFirmwareSettingsAllowed bool

if hup != nil {
liveFirmwareSettingsAllowed = (hup.Spec.FirmwareSettings == metal3api.HostUpdatePolicyOnReboot)
}

if liveFirmwareSettingsAllowed {
// handling pre-HFS FirmwareSettings here
if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) {
servicingData.FirmwareConfig = info.host.Spec.Firmware
fwDirty = true
}
// handling HFS based FirmwareSettings here
var hfs *metal3api.HostFirmwareSettings
var err error
hfsDirty, hfs, err = r.getHostFirmwareSettings(info)
if err != nil {
return actionError{fmt.Errorf("could not determine updated settings: %w", err)}
}
if hfsDirty {
servicingData.ActualFirmwareSettings = hfs.Status.Settings
servicingData.TargetFirmwareSettings = hfs.Spec.Settings
}
}

dirty := fwDirty || hfsDirty

// Even if settings are clean, we need to check the result of the current servicing.
if !dirty && info.host.Status.OperationalStatus != metal3api.OperationalStatusServicing && info.host.Status.ErrorType != metal3api.ServicingError {
// If nothing is going on, return control to the power management.
return nil
}

// FIXME(janders/dtantsur): this implementation may lead to a scenario where if we never actually
// succeed before leaving this state (e.g. by deprovisioning) we lose the signal that the
// update didn't actually happen. This is deemed an acceptable risk for the moment since it is only
// going to impact a small subset of Firmware Settings implementations.
currentError := info.host.Status.ErrorType
if clearErrorWithStatus(info.host, metal3api.OperationalStatusServicing) {
dirty = true
}
provResult, _, err := prov.Service(servicingData, dirty,
currentError == metal3api.ServicingError)
if err != nil {
return actionError{fmt.Errorf("error servicing host: %w", err)}
}
if provResult.ErrorMessage != "" {
result = recordActionFailure(info, metal3api.ServicingError, provResult.ErrorMessage)
return result
}

if fwDirty {
info.host.Status.Provisioning.Firmware = info.host.Spec.Firmware.DeepCopy()
}

if provResult.Dirty {
result := actionContinue{provResult.RequeueAfter}
if dirty {
return actionUpdate{result}
}
return result
}

// Servicing is finished at this point, clean up operational status
if clearErrorWithStatus(info.host, metal3api.OperationalStatusOK) {
// FIXME(janders/dtantsur): this can be racy. We should consider
// using a generation number to decide if we start servicing or not.
return actionUpdate{actionContinue{delay: subResourceNotReadyRetryDelay}}
}
return nil
}

// Check the current power status against the desired power status.
func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, info *reconcileInfo) actionResult {
var provResult provisioner.Result
Expand All @@ -1368,12 +1450,20 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner,
if hwState.PoweredOn != nil && *hwState.PoweredOn != info.host.Status.PoweredOn {
info.log.Info("updating power status", "discovered", *hwState.PoweredOn)
info.host.Status.PoweredOn = *hwState.PoweredOn
clearError(info.host)
if info.host.Status.OperationalStatus == metal3api.OperationalStatusError && info.host.Status.ErrorType == metal3api.PowerManagementError {
clearError(info.host)
}
return actionUpdate{}
}

desiredPowerOnState := info.host.Spec.Online

provState := info.host.Status.Provisioning.State
// Normal reboots only work in provisioned states, changing online is also possible for available hosts.
isProvisioned := provState == metal3api.StateProvisioned || provState == metal3api.StateExternallyProvisioned
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like that we're doing state-dependent stuff outside of the state machine here. Maybe we could pass this in as an argument?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

$ grep metal3api.State controllers/metal3.io/baremetalhost_controller.go | wc -l
9

Also, this is a copy of an existing line (see below).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consensus: follow up material potentially.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

adding a FIXME note for this also

// FIXME(janders/dtantsur) it would be preferrable to pass in state as an argument
// however this falls outside the scope of this specific change.

if !info.host.Status.PoweredOn {
if _, suffixlessAnnotationExists := info.host.Annotations[metal3api.RebootAnnotationPrefix]; suffixlessAnnotationExists {
delete(info.host.Annotations, metal3api.RebootAnnotationPrefix)
Expand All @@ -1385,10 +1475,19 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner,
return actionContinue{}
}
}
hup, err := r.hostUpdatePolicySetOwnerReference(info)
if err != nil {
info.log.Info("failed setting owner reference on hostupdatepolicy")
return actionError{errors.Wrap(err, "failed setting owner reference on hostUpdatePolicy")}
}

provState := info.host.Status.Provisioning.State
// Normal reboots only work in provisioned states, changing online is also possible for available hosts.
isProvisioned := provState == metal3api.StateProvisioned || provState == metal3api.StateExternallyProvisioned
servicingAllowed := isProvisioned && !info.host.Status.PoweredOn && desiredPowerOnState
if servicingAllowed || info.host.Status.OperationalStatus == metal3api.OperationalStatusServicing || info.host.Status.ErrorType == metal3api.ServicingError {
result := r.doServiceIfNeeded(prov, info, hup)
if result != nil {
return result
}
}

desiredReboot, desiredRebootMode := hasRebootAnnotation(info, !isProvisioned)
rhjanders marked this conversation as resolved.
Show resolved Hide resolved

Expand Down Expand Up @@ -1862,7 +1961,7 @@ func (r *BareMetalHostReconciler) createHostFirmwareSettings(info *reconcileInfo
return nil
}

func (r *BareMetalHostReconciler) hostUpdatePolicySetOwnerReference(info *reconcileInfo) error {
func (r *BareMetalHostReconciler) hostUpdatePolicySetOwnerReference(info *reconcileInfo) (policy *metal3api.HostUpdatePolicy, err error) {
// NOTE(janders) the goal here is to ensure that the controller reads the hup resource and adds OwnerReference to it
hup := &metal3api.HostUpdatePolicy{}
if err := r.Get(info.ctx, info.request.NamespacedName, hup); err != nil {
Expand All @@ -1872,23 +1971,23 @@ func (r *BareMetalHostReconciler) hostUpdatePolicySetOwnerReference(info *reconc
// garbage collected. For additional cleanup logic use
// finalizers. Return and don't requeue

return nil
return nil, nil
}
// Error reading the object
return fmt.Errorf("could not load hostUpdatePolicy resource due to %w", err)
return nil, fmt.Errorf("could not load hostUpdatePolicy resource due to %w", err)
}
if !ownerReferenceExists(info.host, hup) {
if err := controllerutil.SetOwnerReference(info.host, hup, r.Scheme()); err != nil {
return fmt.Errorf("could not set bmh as owner for hostUpdatePolicy due to %w", err)
return hup, fmt.Errorf("could not set bmh as owner for hostUpdatePolicy due to %w", err)
}
if err := r.Update(info.ctx, hup); err != nil {
return fmt.Errorf("failure updating hostUpdatePolicy resource due to %w", err)
return hup, fmt.Errorf("failure updating hostUpdatePolicy resource due to %w", err)
}

return nil
return hup, nil
}

return nil
return hup, nil
}

// Get the stored firmware settings if there are valid changes.
Expand Down
Loading