From 9262b7f04d2142b735b3d58562eed14f7ef86bac Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Tue, 16 Apr 2024 18:32:34 +0200 Subject: [PATCH 1/5] Provisioner code for servicing Signed-off-by: Dmitry Tantsur --- .../metal3.io/host_state_machine_test.go | 4 + pkg/provisioner/demo/demo.go | 22 ++ pkg/provisioner/fixture/fixture.go | 7 + pkg/provisioner/ironic/clients/features.go | 8 + pkg/provisioner/ironic/ironic.go | 64 +++--- pkg/provisioner/ironic/servicing.go | 124 +++++++++++ pkg/provisioner/ironic/servicing_test.go | 196 ++++++++++++++++++ pkg/provisioner/ironic/testserver/ironic.go | 2 +- pkg/provisioner/provisioner.go | 10 + 9 files changed, 405 insertions(+), 32 deletions(-) create mode 100644 pkg/provisioner/ironic/servicing.go create mode 100644 pkg/provisioner/ironic/servicing_test.go diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index 6fd35754e3..2e70b08d57 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -1350,6 +1350,10 @@ func (m *mockProvisioner) Prepare(_ provisioner.PrepareData, _ bool, _ bool) (re return m.getNextResultByMethod("Prepare"), m.nextResults["Prepare"].Dirty, err } +func (m *mockProvisioner) Service(_ provisioner.ServicingData, _ bool, _ bool) (result provisioner.Result, started bool, err error) { + return m.getNextResultByMethod("Service"), m.nextResults["Service"].Dirty, err +} + func (m *mockProvisioner) Adopt(_ provisioner.AdoptData, _ bool) (result provisioner.Result, err error) { return m.getNextResultByMethod("Adopt"), err } diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index bbae60aa39..d4e8acd453 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -211,6 +211,28 @@ func (p *demoProvisioner) Prepare(_ provisioner.PrepareData, unprepared bool, _ return } +func (p *demoProvisioner) Service(_ provisioner.ServicingData, unprepared bool, _ bool) (result provisioner.Result, started bool, err error) { + hostName := p.objectMeta.Name + + switch hostName { + case PreparingErrorHost: + p.log.Info("servicing error host") + result.ErrorMessage = "servicing failed" + + case PreparingHost: + p.log.Info("servicing host") + started = unprepared + result.Dirty = true + result.RequeueAfter = time.Second * 5 + + default: + p.log.Info("finished servicing") + started = true + } + + return +} + // Adopt notifies the provisioner that the state machine believes the host // to be currently provisioned, and that it should be managed as such. func (p *demoProvisioner) Adopt(_ provisioner.AdoptData, _ bool) (result provisioner.Result, err error) { diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index 349db0b1a0..797938b955 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -210,6 +210,13 @@ func (p *fixtureProvisioner) Prepare(_ provisioner.PrepareData, unprepared bool, return } +// Service remove existing configuration and set new configuration. +func (p *fixtureProvisioner) Service(_ provisioner.ServicingData, unprepared bool, _ bool) (result provisioner.Result, started bool, err error) { + p.log.Info("servicing host") + started = unprepared + return +} + // Adopt notifies the provisioner that the state machine believes the host // to be currently provisioned, and that it should be managed as such. func (p *fixtureProvisioner) Adopt(_ provisioner.AdoptData, _ bool) (result provisioner.Result, err error) { diff --git a/pkg/provisioner/ironic/clients/features.go b/pkg/provisioner/ironic/clients/features.go index 10c4b7be91..6998d5f909 100644 --- a/pkg/provisioner/ironic/clients/features.go +++ b/pkg/provisioner/ironic/clients/features.go @@ -50,6 +50,10 @@ func (af AvailableFeatures) HasFirmwareUpdates() bool { return af.MaxVersion >= 86 } +func (af AvailableFeatures) HasServicing() bool { + return af.MaxVersion >= 87 +} + func (af AvailableFeatures) HasDataImage() bool { return af.MaxVersion >= 89 } @@ -59,6 +63,10 @@ func (af AvailableFeatures) ChooseMicroversion() string { return "1.89" } + if af.HasServicing() { + return "1.87" + } + if af.HasFirmwareUpdates() { return "1.86" } diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 7b5fbc1b38..9d3bf66e60 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -1179,31 +1179,12 @@ func (p *ironicProvisioner) ironicHasSameImage(ironicNode *nodes.Node, image met return sameImage } -func (p *ironicProvisioner) buildManualCleaningSteps(bmcAccess bmc.AccessDetails, data provisioner.PrepareData) (cleanSteps []nodes.CleanStep, err error) { - // Build raid clean steps - raidCleanSteps, err := BuildRAIDCleanSteps(bmcAccess.RAIDInterface(), data.TargetRAIDConfig, data.ActualRAIDConfig) - if err != nil { - return nil, err - } - cleanSteps = append(cleanSteps, raidCleanSteps...) - - // Get the subset (currently 3) of vendor specific BIOS settings converted from common names - var firmwareConfig *bmc.FirmwareConfig - if data.FirmwareConfig != nil { - bmcConfig := bmc.FirmwareConfig(*data.FirmwareConfig) - firmwareConfig = &bmcConfig - } - fwConfigSettings, err := bmcAccess.BuildBIOSSettings(firmwareConfig) - if err != nil { - return nil, err - } - - var newSettings []map[string]interface{} - if data.ActualFirmwareSettings != nil { +func (p *ironicProvisioner) getNewFirmwareSettings(actualFirmwareSettings metal3api.SettingsMap, targetFirmwareSettings metal3api.DesiredSettingsMap, fwConfigSettings []map[string]string) (newSettings []map[string]interface{}) { + if actualFirmwareSettings != nil { // If we have the current settings from Ironic, update the settings to contain: // 1. settings converted by BMC drivers that are different than current settings for _, fwConfigSetting := range fwConfigSettings { - if val, exists := data.ActualFirmwareSettings[fwConfigSetting["name"]]; exists { + if val, exists := actualFirmwareSettings[fwConfigSetting["name"]]; exists { if fwConfigSetting["value"] != val { newSettings = buildFirmwareSettings(newSettings, fwConfigSetting["name"], intstr.FromString(fwConfigSetting["value"])) } @@ -1213,17 +1194,15 @@ func (p *ironicProvisioner) buildManualCleaningSteps(bmcAccess bmc.AccessDetails } // 2. target settings that are different than current settings - if data.TargetFirmwareSettings != nil { - for k, v := range data.TargetFirmwareSettings { - if data.ActualFirmwareSettings[k] != v.String() { - // Skip changing this setting if it was defined in the vendor specific settings - for _, fwConfigSetting := range fwConfigSettings { - if fwConfigSetting["name"] == k { - continue - } + for k, v := range targetFirmwareSettings { + if actualFirmwareSettings[k] != v.String() { + // Skip changing this setting if it was defined in the vendor specific settings + for _, fwConfigSetting := range fwConfigSettings { + if fwConfigSetting["name"] == k { + continue } - newSettings = buildFirmwareSettings(newSettings, k, v) } + newSettings = buildFirmwareSettings(newSettings, k, v) } } } else { @@ -1233,6 +1212,29 @@ func (p *ironicProvisioner) buildManualCleaningSteps(bmcAccess bmc.AccessDetails } } + return newSettings +} + +func (p *ironicProvisioner) buildManualCleaningSteps(bmcAccess bmc.AccessDetails, data provisioner.PrepareData) (cleanSteps []nodes.CleanStep, err error) { + // Build raid clean steps + raidCleanSteps, err := BuildRAIDCleanSteps(bmcAccess.RAIDInterface(), data.TargetRAIDConfig, data.ActualRAIDConfig) + if err != nil { + return nil, err + } + cleanSteps = append(cleanSteps, raidCleanSteps...) + + // Get the subset (currently 3) of vendor specific BIOS settings converted from common names + var firmwareConfig *bmc.FirmwareConfig + if data.FirmwareConfig != nil { + bmcConfig := bmc.FirmwareConfig(*data.FirmwareConfig) + firmwareConfig = &bmcConfig + } + fwConfigSettings, err := bmcAccess.BuildBIOSSettings(firmwareConfig) + if err != nil { + return nil, err + } + + newSettings := p.getNewFirmwareSettings(data.ActualFirmwareSettings, data.TargetFirmwareSettings, fwConfigSettings) if len(newSettings) != 0 { p.log.Info("Applying BIOS config clean steps", "settings", newSettings) cleanSteps = append( diff --git a/pkg/provisioner/ironic/servicing.go b/pkg/provisioner/ironic/servicing.go new file mode 100644 index 0000000000..c669847500 --- /dev/null +++ b/pkg/provisioner/ironic/servicing.go @@ -0,0 +1,124 @@ +package ironic + +import ( + "fmt" + + "github.com/gophercloud/gophercloud/v2/openstack/baremetal/v1/nodes" + "github.com/metal3-io/baremetal-operator/pkg/hardwareutils/bmc" + "github.com/metal3-io/baremetal-operator/pkg/provisioner" +) + +func (p *ironicProvisioner) buildServiceSteps(bmcAccess bmc.AccessDetails, data provisioner.ServicingData) (serviceSteps []nodes.ServiceStep, err error) { + // Get the subset (currently 3) of vendor specific BIOS settings converted from common names + var firmwareConfig *bmc.FirmwareConfig + if data.FirmwareConfig != nil { + bmcConfig := bmc.FirmwareConfig(*data.FirmwareConfig) + firmwareConfig = &bmcConfig + } + fwConfigSettings, err := bmcAccess.BuildBIOSSettings(firmwareConfig) + if err != nil { + return nil, err + } + + newSettings := p.getNewFirmwareSettings(data.ActualFirmwareSettings, data.TargetFirmwareSettings, fwConfigSettings) + if len(newSettings) != 0 { + p.log.Info("Applying BIOS config clean steps", "settings", newSettings) + serviceSteps = append( + serviceSteps, + nodes.ServiceStep{ + Interface: nodes.InterfaceBIOS, + Step: "apply_configuration", + Args: map[string]interface{}{ + "settings": newSettings, + }, + }, + ) + } + + // TODO: Add service steps for firmware updates + + return +} + +func (p *ironicProvisioner) startServicing(bmcAccess bmc.AccessDetails, ironicNode *nodes.Node, data provisioner.ServicingData) (success bool, result provisioner.Result, err error) { + // Build service steps + serviceSteps, err := p.buildServiceSteps(bmcAccess, data) + if err != nil { + result, err = operationFailed(err.Error()) + return + } + + // Start servicing + if len(serviceSteps) != 0 { + p.log.Info("remove existing configuration and set new configuration", "serviceSteps", serviceSteps) + return p.tryChangeNodeProvisionState( + ironicNode, + nodes.ProvisionStateOpts{ + Target: nodes.TargetService, + ServiceSteps: serviceSteps, + }, + ) + } + result, err = operationComplete() + return +} + +func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared, restartOnFailure bool) (result provisioner.Result, started bool, err error) { + if !p.availableFeatures.HasServicing() { + result, err = operationFailed(fmt.Sprintf("servicing not supported: requires API version 1.87, available is 1.%d", p.availableFeatures.MaxVersion)) + return result, started, err + } + + bmcAccess, err := p.bmcAccess() + if err != nil { + result, err = transientError(err) + return result, started, err + } + + ironicNode, err := p.getNode() + if err != nil { + result, err = transientError(err) + return result, started, err + } + + switch nodes.ProvisionState(ironicNode.ProvisionState) { + case nodes.ServiceFail: + // When servicing failed, we need to clean host provisioning settings. + // If restartOnFailure is false, it means the settings aren't cleared. + if !restartOnFailure { + result, err = operationFailed(ironicNode.LastError) + return result, started, err + } + + if ironicNode.Maintenance { + p.log.Info("clearing maintenance flag after a servicing failure") + result, err = p.setMaintenanceFlag(ironicNode, false, "") + return result, started, err + } + + p.log.Info("restarting servicing because of a previous failure") + unprepared = true + fallthrough + case nodes.Active: + if unprepared { + started, result, err = p.startServicing(bmcAccess, ironicNode, data) + if started || result.Dirty || result.ErrorMessage != "" || err != nil { + return result, started, err + } + // nothing to do + started = true + } + // Servicing finished + p.log.Info("servicing finished on the host") + result, err = operationComplete() + case nodes.Servicing, nodes.ServiceWait: + p.log.Info("waiting for host to become active", + "state", ironicNode.ProvisionState, + "serviceStep", ironicNode.ServiceStep) + result, err = operationContinuing(provisionRequeueDelay) + + default: + result, err = transientError(fmt.Errorf("have unexpected ironic node state %s", ironicNode.ProvisionState)) + } + return result, started, err +} diff --git a/pkg/provisioner/ironic/servicing_test.go b/pkg/provisioner/ironic/servicing_test.go new file mode 100644 index 0000000000..a878afce40 --- /dev/null +++ b/pkg/provisioner/ironic/servicing_test.go @@ -0,0 +1,196 @@ +package ironic + +import ( + "net/url" + "testing" + "time" + + "github.com/gophercloud/gophercloud/v2/openstack/baremetal/v1/nodes" + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/util/intstr" + + metal3api "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1" + "github.com/metal3-io/baremetal-operator/pkg/hardwareutils/bmc" + "github.com/metal3-io/baremetal-operator/pkg/provisioner" + "github.com/metal3-io/baremetal-operator/pkg/provisioner/ironic/clients" + "github.com/metal3-io/baremetal-operator/pkg/provisioner/ironic/testserver" +) + +type BIOSTestBMC struct{} + +func (r *BIOSTestBMC) Type() string { return "bios-test" } +func (r *BIOSTestBMC) NeedsMAC() bool { return false } +func (r *BIOSTestBMC) Driver() string { return "bios-test" } +func (r *BIOSTestBMC) DisableCertificateVerification() bool { return false } +func (r *BIOSTestBMC) DriverInfo(bmc.Credentials) (i map[string]interface{}) { return } +func (r *BIOSTestBMC) SupportsISOPreprovisioningImage() bool { return false } +func (r *BIOSTestBMC) BIOSInterface() string { return "" } +func (r *BIOSTestBMC) BootInterface() string { return "" } +func (r *BIOSTestBMC) FirmwareInterface() string { return "" } +func (r *BIOSTestBMC) ManagementInterface() string { return "" } +func (r *BIOSTestBMC) PowerInterface() string { return "" } +func (r *BIOSTestBMC) RAIDInterface() string { return "" } +func (r *BIOSTestBMC) VendorInterface() string { return "" } +func (r *BIOSTestBMC) SupportsSecureBoot() bool { return false } +func (r *BIOSTestBMC) RequiresProvisioningNetwork() bool { return true } +func (r *BIOSTestBMC) BuildBIOSSettings(_ *bmc.FirmwareConfig) ([]map[string]string, error) { + return nil, nil +} + +func TestService(t *testing.T) { + bmc.RegisterFactory("bios-test", func(u *url.URL, dcv bool) (bmc.AccessDetails, error) { + return &BIOSTestBMC{}, nil + }, []string{}) + + nodeUUID := "33ce8659-7400-4c68-9535-d10766f07a58" + cases := []struct { + name string + ironic *testserver.IronicMock + unprepared bool + skipConfig bool + expectedStarted bool + expectedDirty bool + expectedError bool + expectedRequestAfter int + }{ + { + name: "active, no new steps", + ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ + ProvisionState: string(nodes.Active), + UUID: nodeUUID, + }), + skipConfig: true, + unprepared: true, + expectedStarted: true, + expectedRequestAfter: 0, + expectedDirty: false, + }, + { + name: "active with steps", + ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ + ProvisionState: string(nodes.Active), + UUID: nodeUUID, + }), + unprepared: true, + expectedStarted: true, + expectedRequestAfter: 10, + expectedDirty: true, + }, + { + name: "serviceFail state(cleaned provision settings)", + ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ + ProvisionState: string(nodes.ServiceFail), + UUID: nodeUUID, + }), + expectedStarted: false, + expectedRequestAfter: 0, + expectedDirty: false, + }, + { + name: "serviceFail state(retry)", + ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ + ProvisionState: string(nodes.ServiceFail), + UUID: nodeUUID, + }), + unprepared: true, + expectedStarted: true, + expectedRequestAfter: 10, + expectedDirty: true, + }, + { + name: "serviceFail state(retry with maintenance)", + ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ + ProvisionState: string(nodes.ServiceFail), + UUID: nodeUUID, + Maintenance: true, + }).NodeMaintenance(nodes.Node{ + UUID: nodeUUID, + }, false), + unprepared: true, + expectedStarted: false, + expectedDirty: true, + }, + { + name: "servicing state", + ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ + ProvisionState: string(nodes.Servicing), + UUID: nodeUUID, + }), + expectedStarted: false, + expectedRequestAfter: 10, + expectedDirty: true, + }, + { + name: "serviceWait state", + ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ + ProvisionState: string(nodes.ServiceWait), + UUID: nodeUUID, + }), + expectedStarted: false, + expectedRequestAfter: 10, + expectedDirty: true, + }, + { + name: "active state(servicing finished)", + ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ + ProvisionState: string(nodes.Active), + UUID: nodeUUID, + }), + expectedStarted: false, + expectedRequestAfter: 0, + expectedDirty: false, + }, + { + name: "unexpected state", + ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ + ProvisionState: string(nodes.Cleaning), + UUID: nodeUUID, + }), + expectedStarted: false, + expectedRequestAfter: 0, + expectedDirty: false, + expectedError: true, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if tc.ironic != nil { + tc.ironic.Start() + defer tc.ironic.Stop() + } + + host := makeHost() + host.Status.Provisioning.ID = nodeUUID + prepData := provisioner.ServicingData{} + if !tc.skipConfig { + host.Spec.BMC.Address = "raid-test://test.bmc/" + prepData.ActualFirmwareSettings = metal3api.SettingsMap{ + "Answer": "unknown", + } + prepData.TargetFirmwareSettings = metal3api.DesiredSettingsMap{ + "Answer": intstr.FromInt(42), + } + } + + publisher := func(reason, message string) {} + auth := clients.AuthConfig{Type: clients.NoAuth} + prov, err := newProvisionerWithSettings(host, bmc.Credentials{}, publisher, tc.ironic.Endpoint(), auth) + if err != nil { + t.Fatalf("could not create provisioner: %s", err) + } + prov.availableFeatures = clients.AvailableFeatures{MaxVersion: 87} + + result, started, err := prov.Service(prepData, tc.unprepared, tc.unprepared) + + assert.Equal(t, tc.expectedStarted, started) + assert.Equal(t, tc.expectedDirty, result.Dirty) + assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter) + if !tc.expectedError { + assert.NoError(t, err) + } else { + assert.Error(t, err) + } + }) + } +} diff --git a/pkg/provisioner/ironic/testserver/ironic.go b/pkg/provisioner/ironic/testserver/ironic.go index a85351bda1..e56182b0a9 100644 --- a/pkg/provisioner/ironic/testserver/ironic.go +++ b/pkg/provisioner/ironic/testserver/ironic.go @@ -42,7 +42,7 @@ const versionedRootResult = ` "links": [ { "href": "/v1/", "rel": "self" } ], "status": "CURRENT", "min_version": "1.1", - "version": "1.86" + "version": "1.87" } }` diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index f9564ada9e..c18d350615 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -107,6 +107,13 @@ type PrepareData struct { TargetFirmwareComponents []metal3api.FirmwareUpdate } +type ServicingData struct { + FirmwareConfig *metal3api.FirmwareConfig + TargetFirmwareSettings metal3api.DesiredSettingsMap + ActualFirmwareSettings metal3api.SettingsMap + // TargetFirmwareComponents []metal3api.FirmwareUpdate +} + type ProvisionData struct { Image metal3api.Image HostConfig HostConfigData @@ -154,6 +161,9 @@ type Provisioner interface { // Prepare remove existing configuration and set new configuration Prepare(data PrepareData, unprepared bool, restartOnFailure bool) (result Result, started bool, err error) + // Servicing updates configuration for a provisioned host. + Service(data ServicingData, unprepared, restartOnFailure bool) (result Result, started bool, err error) + // Provision writes the image from the host spec to the host. It // may be called multiple times, and should return true for its // dirty flag until the provisioning operation is completed. From 74443110457738d8778d122692817ce5d6105550 Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Fri, 19 Apr 2024 18:21:14 +0200 Subject: [PATCH 2/5] Wire in servicing (updating provisioned hosts) Servicing only runs when a host is powered off (either completely or by rebooting it). Signed-off-by: Dmitry Tantsur --- .../metal3.io/v1alpha1/baremetalhost_types.go | 11 ++- .../crds/bases/metal3.io_baremetalhosts.yaml | 2 + config/render/capm3.yaml | 2 + .../metal3.io/baremetalhost_controller.go | 93 +++++++++++++++++-- .../baremetalhost_controller_test.go | 58 ++++++++++++ pkg/provisioner/fixture/fixture.go | 7 +- 6 files changed, 162 insertions(+), 11 deletions(-) diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types.go b/apis/metal3.io/v1alpha1/baremetalhost_types.go index 3eb5e83e12..eea7eb5feb 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types.go @@ -147,6 +147,10 @@ const ( // OperationalStatusDetached is the status value when the host is // marked unmanaged via the detached annotation. OperationalStatusDetached OperationalStatus = "detached" + + // OperationalStatusServicing is the status value when the host is + // undergoing servicing (e.g. checking firmware settings). + OperationalStatusServicing OperationalStatus = "servicing" ) // OperationalStatusAllowed represents the allowed values of OperationalStatus. @@ -179,6 +183,9 @@ const ( // DetachError is an error condition occurring when the // controller is unable to detatch the host from the provisioner. DetachError ErrorType = "detach error" + // ServicingError is an error condition occurring when + // service steps failed. + ServicingError ErrorType = "servicing error" ) // ErrorTypeAllowed represents the allowed values of ErrorType. @@ -800,12 +807,12 @@ type BareMetalHostStatus struct { // after modifying this file // OperationalStatus holds the status of the host - // +kubebuilder:validation:Enum="";OK;discovered;error;delayed;detached + // +kubebuilder:validation:Enum="";OK;discovered;error;delayed;detached;servicing OperationalStatus OperationalStatus `json:"operationalStatus"` // ErrorType indicates the type of failure encountered when the // OperationalStatus is OperationalStatusError - // +kubebuilder:validation:Enum=provisioned registration error;registration error;inspection error;preparation error;provisioning error;power management error + // +kubebuilder:validation:Enum=provisioned registration error;registration error;inspection error;preparation error;provisioning error;power management error;servicing error ErrorType ErrorType `json:"errorType,omitempty"` // LastUpdated identifies when this status was last observed. diff --git a/config/base/crds/bases/metal3.io_baremetalhosts.yaml b/config/base/crds/bases/metal3.io_baremetalhosts.yaml index 61f0f1581a..f8ffcfda09 100644 --- a/config/base/crds/bases/metal3.io_baremetalhosts.yaml +++ b/config/base/crds/bases/metal3.io_baremetalhosts.yaml @@ -571,6 +571,7 @@ spec: - preparation error - provisioning error - power management error + - servicing error type: string goodCredentials: description: The last credentials we were able to validate as working. @@ -832,6 +833,7 @@ spec: - error - delayed - detached + - servicing type: string poweredOn: description: Whether or not the host is currently powered on. This diff --git a/config/render/capm3.yaml b/config/render/capm3.yaml index 2745bb09df..3cf66dd69d 100644 --- a/config/render/capm3.yaml +++ b/config/render/capm3.yaml @@ -571,6 +571,7 @@ spec: - preparation error - provisioning error - power management error + - servicing error type: string goodCredentials: description: The last credentials we were able to validate as working. @@ -832,6 +833,7 @@ spec: - error - delayed - detached + - servicing type: string poweredOn: description: Whether or not the host is currently powered on. This diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index c043a9d894..d8e48acf65 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -321,6 +321,8 @@ func recordActionFailure(info *reconcileInfo, errorType metal3api.ErrorType, err metal3api.InspectionError: "InspectionError", metal3api.ProvisioningError: "ProvisioningError", metal3api.PowerManagementError: "PowerManagementError", + metal3api.PreparationError: "PreparationError", + metal3api.ServicingError: "ServicingError", }[errorType] counter := actionFailureCounters.WithLabelValues(eventType) @@ -460,9 +462,9 @@ func hasInspectAnnotation(host *metal3api.BareMetalHost) bool { return false } -// clearError removes any existing error message. -func clearError(host *metal3api.BareMetalHost) (dirty bool) { - dirty = host.SetOperationalStatus(metal3api.OperationalStatusOK) +// clearErrorWithStatus removes any existing error message and sets operational status. +func clearErrorWithStatus(host *metal3api.BareMetalHost, status metal3api.OperationalStatus) (dirty bool) { + dirty = host.SetOperationalStatus(status) var emptyErrType metal3api.ErrorType if host.Status.ErrorType != emptyErrType { host.Status.ErrorType = emptyErrType @@ -475,6 +477,11 @@ func clearError(host *metal3api.BareMetalHost) (dirty bool) { return dirty } +// clearError removes any existing error message. +func clearError(host *metal3api.BareMetalHost) (dirty bool) { + return clearErrorWithStatus(host, metal3api.OperationalStatusOK) +} + // setErrorMessage updates the ErrorMessage in the host Status struct // and increases the ErrorCount. func setErrorMessage(host *metal3api.BareMetalHost, errType metal3api.ErrorType, message string) { @@ -1355,6 +1362,66 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio return actionComplete{} } +func (r *BareMetalHostReconciler) checkServicing(prov provisioner.Provisioner, info *reconcileInfo) (result actionResult, isServicing bool) { + servicingData := provisioner.ServicingData{} + + var fwDirty bool + if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) { + servicingData.FirmwareConfig = info.host.Spec.Firmware + fwDirty = true + } + + hfsDirty, hfs, err := r.getHostFirmwareSettings(info) + if err != nil { + return actionError{fmt.Errorf("could not determine updated settings: %w", err)}, false + } + if hfsDirty { + servicingData.ActualFirmwareSettings = hfs.Status.Settings + servicingData.TargetFirmwareSettings = hfs.Spec.Settings + } + + dirty := fwDirty || hfsDirty + + // Even if settings are clean, we need to check the result of the current servicing. + if !dirty && info.host.Status.OperationalStatus != metal3api.OperationalStatusServicing && info.host.Status.ErrorType != metal3api.ServicingError { + // If nothing is going on, return control to the power management. + return nil, false + } + + provResult, started, err := prov.Service(servicingData, dirty, + info.host.Status.ErrorType == metal3api.ServicingError) + if err != nil { + return actionError{fmt.Errorf("error servicing host: %w", err)}, false + } + if provResult.ErrorMessage != "" { + result = recordActionFailure(info, metal3api.ServicingError, provResult.ErrorMessage) + return result, true + } + if started && clearErrorWithStatus(info.host, metal3api.OperationalStatusServicing) { + if fwDirty { + info.host.Status.Provisioning.Firmware = info.host.Spec.Firmware.DeepCopy() + } + dirty = true + } + + if provResult.Dirty { + result := actionContinue{provResult.RequeueAfter} + if dirty { + return actionUpdate{result}, true + } + return result, true + } + + // Servicing is finished at this point, clean up operational status + if clearErrorWithStatus(info.host, metal3api.OperationalStatusOK) { + // We need to give the HostFirmwareSettings controller some time to + // catch up with the changes, otherwise we risk starting the same + // operation again. + return actionUpdate{actionContinue{delay: subResourceNotReadyRetryDelay}}, true + } + return nil, false +} + // Check the current power status against the desired power status. func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, info *reconcileInfo) actionResult { var provResult provisioner.Result @@ -1368,12 +1435,20 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, if hwState.PoweredOn != nil && *hwState.PoweredOn != info.host.Status.PoweredOn { info.log.Info("updating power status", "discovered", *hwState.PoweredOn) info.host.Status.PoweredOn = *hwState.PoweredOn - clearError(info.host) + targetOperationalStatus := metal3api.OperationalStatusOK + if info.host.Status.OperationalStatus == metal3api.OperationalStatusServicing { + targetOperationalStatus = metal3api.OperationalStatusServicing + } + clearErrorWithStatus(info.host, targetOperationalStatus) return actionUpdate{} } desiredPowerOnState := info.host.Spec.Online + provState := info.host.Status.Provisioning.State + // Normal reboots only work in provisioned states, changing online is also possible for available hosts. + isProvisioned := provState == metal3api.StateProvisioned || provState == metal3api.StateExternallyProvisioned + if !info.host.Status.PoweredOn { if _, suffixlessAnnotationExists := info.host.Annotations[metal3api.RebootAnnotationPrefix]; suffixlessAnnotationExists { delete(info.host.Annotations, metal3api.RebootAnnotationPrefix) @@ -1386,9 +1461,13 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, } } - provState := info.host.Status.Provisioning.State - // Normal reboots only work in provisioned states, changing online is also possible for available hosts. - isProvisioned := provState == metal3api.StateProvisioned || provState == metal3api.StateExternallyProvisioned + servicingAllowed := isProvisioned && !info.host.Status.PoweredOn && desiredPowerOnState + if servicingAllowed || info.host.Status.OperationalStatus == metal3api.OperationalStatusServicing || info.host.Status.ErrorType == metal3api.ServicingError { + result, isServicing := r.checkServicing(prov, info) + if result != nil && (result.Dirty() || isServicing) { + return result + } + } desiredReboot, desiredRebootMode := hasRebootAnnotation(info, !isProvisioned) diff --git a/controllers/metal3.io/baremetalhost_controller_test.go b/controllers/metal3.io/baremetalhost_controller_test.go index 25ac0df14e..cd7fcf59a3 100644 --- a/controllers/metal3.io/baremetalhost_controller_test.go +++ b/controllers/metal3.io/baremetalhost_controller_test.go @@ -22,6 +22,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -796,6 +797,63 @@ func TestRebootWithSuffixedAnnotation(t *testing.T) { ) } +// TestRebootWithServicing tests full reboot cycle with suffixless +// annotation and servicing. +func TestRebootWithServicing(t *testing.T) { + host := newDefaultHost(t) + host.Annotations = make(map[string]string) + host.Annotations[metal3api.RebootAnnotationPrefix] = "" + host.Status.PoweredOn = true + host.Status.Provisioning.State = metal3api.StateProvisioned + host.Spec.Online = true + host.Spec.Image = &metal3api.Image{URL: "foo", Checksum: "123"} + host.Spec.Image.URL = "foo" + host.Spec.Firmware = &metal3api.FirmwareConfig{ + VirtualizationEnabled: ptr.To(true), + } + host.Status.Provisioning.Image.URL = "foo" + + r := newTestReconciler(host) + + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + return host.Status.OperationalStatus == metal3api.OperationalStatusOK && !host.Status.PoweredOn + }, + ) + + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + _, exists := host.Annotations[metal3api.RebootAnnotationPrefix] + return host.Status.OperationalStatus == metal3api.OperationalStatusOK && !exists + }, + ) + + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + return host.Status.OperationalStatus == metal3api.OperationalStatusServicing && !host.Status.PoweredOn + }, + ) + + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + return host.Status.OperationalStatus == metal3api.OperationalStatusOK && !host.Status.PoweredOn + }, + ) + + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + return host.Status.PoweredOn + }, + ) + + // make sure we don't go into another reboot + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + return host.Status.PoweredOn + }, + ) +} + func getHostSecret(t *testing.T, r *BareMetalHostReconciler, host *metal3api.BareMetalHost) (secret *corev1.Secret) { t.Helper() secret = &corev1.Secret{} diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index 797938b955..3cebc4c23a 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -205,15 +205,18 @@ func (p *fixtureProvisioner) UpdateHardwareState() (hwState provisioner.Hardware // Prepare remove existing configuration and set new configuration. func (p *fixtureProvisioner) Prepare(_ provisioner.PrepareData, unprepared bool, _ bool) (result provisioner.Result, started bool, err error) { - p.log.Info("preparing host") + p.log.Info("preparing host", "unprepared", unprepared) started = unprepared return } // Service remove existing configuration and set new configuration. func (p *fixtureProvisioner) Service(_ provisioner.ServicingData, unprepared bool, _ bool) (result provisioner.Result, started bool, err error) { - p.log.Info("servicing host") + p.log.Info("servicing host", "unprepared", unprepared) started = unprepared + if started { + result.Dirty = true + } return } From 2b52594e4da7a160961f5def0dff9d7f0cab9b36 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Fri, 20 Sep 2024 20:49:27 +1000 Subject: [PATCH 3/5] Add references to HostUpdatePolicy in checkServicing code. Signed-off-by: Jacob Anders Removed unused ServicingData fields. --- .../metal3.io/baremetalhost_controller.go | 39 +++++++++--- .../baremetalhost_controller_test.go | 61 ++++++++++++++++++- 2 files changed, 90 insertions(+), 10 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index d8e48acf65..f977f6edb6 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -1365,19 +1365,40 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio func (r *BareMetalHostReconciler) checkServicing(prov provisioner.Provisioner, info *reconcileInfo) (result actionResult, isServicing bool) { servicingData := provisioner.ServicingData{} + // (NOTE)janders: since Servicing is an opt-in feature that requires HostUpdatePolicy to be created and set to onReboot + // set below booleans to false by default and change to true based on policy settings + var fwDirty bool - if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) { - servicingData.FirmwareConfig = info.host.Spec.Firmware - fwDirty = true - } + var hfsDirty bool + var liveFirmwareSettingsAllowed bool - hfsDirty, hfs, err := r.getHostFirmwareSettings(info) + hup := &metal3api.HostUpdatePolicy{} + err := r.Get(info.ctx, info.request.NamespacedName, hup) if err != nil { - return actionError{fmt.Errorf("could not determine updated settings: %w", err)}, false + return actionError{fmt.Errorf("unable to fetch HostUpdatePolicy, aborting servicing. Error: %w", err)}, false } - if hfsDirty { - servicingData.ActualFirmwareSettings = hfs.Status.Settings - servicingData.TargetFirmwareSettings = hfs.Spec.Settings + + if hup != nil { + liveFirmwareSettingsAllowed = (hup.Spec.FirmwareSettings == metal3api.HostUpdatePolicyOnReboot) + } + + if liveFirmwareSettingsAllowed { + // handling pre-HFS FirmwareSettings here + if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) { + servicingData.FirmwareConfig = info.host.Spec.Firmware + fwDirty = true + } + // handling HFS based FirmwareSettings here + var hfs *metal3api.HostFirmwareSettings + var err error + hfsDirty, hfs, err = r.getHostFirmwareSettings(info) + if err != nil { + return actionError{fmt.Errorf("could not determine updated settings: %w", err)}, false + } + if hfsDirty { + servicingData.ActualFirmwareSettings = hfs.Status.Settings + servicingData.TargetFirmwareSettings = hfs.Spec.Settings + } } dirty := fwDirty || hfsDirty diff --git a/controllers/metal3.io/baremetalhost_controller_test.go b/controllers/metal3.io/baremetalhost_controller_test.go index cd7fcf59a3..4d683afc98 100644 --- a/controllers/metal3.io/baremetalhost_controller_test.go +++ b/controllers/metal3.io/baremetalhost_controller_test.go @@ -813,7 +813,19 @@ func TestRebootWithServicing(t *testing.T) { } host.Status.Provisioning.Image.URL = "foo" - r := newTestReconciler(host) + // HUP creation + hup := &metal3api.HostUpdatePolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: host.Name, + Namespace: namespace, + }, + Spec: metal3api.HostUpdatePolicySpec{ + FirmwareSettings: metal3api.HostUpdatePolicyOnReboot, + FirmwareUpdates: metal3api.HostUpdatePolicyOnReboot, + }, + } + + r := newTestReconciler(host, hup) tryReconcile(t, r, host, func(host *metal3api.BareMetalHost, result reconcile.Result) bool { @@ -854,6 +866,53 @@ func TestRebootWithServicing(t *testing.T) { ) } +// TestRebootWithoutServicing ensures Servicing is not triggered if HostUpdatePolicy doesn't exist. +func TestRebootWithoutServicing(t *testing.T) { + host := newDefaultHost(t) + host.Annotations = make(map[string]string) + host.Annotations[metal3api.RebootAnnotationPrefix] = "" + host.Status.PoweredOn = true + host.Status.Provisioning.State = metal3api.StateProvisioned + host.Spec.Online = true + host.Spec.Image = &metal3api.Image{URL: "foo", Checksum: "123"} + host.Spec.Image.URL = "foo" + host.Status.Provisioning.Image.URL = "foo" + host.Spec.Firmware = &metal3api.FirmwareConfig{ + VirtualizationEnabled: ptr.To(true), + } + + r := newTestReconciler(host) + + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + return !host.Status.PoweredOn + }, + ) + + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + if _, exists := host.Annotations[metal3api.RebootAnnotationPrefix]; exists { + return false + } + + return true + }, + ) + + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + return host.Status.PoweredOn + }, + ) + + // make sure we don't go into another reboot + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + return host.Status.PoweredOn + }, + ) +} + func getHostSecret(t *testing.T, r *BareMetalHostReconciler, host *metal3api.BareMetalHost) (secret *corev1.Secret) { t.Helper() secret = &corev1.Secret{} From 5dfc5366f84a3a6206d50520b66f13b6b08ac0dd Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Tue, 24 Sep 2024 22:31:20 +1000 Subject: [PATCH 4/5] Move hostUpdatePolicySetOwnerReference calls from registerHost to manageHostPower. Signed-off-by: Jacob Anders --- .../metal3.io/baremetalhost_controller.go | 42 ++++++++----------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index f977f6edb6..9ef8b18496 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -889,7 +889,6 @@ func (r *BareMetalHostReconciler) registerHost(prov provisioner.Provisioner, inf // Create the hostFirmwareSettings resource with same host name/namespace if it doesn't exist // Create the hostFirmwareComponents resource with same host name/namespace if it doesn't exist - // Set owner reference on hostUpdatePolicy resource if not set if info.host.Name != "" { if !info.host.DeletionTimestamp.IsZero() { info.log.Info(fmt.Sprintf("will not attempt to create new hostFirmwareSettings and hostFirmwareComponents in %s", info.host.Namespace)) @@ -902,10 +901,6 @@ func (r *BareMetalHostReconciler) registerHost(prov provisioner.Provisioner, inf info.log.Info("failed creating hostfirmwarecomponents") return actionError{errors.Wrap(err, "failed creating hostFirmwareComponents")} } - if err = r.hostUpdatePolicySetOwnerReference(info); err != nil { - info.log.Info("failed setting owner reference on hostupdatepolicy") - return actionError{errors.Wrap(err, "failed setting owner reference on hostUpdatePolicy")} - } } } // Reaching this point means the credentials are valid and worked, @@ -1362,7 +1357,7 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio return actionComplete{} } -func (r *BareMetalHostReconciler) checkServicing(prov provisioner.Provisioner, info *reconcileInfo) (result actionResult, isServicing bool) { +func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner, info *reconcileInfo, hup *metal3api.HostUpdatePolicy) (result actionResult, isServicing bool) { servicingData := provisioner.ServicingData{} // (NOTE)janders: since Servicing is an opt-in feature that requires HostUpdatePolicy to be created and set to onReboot @@ -1372,12 +1367,6 @@ func (r *BareMetalHostReconciler) checkServicing(prov provisioner.Provisioner, i var hfsDirty bool var liveFirmwareSettingsAllowed bool - hup := &metal3api.HostUpdatePolicy{} - err := r.Get(info.ctx, info.request.NamespacedName, hup) - if err != nil { - return actionError{fmt.Errorf("unable to fetch HostUpdatePolicy, aborting servicing. Error: %w", err)}, false - } - if hup != nil { liveFirmwareSettingsAllowed = (hup.Spec.FirmwareSettings == metal3api.HostUpdatePolicyOnReboot) } @@ -1456,11 +1445,9 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, if hwState.PoweredOn != nil && *hwState.PoweredOn != info.host.Status.PoweredOn { info.log.Info("updating power status", "discovered", *hwState.PoweredOn) info.host.Status.PoweredOn = *hwState.PoweredOn - targetOperationalStatus := metal3api.OperationalStatusOK - if info.host.Status.OperationalStatus == metal3api.OperationalStatusServicing { - targetOperationalStatus = metal3api.OperationalStatusServicing + if info.host.Status.OperationalStatus == metal3api.OperationalStatusError || info.host.Status.ErrorType == metal3api.PowerManagementError { + clearError(info.host) } - clearErrorWithStatus(info.host, targetOperationalStatus) return actionUpdate{} } @@ -1481,11 +1468,16 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, return actionContinue{} } } + hup, err := r.hostUpdatePolicySetOwnerReference(info) + if err != nil { + info.log.Info("failed setting owner reference on hostupdatepolicy") + return actionError{errors.Wrap(err, "failed setting owner reference on hostUpdatePolicy")} + } servicingAllowed := isProvisioned && !info.host.Status.PoweredOn && desiredPowerOnState if servicingAllowed || info.host.Status.OperationalStatus == metal3api.OperationalStatusServicing || info.host.Status.ErrorType == metal3api.ServicingError { - result, isServicing := r.checkServicing(prov, info) - if result != nil && (result.Dirty() || isServicing) { + result, _ := r.doServiceIfNeeded(prov, info, hup) + if result != nil { return result } } @@ -1962,7 +1954,7 @@ func (r *BareMetalHostReconciler) createHostFirmwareSettings(info *reconcileInfo return nil } -func (r *BareMetalHostReconciler) hostUpdatePolicySetOwnerReference(info *reconcileInfo) error { +func (r *BareMetalHostReconciler) hostUpdatePolicySetOwnerReference(info *reconcileInfo) (policy *metal3api.HostUpdatePolicy, err error) { // NOTE(janders) the goal here is to ensure that the controller reads the hup resource and adds OwnerReference to it hup := &metal3api.HostUpdatePolicy{} if err := r.Get(info.ctx, info.request.NamespacedName, hup); err != nil { @@ -1972,23 +1964,23 @@ func (r *BareMetalHostReconciler) hostUpdatePolicySetOwnerReference(info *reconc // garbage collected. For additional cleanup logic use // finalizers. Return and don't requeue - return nil + return nil, nil } // Error reading the object - return fmt.Errorf("could not load hostUpdatePolicy resource due to %w", err) + return nil, fmt.Errorf("could not load hostUpdatePolicy resource due to %w", err) } if !ownerReferenceExists(info.host, hup) { if err := controllerutil.SetOwnerReference(info.host, hup, r.Scheme()); err != nil { - return fmt.Errorf("could not set bmh as owner for hostUpdatePolicy due to %w", err) + return hup, fmt.Errorf("could not set bmh as owner for hostUpdatePolicy due to %w", err) } if err := r.Update(info.ctx, hup); err != nil { - return fmt.Errorf("failure updating hostUpdatePolicy resource due to %w", err) + return hup, fmt.Errorf("failure updating hostUpdatePolicy resource due to %w", err) } - return nil + return hup, nil } - return nil + return hup, nil } // Get the stored firmware settings if there are valid changes. From 324e111cce79a5691b386031ffb801d7b76b77e8 Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Tue, 22 Oct 2024 22:59:57 +1000 Subject: [PATCH 5/5] Removed second return value from doServiceIfNeeded function. Signed-off-by: Jacob Anders --- .../metal3.io/baremetalhost_controller.go | 49 +++++++++++-------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 9ef8b18496..8cdd851fd7 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -1357,7 +1357,7 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio return actionComplete{} } -func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner, info *reconcileInfo, hup *metal3api.HostUpdatePolicy) (result actionResult, isServicing bool) { +func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner, info *reconcileInfo, hup *metal3api.HostUpdatePolicy) (result actionResult) { servicingData := provisioner.ServicingData{} // (NOTE)janders: since Servicing is an opt-in feature that requires HostUpdatePolicy to be created and set to onReboot @@ -1382,7 +1382,7 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner var err error hfsDirty, hfs, err = r.getHostFirmwareSettings(info) if err != nil { - return actionError{fmt.Errorf("could not determine updated settings: %w", err)}, false + return actionError{fmt.Errorf("could not determine updated settings: %w", err)} } if hfsDirty { servicingData.ActualFirmwareSettings = hfs.Status.Settings @@ -1395,41 +1395,46 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner // Even if settings are clean, we need to check the result of the current servicing. if !dirty && info.host.Status.OperationalStatus != metal3api.OperationalStatusServicing && info.host.Status.ErrorType != metal3api.ServicingError { // If nothing is going on, return control to the power management. - return nil, false + return nil } - provResult, started, err := prov.Service(servicingData, dirty, - info.host.Status.ErrorType == metal3api.ServicingError) + // FIXME(janders/dtantsur): this implementation may lead to a scenario where if we never actually + // succeed before leaving this state (e.g. by deprovisioning) we lose the signal that the + // update didn't actually happen. This is deemed an acceptable risk for the moment since it is only + // going to impact a small subset of Firmware Settings implementations. + currentError := info.host.Status.ErrorType + if clearErrorWithStatus(info.host, metal3api.OperationalStatusServicing) { + dirty = true + } + provResult, _, err := prov.Service(servicingData, dirty, + currentError == metal3api.ServicingError) if err != nil { - return actionError{fmt.Errorf("error servicing host: %w", err)}, false + return actionError{fmt.Errorf("error servicing host: %w", err)} } if provResult.ErrorMessage != "" { result = recordActionFailure(info, metal3api.ServicingError, provResult.ErrorMessage) - return result, true + return result } - if started && clearErrorWithStatus(info.host, metal3api.OperationalStatusServicing) { - if fwDirty { - info.host.Status.Provisioning.Firmware = info.host.Spec.Firmware.DeepCopy() - } - dirty = true + + if fwDirty { + info.host.Status.Provisioning.Firmware = info.host.Spec.Firmware.DeepCopy() } if provResult.Dirty { result := actionContinue{provResult.RequeueAfter} if dirty { - return actionUpdate{result}, true + return actionUpdate{result} } - return result, true + return result } // Servicing is finished at this point, clean up operational status if clearErrorWithStatus(info.host, metal3api.OperationalStatusOK) { - // We need to give the HostFirmwareSettings controller some time to - // catch up with the changes, otherwise we risk starting the same - // operation again. - return actionUpdate{actionContinue{delay: subResourceNotReadyRetryDelay}}, true + // FIXME(janders/dtantsur): this can be racy. We should consider + // using a generation number to decide if we start servicing or not. + return actionUpdate{actionContinue{delay: subResourceNotReadyRetryDelay}} } - return nil, false + return nil } // Check the current power status against the desired power status. @@ -1445,7 +1450,7 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, if hwState.PoweredOn != nil && *hwState.PoweredOn != info.host.Status.PoweredOn { info.log.Info("updating power status", "discovered", *hwState.PoweredOn) info.host.Status.PoweredOn = *hwState.PoweredOn - if info.host.Status.OperationalStatus == metal3api.OperationalStatusError || info.host.Status.ErrorType == metal3api.PowerManagementError { + if info.host.Status.OperationalStatus == metal3api.OperationalStatusError && info.host.Status.ErrorType == metal3api.PowerManagementError { clearError(info.host) } return actionUpdate{} @@ -1456,6 +1461,8 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, provState := info.host.Status.Provisioning.State // Normal reboots only work in provisioned states, changing online is also possible for available hosts. isProvisioned := provState == metal3api.StateProvisioned || provState == metal3api.StateExternallyProvisioned + // FIXME(janders/dtantsur) it would be preferrable to pass in state as an argument + // however this falls outside the scope of this specific change. if !info.host.Status.PoweredOn { if _, suffixlessAnnotationExists := info.host.Annotations[metal3api.RebootAnnotationPrefix]; suffixlessAnnotationExists { @@ -1476,7 +1483,7 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, servicingAllowed := isProvisioned && !info.host.Status.PoweredOn && desiredPowerOnState if servicingAllowed || info.host.Status.OperationalStatus == metal3api.OperationalStatusServicing || info.host.Status.ErrorType == metal3api.ServicingError { - result, _ := r.doServiceIfNeeded(prov, info, hup) + result := r.doServiceIfNeeded(prov, info, hup) if result != nil { return result }