diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types.go b/apis/metal3.io/v1alpha1/baremetalhost_types.go index 32dfedb6f0..1b0eb5f6fd 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types.go @@ -147,6 +147,10 @@ const ( // OperationalStatusDetached is the status value when the host is // marked unmanaged via the detached annotation. OperationalStatusDetached OperationalStatus = "detached" + + // OperationalStatusServicing is the status value when the host is + // undergoing servicing (e.g. checking firmware settings). + OperationalStatusServicing OperationalStatus = "servicing" ) // OperationalStatusAllowed represents the allowed values of OperationalStatus. @@ -179,6 +183,9 @@ const ( // DetachError is an error condition occurring when the // controller is unable to detatch the host from the provisioner. DetachError ErrorType = "detach error" + // ServicingError is an error condition occurring when + // service steps failed. + ServicingError ErrorType = "servicing error" ) // ErrorTypeAllowed represents the allowed values of ErrorType. @@ -767,12 +774,12 @@ type BareMetalHostStatus struct { // after modifying this file // OperationalStatus holds the status of the host - // +kubebuilder:validation:Enum="";OK;discovered;error;delayed;detached + // +kubebuilder:validation:Enum="";OK;discovered;error;delayed;detached;servicing OperationalStatus OperationalStatus `json:"operationalStatus"` // ErrorType indicates the type of failure encountered when the // OperationalStatus is OperationalStatusError - // +kubebuilder:validation:Enum=provisioned registration error;registration error;inspection error;preparation error;provisioning error;power management error + // +kubebuilder:validation:Enum=provisioned registration error;registration error;inspection error;preparation error;provisioning error;power management error;servicing error ErrorType ErrorType `json:"errorType,omitempty"` // LastUpdated identifies when this status was last observed. diff --git a/config/base/crds/bases/metal3.io_baremetalhosts.yaml b/config/base/crds/bases/metal3.io_baremetalhosts.yaml index 007f1e7a8f..a9eef1d30b 100644 --- a/config/base/crds/bases/metal3.io_baremetalhosts.yaml +++ b/config/base/crds/bases/metal3.io_baremetalhosts.yaml @@ -553,6 +553,7 @@ spec: - preparation error - provisioning error - power management error + - servicing error type: string goodCredentials: description: the last credentials we were able to validate as working @@ -808,6 +809,7 @@ spec: - error - delayed - detached + - servicing type: string poweredOn: description: indicator for whether or not the host is powered on diff --git a/config/render/capm3.yaml b/config/render/capm3.yaml index 36df29c780..48a3afe594 100644 --- a/config/render/capm3.yaml +++ b/config/render/capm3.yaml @@ -553,6 +553,7 @@ spec: - preparation error - provisioning error - power management error + - servicing error type: string goodCredentials: description: the last credentials we were able to validate as working @@ -808,6 +809,7 @@ spec: - error - delayed - detached + - servicing type: string poweredOn: description: indicator for whether or not the host is powered on diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 47ac08cfaf..c5f4fd1c6e 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -318,6 +318,8 @@ func recordActionFailure(info *reconcileInfo, errorType metal3api.ErrorType, err metal3api.InspectionError: "InspectionError", metal3api.ProvisioningError: "ProvisioningError", metal3api.PowerManagementError: "PowerManagementError", + metal3api.PreparationError: "PreparationError", + metal3api.ServicingError: "ServicingError", }[errorType] counter := actionFailureCounters.WithLabelValues(eventType) @@ -457,9 +459,9 @@ func hasInspectAnnotation(host *metal3api.BareMetalHost) bool { return false } -// clearError removes any existing error message. -func clearError(host *metal3api.BareMetalHost) (dirty bool) { - dirty = host.SetOperationalStatus(metal3api.OperationalStatusOK) +// clearErrorWithStatus removes any existing error message and sets operational status. +func clearErrorWithStatus(host *metal3api.BareMetalHost, status metal3api.OperationalStatus) (dirty bool) { + dirty = host.SetOperationalStatus(status) var emptyErrType metal3api.ErrorType if host.Status.ErrorType != emptyErrType { host.Status.ErrorType = emptyErrType @@ -472,6 +474,11 @@ func clearError(host *metal3api.BareMetalHost) (dirty bool) { return dirty } +// clearError removes any existing error message. +func clearError(host *metal3api.BareMetalHost) (dirty bool) { + return clearErrorWithStatus(host, metal3api.OperationalStatusOK) +} + // setErrorMessage updates the ErrorMessage in the host Status struct // and increases the ErrorCount. func setErrorMessage(host *metal3api.BareMetalHost, errType metal3api.ErrorType, message string) { @@ -1330,6 +1337,66 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio return actionComplete{} } +func (r *BareMetalHostReconciler) checkServicing(prov provisioner.Provisioner, info *reconcileInfo) (result actionResult, isServicing bool) { + servicingData := provisioner.ServicingData{} + + var fwDirty bool + if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) { + servicingData.FirmwareConfig = info.host.Spec.Firmware + fwDirty = true + } + + hfsDirty, hfs, err := r.getHostFirmwareSettings(info) + if err != nil { + return actionError{fmt.Errorf("could not determine updated settings: %w", err)}, false + } + if hfsDirty { + servicingData.ActualFirmwareSettings = hfs.Status.Settings + servicingData.TargetFirmwareSettings = hfs.Spec.Settings + } + + dirty := fwDirty || hfsDirty + + // Even if settings are clean, we need to check the result of the current servicing. + if !dirty && info.host.Status.OperationalStatus != metal3api.OperationalStatusServicing && info.host.Status.OperationalStatus != metal3api.OperationalStatusError { + // If nothing is going on, return control to the power management. + return nil, false + } + + provResult, started, err := prov.Service(servicingData, dirty, + info.host.Status.ErrorType == metal3api.ServicingError) + if err != nil { + return actionError{fmt.Errorf("error servicing host: %w", err)}, false + } + if provResult.ErrorMessage != "" { + result = recordActionFailure(info, metal3api.ServicingError, provResult.ErrorMessage) + return result, true + } + if started && clearErrorWithStatus(info.host, metal3api.OperationalStatusServicing) { + if fwDirty { + info.host.Status.Provisioning.Firmware = info.host.Spec.Firmware.DeepCopy() + } + dirty = true + } + + if provResult.Dirty { + result := actionContinue{provResult.RequeueAfter} + if dirty { + return actionUpdate{result}, true + } + return result, true + } + + // Servicing is finished at this point, clean up operational status + if clearErrorWithStatus(info.host, metal3api.OperationalStatusOK) { + // We need to give the HostFirmwareSettings controller some time to + // catch up with the changes, otherwise we risk starting the same + // operation again. + return actionUpdate{actionContinue{delay: subResourceNotReadyRetryDelay}}, true + } + return nil, false +} + // Check the current power status against the desired power status. func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, info *reconcileInfo) actionResult { var provResult provisioner.Result @@ -1343,12 +1410,20 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, if hwState.PoweredOn != nil && *hwState.PoweredOn != info.host.Status.PoweredOn { info.log.Info("updating power status", "discovered", *hwState.PoweredOn) info.host.Status.PoweredOn = *hwState.PoweredOn - clearError(info.host) + targetOperationalStatus := metal3api.OperationalStatusOK + if info.host.Status.OperationalStatus == metal3api.OperationalStatusServicing { + targetOperationalStatus = metal3api.OperationalStatusServicing + } + clearErrorWithStatus(info.host, targetOperationalStatus) return actionUpdate{} } desiredPowerOnState := info.host.Spec.Online + provState := info.host.Status.Provisioning.State + // Normal reboots only work in provisioned states, changing online is also possible for available hosts. + isProvisioned := provState == metal3api.StateProvisioned || provState == metal3api.StateExternallyProvisioned + if !info.host.Status.PoweredOn { if _, suffixlessAnnotationExists := info.host.Annotations[metal3api.RebootAnnotationPrefix]; suffixlessAnnotationExists { delete(info.host.Annotations, metal3api.RebootAnnotationPrefix) @@ -1359,11 +1434,15 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, return actionContinue{} } - } - provState := info.host.Status.Provisioning.State - // Normal reboots only work in provisioned states, changing online is also possible for available hosts. - isProvisioned := provState == metal3api.StateProvisioned || provState == metal3api.StateExternallyProvisioned + // Servicing only happens when the host is provisioned and is being powered on + if isProvisioned && desiredPowerOnState { + result, isServicing := r.checkServicing(prov, info) + if result != nil && (result.Dirty() || isServicing) { + return result + } + } + } desiredReboot, desiredRebootMode := hasRebootAnnotation(info, !isProvisioned) diff --git a/controllers/metal3.io/baremetalhost_controller_test.go b/controllers/metal3.io/baremetalhost_controller_test.go index cba36eb4ea..f9d67aeec0 100644 --- a/controllers/metal3.io/baremetalhost_controller_test.go +++ b/controllers/metal3.io/baremetalhost_controller_test.go @@ -21,6 +21,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -791,6 +792,63 @@ func TestRebootWithSuffixedAnnotation(t *testing.T) { ) } +// TestRebootWithServicing tests full reboot cycle with suffixless +// annotation and servicing. +func TestRebootWithServicing(t *testing.T) { + host := newDefaultHost(t) + host.Annotations = make(map[string]string) + host.Annotations[metal3api.RebootAnnotationPrefix] = "" + host.Status.PoweredOn = true + host.Status.Provisioning.State = metal3api.StateProvisioned + host.Spec.Online = true + host.Spec.Image = &metal3api.Image{URL: "foo", Checksum: "123"} + host.Spec.Image.URL = "foo" + host.Spec.Firmware = &metal3api.FirmwareConfig{ + VirtualizationEnabled: ptr.To(true), + } + host.Status.Provisioning.Image.URL = "foo" + + r := newTestReconciler(host) + + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + return host.Status.OperationalStatus == metal3api.OperationalStatusOK && !host.Status.PoweredOn + }, + ) + + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + _, exists := host.Annotations[metal3api.RebootAnnotationPrefix] + return host.Status.OperationalStatus == metal3api.OperationalStatusOK && !exists + }, + ) + + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + return host.Status.OperationalStatus == metal3api.OperationalStatusServicing && !host.Status.PoweredOn + }, + ) + + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + return host.Status.OperationalStatus == metal3api.OperationalStatusOK && !host.Status.PoweredOn + }, + ) + + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + return host.Status.PoweredOn + }, + ) + + // make sure we don't go into another reboot + tryReconcile(t, r, host, + func(host *metal3api.BareMetalHost, result reconcile.Result) bool { + return host.Status.PoweredOn + }, + ) +} + func getHostSecret(t *testing.T, r *BareMetalHostReconciler, host *metal3api.BareMetalHost) (secret *corev1.Secret) { t.Helper() secret = &corev1.Secret{} diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index 438047a300..1de7908902 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -199,15 +199,18 @@ func (p *fixtureProvisioner) UpdateHardwareState() (hwState provisioner.Hardware // Prepare remove existing configuration and set new configuration. func (p *fixtureProvisioner) Prepare(_ provisioner.PrepareData, unprepared bool, _ bool) (result provisioner.Result, started bool, err error) { - p.log.Info("preparing host") + p.log.Info("preparing host", "unprepared", unprepared) started = unprepared return } // Service remove existing configuration and set new configuration. func (p *fixtureProvisioner) Service(_ provisioner.ServicingData, unprepared bool, _ bool) (result provisioner.Result, started bool, err error) { - p.log.Info("servicing host") + p.log.Info("servicing host", "unprepared", unprepared) started = unprepared + if started { + result.Dirty = true + } return }