Skip to content

Commit

Permalink
Wire in servicing (updating provisioned hosts)
Browse files Browse the repository at this point in the history
Servicing only runs when a host is powered off (either completely or
by rebooting it).

Signed-off-by: Dmitry Tantsur <[email protected]>
  • Loading branch information
dtantsur committed May 2, 2024
1 parent 0219e0e commit 1ad33fc
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 12 deletions.
11 changes: 9 additions & 2 deletions apis/metal3.io/v1alpha1/baremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,10 @@ const (
// OperationalStatusDetached is the status value when the host is
// marked unmanaged via the detached annotation.
OperationalStatusDetached OperationalStatus = "detached"

// OperationalStatusServicing is the status value when the host is
// undergoing servicing (e.g. checking firmware settings).
OperationalStatusServicing OperationalStatus = "servicing"
)

// OperationalStatusAllowed represents the allowed values of OperationalStatus.
Expand Down Expand Up @@ -179,6 +183,9 @@ const (
// DetachError is an error condition occurring when the
// controller is unable to detach the host from the provisioner.
DetachError ErrorType = "detach error"
// ServicingError is an error condition occurring when
// service steps failed.
ServicingError ErrorType = "servicing error"
)

// ErrorTypeAllowed represents the allowed values of ErrorType.
Expand Down Expand Up @@ -767,12 +774,12 @@ type BareMetalHostStatus struct {
// after modifying this file

// OperationalStatus holds the status of the host
// +kubebuilder:validation:Enum="";OK;discovered;error;delayed;detached
// +kubebuilder:validation:Enum="";OK;discovered;error;delayed;detached;servicing
OperationalStatus OperationalStatus `json:"operationalStatus"`

// ErrorType indicates the type of failure encountered when the
// OperationalStatus is OperationalStatusError
// +kubebuilder:validation:Enum=provisioned registration error;registration error;inspection error;preparation error;provisioning error;power management error
// +kubebuilder:validation:Enum=provisioned registration error;registration error;inspection error;preparation error;provisioning error;power management error;servicing error
ErrorType ErrorType `json:"errorType,omitempty"`

// LastUpdated identifies when this status was last observed.
Expand Down
2 changes: 2 additions & 0 deletions config/base/crds/bases/metal3.io_baremetalhosts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,7 @@ spec:
- preparation error
- provisioning error
- power management error
- servicing error
type: string
goodCredentials:
description: the last credentials we were able to validate as working
Expand Down Expand Up @@ -808,6 +809,7 @@ spec:
- error
- delayed
- detached
- servicing
type: string
poweredOn:
description: indicator for whether or not the host is powered on
Expand Down
2 changes: 2 additions & 0 deletions config/render/capm3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,7 @@ spec:
- preparation error
- provisioning error
- power management error
- servicing error
type: string
goodCredentials:
description: the last credentials we were able to validate as working
Expand Down Expand Up @@ -808,6 +809,7 @@ spec:
- error
- delayed
- detached
- servicing
type: string
poweredOn:
description: indicator for whether or not the host is powered on
Expand Down
95 changes: 87 additions & 8 deletions controllers/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,8 @@ func recordActionFailure(info *reconcileInfo, errorType metal3api.ErrorType, err
metal3api.InspectionError: "InspectionError",
metal3api.ProvisioningError: "ProvisioningError",
metal3api.PowerManagementError: "PowerManagementError",
metal3api.PreparationError: "PreparationError",
metal3api.ServicingError: "ServicingError",
}[errorType]

counter := actionFailureCounters.WithLabelValues(eventType)
Expand Down Expand Up @@ -457,9 +459,9 @@ func hasInspectAnnotation(host *metal3api.BareMetalHost) bool {
return false
}

// clearError removes any existing error message.
func clearError(host *metal3api.BareMetalHost) (dirty bool) {
dirty = host.SetOperationalStatus(metal3api.OperationalStatusOK)
// clearErrorWithStatus removes any existing error message and sets operational status.
func clearErrorWithStatus(host *metal3api.BareMetalHost, status metal3api.OperationalStatus) (dirty bool) {
dirty = host.SetOperationalStatus(status)
var emptyErrType metal3api.ErrorType
if host.Status.ErrorType != emptyErrType {
host.Status.ErrorType = emptyErrType
Expand All @@ -472,6 +474,11 @@ func clearError(host *metal3api.BareMetalHost) (dirty bool) {
return dirty
}

// clearError removes any existing error message and puts the host back into
// the OK operational status. It returns the dirty flag reported by
// clearErrorWithStatus.
func clearError(host *metal3api.BareMetalHost) (dirty bool) {
	dirty = clearErrorWithStatus(host, metal3api.OperationalStatusOK)
	return dirty
}

// setErrorMessage updates the ErrorMessage in the host Status struct
// and increases the ErrorCount.
func setErrorMessage(host *metal3api.BareMetalHost, errType metal3api.ErrorType, message string) {
Expand Down Expand Up @@ -1330,6 +1337,66 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio
return actionComplete{}
}

// checkServicing compares the requested firmware configuration and firmware
// settings with what is recorded for the host and, when they differ (or when
// the host is already in the servicing or error operational status), asks the
// provisioner to service the host.
//
// The returned isServicing flag is true when servicing failed, is still in
// progress, or has just finished and the operational status had to be reset.
// A nil result with isServicing == false means there is nothing to do here and
// the caller can carry on with its own work.
func (r *BareMetalHostReconciler) checkServicing(prov provisioner.Provisioner, info *reconcileInfo) (result actionResult, isServicing bool) {
	host := info.host

	var data provisioner.ServicingData

	// Firmware configuration: only passed to the provisioner when the spec
	// no longer matches what was recorded in the provisioning status.
	firmwareChanged := !reflect.DeepEqual(host.Status.Provisioning.Firmware, host.Spec.Firmware)
	if firmwareChanged {
		data.FirmwareConfig = host.Spec.Firmware
	}

	// Firmware settings come from the HostFirmwareSettings lookup.
	settingsChanged, hfs, err := r.getHostFirmwareSettings(info)
	if err != nil {
		return actionError{fmt.Errorf("could not determine updated settings: %w", err)}, false
	}
	if settingsChanged {
		data.ActualFirmwareSettings = hfs.Status.Settings
		data.TargetFirmwareSettings = hfs.Spec.Settings
	}

	changed := firmwareChanged || settingsChanged

	if !changed {
		switch host.Status.OperationalStatus {
		case metal3api.OperationalStatusServicing, metal3api.OperationalStatusError:
			// Even if settings are clean, we need to check the result of
			// the current servicing.
		default:
			// If nothing is going on, return control to the power management.
			return nil, false
		}
	}

	hadServicingError := host.Status.ErrorType == metal3api.ServicingError
	provResult, started, err := prov.Service(data, changed, hadServicingError)
	if err != nil {
		return actionError{fmt.Errorf("error servicing host: %w", err)}, false
	}
	if provResult.ErrorMessage != "" {
		return recordActionFailure(info, metal3api.ServicingError, provResult.ErrorMessage), true
	}

	// wantUpdate decides whether an in-progress result is wrapped in
	// actionUpdate. It starts as "something changed" and is also set when
	// starting the operation flipped the host into the servicing status.
	wantUpdate := changed
	if started && clearErrorWithStatus(host, metal3api.OperationalStatusServicing) {
		if firmwareChanged {
			host.Status.Provisioning.Firmware = host.Spec.Firmware.DeepCopy()
		}
		wantUpdate = true
	}

	if provResult.Dirty {
		// The provisioner is still working: come back after the delay it
		// asked for.
		next := actionContinue{delay: provResult.RequeueAfter}
		if wantUpdate {
			return actionUpdate{next}, true
		}
		return next, true
	}

	// Servicing is finished at this point, clean up operational status.
	if !clearErrorWithStatus(host, metal3api.OperationalStatusOK) {
		return nil, false
	}
	// We need to give the HostFirmwareSettings controller some time to
	// catch up with the changes, otherwise we risk starting the same
	// operation again.
	return actionUpdate{actionContinue{delay: subResourceNotReadyRetryDelay}}, true
}

// Check the current power status against the desired power status.
func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, info *reconcileInfo) actionResult {
var provResult provisioner.Result
Expand All @@ -1343,12 +1410,20 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner,
if hwState.PoweredOn != nil && *hwState.PoweredOn != info.host.Status.PoweredOn {
info.log.Info("updating power status", "discovered", *hwState.PoweredOn)
info.host.Status.PoweredOn = *hwState.PoweredOn
clearError(info.host)
targetOperationalStatus := metal3api.OperationalStatusOK
if info.host.Status.OperationalStatus == metal3api.OperationalStatusServicing {
targetOperationalStatus = metal3api.OperationalStatusServicing
}
clearErrorWithStatus(info.host, targetOperationalStatus)
return actionUpdate{}
}

desiredPowerOnState := info.host.Spec.Online

provState := info.host.Status.Provisioning.State
// Normal reboots only work in provisioned states, changing online is also possible for available hosts.
isProvisioned := provState == metal3api.StateProvisioned || provState == metal3api.StateExternallyProvisioned

if !info.host.Status.PoweredOn {
if _, suffixlessAnnotationExists := info.host.Annotations[metal3api.RebootAnnotationPrefix]; suffixlessAnnotationExists {
delete(info.host.Annotations, metal3api.RebootAnnotationPrefix)
Expand All @@ -1359,11 +1434,15 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner,

return actionContinue{}
}
}

provState := info.host.Status.Provisioning.State
// Normal reboots only work in provisioned states, changing online is also possible for available hosts.
isProvisioned := provState == metal3api.StateProvisioned || provState == metal3api.StateExternallyProvisioned
// Servicing only happens when the host is provisioned and is being powered on
if isProvisioned && desiredPowerOnState {
result, isServicing := r.checkServicing(prov, info)
if result != nil && (result.Dirty() || isServicing) {
return result
}
}
}

desiredReboot, desiredRebootMode := hasRebootAnnotation(info, !isProvisioned)

Expand Down
58 changes: 58 additions & 0 deletions controllers/metal3.io/baremetalhost_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake"
Expand Down Expand Up @@ -791,6 +792,63 @@ func TestRebootWithSuffixedAnnotation(t *testing.T) {
)
}

// TestRebootWithServicing tests a full reboot cycle with the suffixless
// reboot annotation and servicing: the provisioned host is powered off, the
// annotation is removed, the host passes through the servicing operational
// status while still powered off, returns to OK and is finally powered on
// again.
func TestRebootWithServicing(t *testing.T) {
	host := newDefaultHost(t)
	host.Annotations = map[string]string{
		metal3api.RebootAnnotationPrefix: "",
	}
	host.Status.PoweredOn = true
	host.Status.Provisioning.State = metal3api.StateProvisioned
	host.Spec.Online = true
	host.Spec.Image = &metal3api.Image{URL: "foo", Checksum: "123"}
	// A firmware config in the spec is what is expected to trigger servicing.
	host.Spec.Firmware = &metal3api.FirmwareConfig{
		VirtualizationEnabled: ptr.To(true),
	}
	host.Status.Provisioning.Image.URL = "foo"

	r := newTestReconciler(host)

	// The reboot annotation powers the host off.
	tryReconcile(t, r, host,
		func(host *metal3api.BareMetalHost, result reconcile.Result) bool {
			return host.Status.OperationalStatus == metal3api.OperationalStatusOK && !host.Status.PoweredOn
		},
	)

	// The suffixless annotation is removed once the host is off.
	tryReconcile(t, r, host,
		func(host *metal3api.BareMetalHost, result reconcile.Result) bool {
			_, exists := host.Annotations[metal3api.RebootAnnotationPrefix]
			return host.Status.OperationalStatus == metal3api.OperationalStatusOK && !exists
		},
	)

	// Servicing starts while the host is still powered off.
	tryReconcile(t, r, host,
		func(host *metal3api.BareMetalHost, result reconcile.Result) bool {
			return host.Status.OperationalStatus == metal3api.OperationalStatusServicing && !host.Status.PoweredOn
		},
	)

	// Servicing finishes and the operational status goes back to OK.
	tryReconcile(t, r, host,
		func(host *metal3api.BareMetalHost, result reconcile.Result) bool {
			return host.Status.OperationalStatus == metal3api.OperationalStatusOK && !host.Status.PoweredOn
		},
	)

	// The host is powered back on.
	tryReconcile(t, r, host,
		func(host *metal3api.BareMetalHost, result reconcile.Result) bool {
			return host.Status.PoweredOn
		},
	)

	// make sure we don't go into another reboot
	tryReconcile(t, r, host,
		func(host *metal3api.BareMetalHost, result reconcile.Result) bool {
			return host.Status.PoweredOn
		},
	)
}

func getHostSecret(t *testing.T, r *BareMetalHostReconciler, host *metal3api.BareMetalHost) (secret *corev1.Secret) {
t.Helper()
secret = &corev1.Secret{}
Expand Down
7 changes: 5 additions & 2 deletions pkg/provisioner/fixture/fixture.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,15 +199,18 @@ func (p *fixtureProvisioner) UpdateHardwareState() (hwState provisioner.Hardware

// Prepare is the fixture implementation of the preparation step. It only logs
// the call and reports started equal to unprepared; result and err are
// returned as their zero values.
func (p *fixtureProvisioner) Prepare(_ provisioner.PrepareData, unprepared bool, _ bool) (result provisioner.Result, started bool, err error) {
// NOTE(review): the two log calls below look like the removed and the added
// side of one diff line — confirm that only the second one is intended.
p.log.Info("preparing host")
p.log.Info("preparing host", "unprepared", unprepared)
started = unprepared
return
}

// Service is the fixture implementation of the servicing step. It logs the
// call and reports started equal to unprepared. When started is true the
// result is also marked dirty; err is always nil.
func (p *fixtureProvisioner) Service(_ provisioner.ServicingData, unprepared bool, _ bool) (result provisioner.Result, started bool, err error) {
// NOTE(review): the two log calls below look like the removed and the added
// side of one diff line — confirm that only the second one is intended.
p.log.Info("servicing host")
p.log.Info("servicing host", "unprepared", unprepared)
started = unprepared
if started {
// A dirty result is how checkServicing in the controller recognises an
// operation that is still in progress.
result.Dirty = true
}
return
}

Expand Down

0 comments on commit 1ad33fc

Please sign in to comment.