
Commit

Fixing race condition during cluster upgrade with kernel upgrade (#1222)
The following scenario causes worker pods to get stuck during cluster
upgrade:
1) The kernel module is loaded on the node. The NMC contains both a Spec and
   a Status referencing the current kernel version.
2) The cluster upgrade starts. As part of the upgrade, the node becomes
   Unschedulable.
3) Module-NMC removes the Spec from the NMC, since the node is Unschedulable.
4) The node becomes Schedulable again.
5) The NMC controller tries to unload the kernel module using the NMC Status
   configuration, which references the old kernel.
6) The worker unload pods get stuck in Error, since the node is running the
   new kernel.
7) Module-NMC updates the Spec of the NMC, but since the worker pod exists,
   nothing is done.

Solution:
   When processing orphaned NMC statuses (a Status exists but the Spec does
not), the NMC controller should ignore modules whose Status was created
prior to the node's Ready timestamp; a sketch of the underlying check follows below.
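
The fix leans on the node-API helper NodeBecomeReadyAfter, which already gates loader-Pod creation in ProcessModuleSpec. As a rough sketch only (the real helper lives in KMM's internal node package and may be implemented differently), such a check could compare the Ready condition's last transition time against the module status timestamp:

package node

import (
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// nodeBecomeReadyAfter is an illustrative stand-in for the real helper: it
// reports whether the node's Ready condition last transitioned to True after
// the given timestamp, i.e. the node rebooted (possibly into a new kernel)
// after the module status was recorded.
func nodeBecomeReadyAfter(node *v1.Node, ts metav1.Time) bool {
	for _, cond := range node.Status.Conditions {
		if cond.Type == v1.NodeReady &&
			cond.Status == v1.ConditionTrue &&
			cond.LastTransitionTime.After(ts.Time) {
			return true
		}
	}
	return false
}

With a check like this in place, ProcessUnconfiguredModuleStatus can return early for statuses recorded before the node last became Ready, leaving the stale status to be reconciled once Module-NMC updates the Spec for the new kernel.
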
yevgeny-shnaidman authored Sep 25, 2024
1 parent aed4d51 commit 898c278
Showing 3 changed files with 93 additions and 85 deletions.
16 changes: 8 additions & 8 deletions internal/controllers/mock_nmc_reconciler.go

Some generated files are not rendered by default.

35 changes: 20 additions & 15 deletions internal/controllers/nmc_reconciler.go
@@ -125,14 +125,19 @@ func (r *NMCReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
statusMap[status.Namespace+"/"+status.Name] = &nmcObj.Status.Modules[i]
}

node := v1.Node{}
if err := r.client.Get(ctx, types.NamespacedName{Name: nmcObj.Name}, &node); err != nil {
return ctrl.Result{}, fmt.Errorf("could not get node %s: %v", nmcObj.Name, err)
}

errs := make([]error, 0, len(nmcObj.Spec.Modules)+len(nmcObj.Status.Modules))

for _, mod := range nmcObj.Spec.Modules {
moduleNameKey := mod.Namespace + "/" + mod.Name

logger := logger.WithValues("module", moduleNameKey)

if err := r.helper.ProcessModuleSpec(ctrl.LoggerInto(ctx, logger), &nmcObj, &mod, statusMap[moduleNameKey]); err != nil {
if err := r.helper.ProcessModuleSpec(ctrl.LoggerInto(ctx, logger), &nmcObj, &mod, statusMap[moduleNameKey], &node); err != nil {
errs = append(
errs,
fmt.Errorf("error processing Module %s: %v", moduleNameKey, err),
@@ -150,7 +155,7 @@ func (r *NMCReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
for statusNameKey, status := range statusMap {
logger := logger.WithValues("status", statusNameKey)

if err := r.helper.ProcessUnconfiguredModuleStatus(ctrl.LoggerInto(ctx, logger), &nmcObj, status); err != nil {
if err := r.helper.ProcessUnconfiguredModuleStatus(ctrl.LoggerInto(ctx, logger), &nmcObj, status, &node); err != nil {
errs = append(
errs,
fmt.Errorf("error processing orphan status for Module %s: %v", statusNameKey, err),
@@ -161,10 +166,6 @@ func (r *NMCReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
if err := r.helper.GarbageCollectInUseLabels(ctx, &nmcObj); err != nil {
errs = append(errs, fmt.Errorf("failed to GC in-use labels for NMC %s: %v", req.NamespacedName, err))
}
node := v1.Node{}
if err := r.client.Get(ctx, types.NamespacedName{Name: nmcObj.Name}, &node); err != nil {
return ctrl.Result{}, fmt.Errorf("could not get node %s: %v", nmcObj.Name, err)
}

if loaded, unloaded, err := r.helper.UpdateNodeLabels(ctx, &nmcObj, &node); err != nil {
errs = append(errs, fmt.Errorf("could not update node's labels for NMC %s: %v", req.NamespacedName, err))
@@ -223,8 +224,8 @@ func GetContainerStatus(statuses []v1.ContainerStatus, name string) v1.Container

type nmcReconcilerHelper interface {
GarbageCollectInUseLabels(ctx context.Context, nmc *kmmv1beta1.NodeModulesConfig) error
ProcessModuleSpec(ctx context.Context, nmc *kmmv1beta1.NodeModulesConfig, spec *kmmv1beta1.NodeModuleSpec, status *kmmv1beta1.NodeModuleStatus) error
ProcessUnconfiguredModuleStatus(ctx context.Context, nmc *kmmv1beta1.NodeModulesConfig, status *kmmv1beta1.NodeModuleStatus) error
ProcessModuleSpec(ctx context.Context, nmc *kmmv1beta1.NodeModulesConfig, spec *kmmv1beta1.NodeModuleSpec, status *kmmv1beta1.NodeModuleStatus, node *v1.Node) error
ProcessUnconfiguredModuleStatus(ctx context.Context, nmc *kmmv1beta1.NodeModulesConfig, status *kmmv1beta1.NodeModuleStatus, node *v1.Node) error
RemovePodFinalizers(ctx context.Context, nodeName string) error
SyncStatus(ctx context.Context, nmc *kmmv1beta1.NodeModulesConfig) error
UpdateNodeLabels(ctx context.Context, nmc *kmmv1beta1.NodeModulesConfig, node *v1.Node) ([]types.NamespacedName, []types.NamespacedName, error)
@@ -314,6 +315,7 @@ func (h *nmcReconcilerHelperImpl) ProcessModuleSpec(
nmcObj *kmmv1beta1.NodeModulesConfig,
spec *kmmv1beta1.NodeModuleSpec,
status *kmmv1beta1.NodeModuleStatus,
node *v1.Node,
) error {
podName := workerPodName(nmcObj.Name, spec.Name)

@@ -344,13 +346,7 @@ func (h *nmcReconcilerHelperImpl) ProcessModuleSpec(
return h.pm.CreateLoaderPod(ctx, nmcObj, spec)
}

node := v1.Node{}

if err = h.client.Get(ctx, types.NamespacedName{Name: nmcObj.Name}, &node); err != nil {
return fmt.Errorf("could not get node %s: %v", nmcObj.Name, err)
}

if h.nodeAPI.NodeBecomeReadyAfter(&node, status.LastTransitionTime) {
if h.nodeAPI.NodeBecomeReadyAfter(node, status.LastTransitionTime) {
logger.Info("node has been rebooted and become ready after kernel module was loaded; creating loader Pod")
return h.pm.CreateLoaderPod(ctx, nmcObj, spec)
}
@@ -392,11 +388,20 @@ func (h *nmcReconcilerHelperImpl) ProcessUnconfiguredModuleStatus(
ctx context.Context,
nmcObj *kmmv1beta1.NodeModulesConfig,
status *kmmv1beta1.NodeModuleStatus,
node *v1.Node,
) error {
podName := workerPodName(nmcObj.Name, status.Name)

logger := ctrl.LoggerFrom(ctx).WithValues("pod name", podName)

/* The node was rebooted and the spec is not set, so no kernel module is loaded
   and there is nothing to unload. This also covers the scenario where the node's
   kernel was upgraded, in which case the unload pod would fail anyway.
*/
if h.nodeAPI.NodeBecomeReadyAfter(node, status.LastTransitionTime) {
logger.Info("node was rebooted, no need to unload kernel module that is not present in kernel, will wait until NMC spec is updated")
return nil
}

pod, err := h.pm.GetWorkerPod(ctx, podName, status.Namespace)
if err != nil {
return fmt.Errorf("error while getting the worker Pod %s: %v", podName, err)