Skip to content

Commit

Permalink
improve finalising logic for canary release (#229)
Browse files Browse the repository at this point in the history
improve finalising logic for canary release-2

Signed-off-by: yunbo <[email protected]>
Co-authored-by: yunbo <[email protected]>
  • Loading branch information
myname4423 and Funinu authored Sep 4, 2024
1 parent 5378dc2 commit d261313
Show file tree
Hide file tree
Showing 9 changed files with 287 additions and 72 deletions.
2 changes: 0 additions & 2 deletions api/v1alpha1/rollout_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,6 @@ type RolloutStatus struct {
// Conditions a list of conditions a rollout can have.
// +optional
Conditions []RolloutCondition `json:"conditions,omitempty"`
// +optional
//BlueGreenStatus *BlueGreenStatus `json:"blueGreenStatus,omitempty"`
// Phase is the rollout phase.
Phase RolloutPhase `json:"phase,omitempty"`
// Message provides details on why the rollout is in its current phase
Expand Down
24 changes: 12 additions & 12 deletions api/v1beta1/rollout_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ type CanaryStatus struct {
// BlueGreenStatus status fields that only pertain to the blueGreen rollout
type BlueGreenStatus struct {
CommonStatus `json:",inline"`
// CanaryRevision is calculated by rollout based on podTemplateHash, and the internal logic flow uses
// UpdatedRevision is calculated by rollout based on podTemplateHash, and the internal logic flow uses
// It may be different from rs podTemplateHash in different k8s versions, so it cannot be used as service selector label
UpdatedRevision string `json:"updatedRevision"`
// UpdatedReplicas the numbers of updated pods
Expand Down Expand Up @@ -558,29 +558,29 @@ const (
type FinalisingStepType string

const (
// some work that should be done before pod scaling down.
// For BlueGreenStrategy:
// we rout all traffic to stable or new version based on FinaliseReason
// For CanaryStrategy:
// we remove the selector of stable service
FinalisingStepTypePreparing FinalisingStepType = "Preparing"
// Route all traffic to stable or new version based on FinaliseReason (for bluegreen)
FinalisingStepTypeRouteAllTraffic FinalisingStepType = "RouteAllTraffic"
// Restore the stable Service, i.e. remove corresponding selector
FinalisingStepTypeStableService FinalisingStepType = "RestoreStableService"
// Remove the canary Service
FinalisingStepTypeRemoveCanaryService FinalisingStepType = "RemoveCanaryService"

// Patch Batch Release to scale down (exception: the canary Deployment will be
// scaled down in FinalisingStepTypeDeleteBR step)
// For Both BlueGreenStrategy and CanaryStrategy:
// set workload.pause=false, set workload.partition=0
FinalisingStepTypeBatchRelease FinalisingStepType = "PatchBatchRelease"
//TODO - Currently, the next three steps are in the same function, FinalisingTrafficRouting
// we should try to separate the FinalisingStepTypeGateway and FinalisingStepTypeCanaryService
// with graceful time to prevent some potential issues

// Restore the stable Service (i.e. remove corresponding selector)
FinalisingStepTypeStableService FinalisingStepType = "RestoreStableService"
// Execute the FinalisingTrafficRouting function
FinalisingStepTypeTrafficRouting FinalisingStepType = "FinalisingTrafficRouting"
// Restore the GatewayAPI/Ingress/Istio
FinalisingStepTypeGateway FinalisingStepType = "RestoreGateway"
// Delete Canary Service
FinalisingStepTypeDeleteCanaryService FinalisingStepType = "DeleteCanaryService"
// Delete Batch Release
FinalisingStepTypeDeleteBR FinalisingStepType = "DeleteBatchRelease"
// All needed work done
FinalisingStepTypeEnd FinalisingStepType = "END"
)

// +genclient
Expand Down
5 changes: 2 additions & 3 deletions config/crd/bases/rollouts.kruise.io_rollouts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -534,8 +534,7 @@ spec:
format: int64
type: integer
phase:
description: BlueGreenStatus *BlueGreenStatus `json:"blueGreenStatus,omitempty"`
Phase is the rollout phase.
description: Phase is the rollout phase.
type: string
type: object
type: object
Expand Down Expand Up @@ -1475,7 +1474,7 @@ spec:
format: int32
type: integer
updatedRevision:
description: CanaryRevision is calculated by rollout based on
description: UpdatedRevision is calculated by rollout based on
podTemplateHash, and the internal logic flow uses It may be
different from rs podTemplateHash in different k8s versions,
so it cannot be used as service selector label
Expand Down
142 changes: 103 additions & 39 deletions pkg/controller/rollout/rollout_canary.go
Original file line number Diff line number Diff line change
Expand Up @@ -363,43 +363,66 @@ func (m *canaryReleaseManager) doCanaryJump(c *RolloutContext) (jumped bool) {

// cleanup after rollout is completed or finished
func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, error) {
canaryStatus := c.NewStatus.CanaryStatus
// when CanaryStatus is nil, which means canary action hasn't started yet, don't need doing cleanup
if c.NewStatus.CanaryStatus == nil {
if canaryStatus == nil {
return true, nil
}
// 1. rollout progressing complete, remove rollout progressing annotation in workload
// rollout progressing complete, remove rollout progressing annotation in workload
err := m.removeRolloutProgressingAnnotation(c)
if err != nil {
return false, err
}
tr := newTrafficRoutingContext(c)
// 2. remove stable service the pod revision selector, so stable service will be selector all version pods.
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
if err != nil || !done {
return done, err
}
// 3. set workload.pause=false; set workload.partition=0
done, err = m.finalizingBatchRelease(c)
if err != nil || !done {
return done, err
}
// 4. modify network api(ingress or gateway api) configuration, and route 100% traffic to stable pods.
done, err = m.trafficRoutingManager.FinalisingTrafficRouting(tr)
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
if err != nil || !done {
return done, err
}
// 5. delete batchRelease crd
done, err = m.removeBatchRelease(c)
if err != nil {
klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
return false, err
} else if !done {
// execute steps based on the predefined order for each reason
nextStep := nextTask(c.FinalizeReason, canaryStatus.FinalisingStep)
// if current step is empty, set it with the first step
// if current step is end, we just return
if len(canaryStatus.FinalisingStep) == 0 {
canaryStatus.FinalisingStep = nextStep
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
} else if canaryStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd {
klog.Infof("rollout(%s/%s) finalising process is already completed", c.Rollout.Namespace, c.Rollout.Name)
return true, nil
}
klog.Infof("rollout(%s/%s) Finalising Step is %s", c.Rollout.Namespace, c.Rollout.Name, canaryStatus.FinalisingStep)

var retry bool
// the order of steps is maitained by calculating thenextStep
switch canaryStatus.FinalisingStep {
// set workload.pause=false; set workload.partition=0
case v1beta1.FinalisingStepTypeBatchRelease:
retry, err = m.finalizingBatchRelease(c)
// delete batchRelease
case v1beta1.FinalisingStepTypeDeleteBR:
retry, err = m.removeBatchRelease(c)
// restore the gateway resources (ingress/gatewayAPI/Istio), that means
// only stable Service will accept the traffic
case v1beta1.FinalisingStepTypeGateway:
retry, err = m.trafficRoutingManager.RestoreGateway(tr)
// restore the stable service
case v1beta1.FinalisingStepTypeStableService:
retry, err = m.trafficRoutingManager.RestoreStableService(tr)
// remove canary service
case v1beta1.FinalisingStepTypeRemoveCanaryService:
retry, err = m.trafficRoutingManager.RemoveCanaryService(tr)

default:
nextStep = nextTask(c.FinalizeReason, "")
klog.Warningf("unexpected finalising step, current step(%s), start from the first step(%s)", canaryStatus.FinalisingStep, nextStep)
canaryStatus.FinalisingStep = nextStep
return false, nil
}
klog.Infof("rollout(%s/%s) doCanaryFinalising success", c.Rollout.Namespace, c.Rollout.Name)
return true, nil
if err != nil || retry {
return false, err
}
// current step is done, run the next step
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
canaryStatus.FinalisingStep = nextStep
if canaryStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd {
return true, nil
}
return false, nil
}

func (m *canaryReleaseManager) removeRolloutProgressingAnnotation(c *RolloutContext) error {
Expand Down Expand Up @@ -508,45 +531,47 @@ func createBatchRelease(rollout *v1beta1.Rollout, rolloutID string, batch int32,
return br
}

// bool means if we need retry; if error is not nil, always retry
func (m *canaryReleaseManager) removeBatchRelease(c *RolloutContext) (bool, error) {
batch := &v1beta1.BatchRelease{}
err := m.Get(context.TODO(), client.ObjectKey{Namespace: c.Rollout.Namespace, Name: c.Rollout.Name}, batch)
if err != nil && errors.IsNotFound(err) {
return true, nil
return false, nil
} else if err != nil {
klog.Errorf("rollout(%s/%s) fetch BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name)
return false, err
return true, err
}
if !batch.DeletionTimestamp.IsZero() {
klog.Infof("rollout(%s/%s) BatchRelease is terminating, and wait a moment", c.Rollout.Namespace, c.Rollout.Name)
return false, nil
return true, nil
}

//delete batchRelease
err = m.Delete(context.TODO(), batch)
if err != nil {
klog.Errorf("rollout(%s/%s) delete BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
return false, err
return true, err
}
klog.Infof("rollout(%s/%s) deleting BatchRelease, and wait a moment", c.Rollout.Namespace, c.Rollout.Name)
return false, nil
return true, nil
}

// bool means if we need retry; if error is not nil, always retry
func (m *canaryReleaseManager) finalizingBatchRelease(c *RolloutContext) (bool, error) {
br, err := m.fetchBatchRelease(c.Rollout.Namespace, c.Rollout.Name)
if err != nil {
if errors.IsNotFound(err) {
return true, nil
return false, nil
}
return false, err
return true, err
}
waitReady := c.WaitReady
// The Completed phase means batchRelease controller has processed all it
// should process. If BatchRelease phase is completed, we can do nothing.
if br.Spec.ReleasePlan.BatchPartition == nil &&
br.Status.Phase == v1beta1.RolloutPhaseCompleted {
klog.Infof("rollout(%s/%s) finalizing batchRelease(%s) done", c.Rollout.Namespace, c.Rollout.Name, util.DumpJSON(br.Status))
return true, nil
return false, nil
}

// If BatchPartition is nil, BatchRelease will directly resume workload via:
Expand All @@ -559,11 +584,11 @@ func (m *canaryReleaseManager) finalizingBatchRelease(c *RolloutContext) (bool,
switch br.Spec.ReleasePlan.FinalizingPolicy {
case v1beta1.WaitResumeFinalizingPolicyType:
if waitReady { // no need to patch again
return false, nil
return true, nil
}
default:
if !waitReady { // no need to patch again
return false, nil
return true, nil
}
}
}
Expand All @@ -577,10 +602,10 @@ func (m *canaryReleaseManager) finalizingBatchRelease(c *RolloutContext) (bool,
// Patch BatchPartition and FinalizingPolicy, BatchPartition always patch null here.
body := fmt.Sprintf(`{"spec":{"releasePlan":{"batchPartition":null,"finalizingPolicy":"%s"}}}`, policy)
if err = m.Patch(context.TODO(), br, client.RawPatch(types.MergePatchType, []byte(body))); err != nil {
return false, err
return true, err
}
klog.Infof("rollout(%s/%s) patch batchRelease(%s) success", c.Rollout.Namespace, c.Rollout.Name, body)
return false, nil
return true, nil
}

// syncBatchRelease sync status of br to canaryStatus, and sync rollout-id of canaryStatus to br.
Expand All @@ -601,3 +626,42 @@ func (m *canaryReleaseManager) syncBatchRelease(br *v1beta1.BatchRelease, canary
}
return nil
}

// calculate next task
func nextTask(reason string, currentTask v1beta1.FinalisingStepType) v1beta1.FinalisingStepType {
var taskSequence []v1beta1.FinalisingStepType
//REVIEW - should we consider more complex scenarios?
// like, user rollbacks the workload and disables the Rollout at the same time?
switch reason {
// in rollback, we cannot remove selector of the stable service in the first step,
// which may cause some traffic to problematic new version, so we should route all traffic to stable service
// in the first step
case v1beta1.FinaliseReasonRollback: // rollback
taskSequence = []v1beta1.FinalisingStepType{
v1beta1.FinalisingStepTypeGateway, // route all traffic to stable version
v1beta1.FinalisingStepTypeBatchRelease, // scale up old, scale down new
v1beta1.FinalisingStepTypeDeleteBR,
v1beta1.FinalisingStepTypeStableService,
v1beta1.FinalisingStepTypeRemoveCanaryService,
}
default: // others: success/disabled/deleting rollout
taskSequence = []v1beta1.FinalisingStepType{
v1beta1.FinalisingStepTypeStableService,
v1beta1.FinalisingStepTypeGateway,
v1beta1.FinalisingStepTypeRemoveCanaryService,
v1beta1.FinalisingStepTypeBatchRelease, // scale up new, scale down old
v1beta1.FinalisingStepTypeDeleteBR,
}
}
// if currentTask is empty, return first task
if len(currentTask) == 0 {
return taskSequence[0]
}
// find next task
for i := range taskSequence {
if currentTask == taskSequence[i] && i < len(taskSequence)-1 {
return taskSequence[i+1]
}
}
return v1beta1.FinalisingStepTypeEnd
}
12 changes: 8 additions & 4 deletions pkg/controller/rollout/rollout_progressing.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ type RolloutContext struct {
RecheckTime *time.Time
// wait stable workload pods ready
WaitReady bool
// finalising reason
FinalizeReason string
}

// parameter1 retryReconcile, parameter2 error
Expand Down Expand Up @@ -116,6 +118,7 @@ func (r *RolloutReconciler) reconcileRolloutProgressing(rollout *v1beta1.Rollout
klog.Infof("rollout(%s/%s) is Progressing, and in reason(%s)", rollout.Namespace, rollout.Name, cond.Reason)
var done bool
rolloutContext.WaitReady = true
rolloutContext.FinalizeReason = v1beta1.FinaliseReasonSuccess
done, err = r.doFinalising(rolloutContext)
if err != nil {
return nil, err
Expand All @@ -140,6 +143,7 @@ func (r *RolloutReconciler) reconcileRolloutProgressing(rollout *v1beta1.Rollout
case v1alpha1.ProgressingReasonCancelling:
klog.Infof("rollout(%s/%s) is Progressing, and in reason(%s)", rollout.Namespace, rollout.Name, cond.Reason)
var done bool
rolloutContext.FinalizeReason = v1beta1.FinaliseReasonRollback
done, err = r.doFinalising(rolloutContext)
if err != nil {
return nil, err
Expand Down Expand Up @@ -416,11 +420,11 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error)
}
// if no trafficRouting exists, simply remove batchRelease
if !c.Rollout.Spec.Strategy.HasTrafficRoutings() {
done, err := releaseManager.removeBatchRelease(c)
retry, err := releaseManager.removeBatchRelease(c)
if err != nil {
klog.Errorf("rollout(%s/%s) DoFinalising batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
return false, err
} else if !done {
} else if retry {
return false, nil
}
return true, nil
Expand Down Expand Up @@ -454,11 +458,11 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error)
// canary deployment, for other release, the v2 pods won't be deleted immediately
// in both cases, only the stable pods (v1) accept the traffic
case v1beta1.FinalisingStepTypeDeleteBR:
done, err := releaseManager.removeBatchRelease(c)
retry, err := releaseManager.removeBatchRelease(c)
if err != nil {
klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
return false, err
} else if !done {
} else if retry {
return false, nil
}
klog.Infof("rollout(%s/%s) in step (%s), and success", c.Rollout.Namespace, c.Rollout.Name, subStatus.FinalisingStep)
Expand Down
Loading

0 comments on commit d261313

Please sign in to comment.