Skip to content

Commit

Permalink
Add failover history information
Browse files Browse the repository at this point in the history
Signed-off-by: Aditya Addepalli <[email protected]>
  • Loading branch information
Dyex719 authored and mszacillo committed Oct 17, 2024
1 parent 47efa57 commit 185626b
Show file tree
Hide file tree
Showing 14 changed files with 535 additions and 2 deletions.
25 changes: 25 additions & 0 deletions api/openapi-spec/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -20022,6 +20022,23 @@
}
]
},
"com.github.karmada-io.karmada.pkg.apis.work.v1alpha2.FailoverHistoryItem": {
"type": "object",
"properties": {
"failoverTime": {
"description": "FailoverTime represents the timestamp when the workload failed over. It is represented in RFC3339 form(like '2021-04-25T10:02:10Z') and is in UTC.",
"$ref": "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.Time"
},
"originCluster": {
"description": "OriginCluster denotes the name of the cluster from which the workload was failed over.",
"type": "string"
},
"reason": {
"description": "Reason denotes the reason why the workload failed over.",
"type": "string"
}
}
},
"com.github.karmada-io.karmada.pkg.apis.work.v1alpha2.GracefulEvictionTask": {
"description": "GracefulEvictionTask represents a graceful eviction task.",
"type": "object",
Expand Down Expand Up @@ -20332,6 +20349,14 @@
"$ref": "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.Condition"
}
},
"failoverHistory": {
"description": "FailoverHistory represents history of the previous failovers of this resource",
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/com.github.karmada-io.karmada.pkg.apis.work.v1alpha2.FailoverHistoryItem"
}
},
"lastScheduledTime": {
"description": "LastScheduledTime representing the latest timestamp when scheduler successfully finished a scheduling. It is represented in RFC3339 form (like '2006-01-02T15:04:05Z') and is in UTC.",
"$ref": "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.Time"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1339,6 +1339,50 @@ spec:
- type
type: object
type: array
failoverHistory:
description: FailoverHistory represents the history of the failover
of the resource
items:
description: FailoverHistoryItem represents either a failover event
in the history.
properties:
failoverTime:
description: StartTime is the timestamp of when the failover
occurred.
type: object
originCluster:
description: OriginCluster is the name of the cluster from which
the application migrated.
type: string
originalClusters:
description: ClustersBeforeFailover records the clusters where
running the application before failover.
items:
type: string
type: array
preservedLabelState:
additionalProperties:
type: string
description: |-
PreservedLabelState represents the application state information collected from the original cluster,
and it will be injected into the new cluster in the form of application labels.
type: object
reason:
description: Reason denotes the type of failover.
type: string
targetClusters:
description: ClustersAfterFailover records the clusters where
running the application after failover.
items:
type: string
type: array
required:
- failoverTime
- originCluster
- originalClusters
- reason
type: object
type: array
lastScheduledTime:
description: |-
LastScheduledTime representing the latest timestamp when scheduler successfully finished a scheduling.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1339,6 +1339,50 @@ spec:
- type
type: object
type: array
failoverHistory:
description: FailoverHistory represents the history of the failover
of the resource
items:
description: FailoverHistoryItem represents either a failover event
in the history.
properties:
failoverTime:
description: StartTime is the timestamp of when the failover
occurred.
type: object
originCluster:
description: OriginCluster is the name of the cluster from which
the application migrated.
type: string
originalClusters:
description: ClustersBeforeFailover records the clusters where
running the application before failover.
items:
type: string
type: array
preservedLabelState:
additionalProperties:
type: string
description: |-
PreservedLabelState represents the application state information collected from the original cluster,
and it will be injected into the new cluster in the form of application labels.
type: object
reason:
description: Reason denotes the type of failover.
type: string
targetClusters:
description: ClustersAfterFailover records the clusters where
running the application after failover.
items:
type: string
type: array
required:
- failoverTime
- originCluster
- originalClusters
- reason
type: object
type: array
lastScheduledTime:
description: |-
LastScheduledTime representing the latest timestamp when scheduler successfully finished a scheduling.
Expand Down
5 changes: 5 additions & 0 deletions pkg/apis/policy/v1alpha1/propagation_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,9 @@ type FailoverBehavior struct {
// If this value is nil, failover is disabled.
// +optional
// Cluster *ClusterFailoverBehavior `json:"cluster,omitempty"`

// TODO: Consider moving StatePreservation out of Application, so that
// the cluster failover scenario can share it.
}

// ApplicationFailoverBehavior indicates application failover behaviors.
Expand Down Expand Up @@ -318,6 +321,8 @@ type ApplicationFailoverBehavior struct {
// Value must be positive integer.
// +optional
GracePeriodSeconds *int32 `json:"gracePeriodSeconds,omitempty"`

// TODO: Add StatePreservation attribute for stateful failover use-cases
}

// DecisionConditions represents the decision conditions of performing the failover process.
Expand Down
43 changes: 43 additions & 0 deletions pkg/apis/work/v1alpha2/binding_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,10 @@ type ResourceBindingStatus struct {
// AggregatedStatus represents status list of the resource running in each member cluster.
// +optional
AggregatedStatus []AggregatedStatusItem `json:"aggregatedStatus,omitempty"`

// FailoverHistory represents the history of the failover of the resource
// +optional
FailoverHistory []FailoverHistoryItem `json:"failoverHistory,omitempty"`
}

// AggregatedStatusItem represents status of the resource running in a member cluster.
Expand Down Expand Up @@ -362,6 +366,45 @@ type AggregatedStatusItem struct {
Health ResourceHealth `json:"health,omitempty"`
}

// FailoverHistoryItem represents a single failover event in the failover history.
type FailoverHistoryItem struct {
// OriginCluster is the name of the cluster from which the application migrated.
// +required
OriginCluster string `json:"originCluster"`

// Reason denotes the type of failover.
// +required
Reason FailoverReason `json:"reason"`

// StartTime is the timestamp of when the failover occurred.
// +required
StartTime metav1.Time `json:"failoverTime"`

// ClustersBeforeFailover records the clusters where the application was running before the failover.
// +required
ClustersBeforeFailover []string `json:"originalClusters"`

// ClustersAfterFailover records the clusters where the application is running after the failover.
// +optional
ClustersAfterFailover []string `json:"targetClusters,omitempty"`

// PreservedLabelState represents the application state information collected from the original cluster,
// and it will be injected into the new cluster in the form of application labels.
// +optional
PreservedLabelState map[string]string `json:"preservedLabelState,omitempty"`
}

// FailoverReason represents the reason for the failover.
type FailoverReason string

// Failover reasons recorded in FailoverHistoryItem.Reason.
const (
// ClusterFailover indicates that the failover is due to cluster issues.
ClusterFailover FailoverReason = "ClusterFailover"

// ApplicationFailover indicates that the failover is due to application issues,
// handled by health interpretation.
ApplicationFailover FailoverReason = "ApplicationFailover"
)

// Conditions definition
const (
// Scheduled represents the condition that the ResourceBinding or ClusterResourceBinding has been scheduled.
Expand Down
6 changes: 6 additions & 0 deletions pkg/apis/work/v1alpha2/well_known_constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,12 @@ const (
// Additional options will be added here in the future.
DeletionProtectionLabelKey = "resourcetemplate.karmada.io/deletion-protected"
DeletionProtectionAlways = "Always"

// ResourceBindingFailoverLabel If a resource is failed over by karmada, this label will be attached
// to the rescheduled workload. The value will denote the type of failover that occurred, either cluster or application.
// This can be useful if applications are stateful and need to know when they have been failed over by Karmada,
// as opposed to being scheduled fresh.
ResourceBindingFailoverLabel = "resourcebinding.karmada.io/failover-type"
)

// Define eviction reasons.
Expand Down
41 changes: 41 additions & 0 deletions pkg/apis/work/v1alpha2/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
configv1alpha1 "github.com/karmada-io/karmada/pkg/apis/config/v1alpha1"
policyv1alpha1 "github.com/karmada-io/karmada/pkg/apis/policy/v1alpha1"
workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
controllerUtils "github.com/karmada-io/karmada/pkg/controllers/utils"
"github.com/karmada-io/karmada/pkg/features"
"github.com/karmada-io/karmada/pkg/resourceinterpreter"
"github.com/karmada-io/karmada/pkg/sharedcli/ratelimiterflag"
Expand Down Expand Up @@ -154,6 +155,9 @@ func (c *RBApplicationFailoverController) syncBinding(ctx context.Context, bindi

func (c *RBApplicationFailoverController) evictBinding(binding *workv1alpha2.ResourceBinding, clusters []string) error {
for _, cluster := range clusters {
if err := controllerUtils.UpdateFailoverStatus(c.Client, binding, cluster, workv1alpha2.EvictionReasonApplicationFailure); err != nil {
klog.Errorf("Failed to update status with failover information. Error: %v", err)
}
switch binding.Spec.Failover.Application.PurgeMode {
case policyv1alpha1.Graciously:
if features.FeatureGate.Enabled(features.GracefulEviction) {
Expand Down
28 changes: 28 additions & 0 deletions pkg/controllers/binding/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,17 @@ func mergeTargetClusters(targetClusters []workv1alpha2.TargetCluster, requiredBy
func mergeLabel(workload *unstructured.Unstructured, binding metav1.Object, scope apiextensionsv1.ResourceScope) map[string]string {
var workLabel = make(map[string]string)
if scope == apiextensionsv1.NamespaceScoped {
namespaceBindingObj := binding.(*workv1alpha2.ResourceBinding)
failoverReason := checkFailoverHistory(namespaceBindingObj)
if failoverReason != "" {
if failoverReason == workv1alpha2.EvictionReasonApplicationFailure {
util.MergeLabel(workload, workv1alpha2.ResourceBindingFailoverLabel, "application")
workLabel[workv1alpha2.ResourceBindingFailoverLabel] = "application"
} else if failoverReason == workv1alpha2.EvictionReasonTaintUntolerated {
util.MergeLabel(workload, workv1alpha2.ResourceBindingFailoverLabel, "cluster")
workLabel[workv1alpha2.ResourceBindingFailoverLabel] = "cluster"
}
}
bindingID := util.GetLabelValue(binding.GetLabels(), workv1alpha2.ResourceBindingPermanentIDLabel)
util.MergeLabel(workload, workv1alpha2.ResourceBindingPermanentIDLabel, bindingID)
workLabel[workv1alpha2.ResourceBindingPermanentIDLabel] = bindingID
Expand All @@ -175,6 +186,23 @@ func mergeLabel(workload *unstructured.Unstructured, binding metav1.Object, scop
return workLabel
}

// checkFailoverHistory inspects the failover history recorded in the status of
// the given ResourceBinding and translates the reason of the most recent entry
// into the corresponding eviction reason. mergeLabel uses the result to decide
// whether, and with which value, the failover label is attached to the cloned
// workload.
//
// Only the last history entry is consulted. It returns:
//   - workv1alpha2.EvictionReasonTaintUntolerated for a cluster failover,
//   - workv1alpha2.EvictionReasonApplicationFailure for an application failover,
//   - "" when the binding is nil, has no failover history, or the recorded
//     reason is not one of the known FailoverReason values.
func checkFailoverHistory(resourceBinding *workv1alpha2.ResourceBinding) string {
	// Guard against a nil binding so callers get "no failover" instead of a panic.
	if resourceBinding == nil {
		return ""
	}

	history := resourceBinding.Status.FailoverHistory
	if len(history) == 0 {
		return ""
	}

	// Compare against the typed FailoverReason constants rather than repeating
	// their string values, so this stays in sync with the API definition.
	switch history[len(history)-1].Reason {
	case workv1alpha2.ClusterFailover:
		return workv1alpha2.EvictionReasonTaintUntolerated
	case workv1alpha2.ApplicationFailover:
		return workv1alpha2.EvictionReasonApplicationFailure
	default:
		return ""
	}
}

func mergeAnnotations(workload *unstructured.Unstructured, binding metav1.Object, scope apiextensionsv1.ResourceScope) map[string]string {
annotations := make(map[string]string)
if workload.GetGeneration() > 0 {
Expand Down
5 changes: 5 additions & 0 deletions pkg/controllers/cluster/taint_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (

clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
controllerUtils "github.com/karmada-io/karmada/pkg/controllers/utils"
"github.com/karmada-io/karmada/pkg/features"
"github.com/karmada-io/karmada/pkg/util"
"github.com/karmada-io/karmada/pkg/util/fedinformer/keys"
Expand Down Expand Up @@ -170,6 +171,10 @@ func (tc *NoExecuteTaintManager) syncBindingEviction(key util.QueueKey) error {
// Case 2: Need eviction after toleration time. If time is up, do eviction right now.
// Case 3: Tolerate forever, we do nothing.
if needEviction || tolerationTime == 0 {
err := controllerUtils.UpdateFailoverStatus(tc.Client, binding, cluster, workv1alpha2.EvictionReasonTaintUntolerated)
if err != nil {
klog.Errorf("Failed to update status with failover information. Error: %v", err)
}
// update final result to evict the target cluster
if features.FeatureGate.Enabled(features.GracefulEviction) {
binding.Spec.GracefulEvictCluster(cluster, workv1alpha2.NewTaskOptions(workv1alpha2.WithProducer(workv1alpha2.EvictionProducerTaintManager), workv1alpha2.WithReason(workv1alpha2.EvictionReasonTaintUntolerated)))
Expand Down
Loading

0 comments on commit 185626b

Please sign in to comment.