Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add failover history information #5251

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions api/openapi-spec/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -20022,6 +20022,43 @@
}
]
},
"com.github.karmada-io.karmada.pkg.apis.work.v1alpha2.FailoverHistoryItem": {
"description": "FailoverHistoryItem represents either a failover event in the history.",
"type": "object",
"required": [
"reason",
"failoverTime",
"originalCluster"
],
"properties": {
"failoverTime": {
"description": "StartTime is the timestamp of when the failover occurred.",
"$ref": "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.Time"
},
"originalCluster": {
"description": "ClustersBeforeFailover records the cluster where the application was running prior to failover.",
"type": "string",
"default": ""
},
"preservedLabelState": {
"description": "PreservedLabelState represents the application state information collected from the original cluster, and it will be injected into the new cluster in the form of application labels.",
"type": "object",
"additionalProperties": {
"type": "string",
"default": ""
}
},
"reason": {
"description": "Reason denotes the type of failover.",
"type": "string",
"default": ""
},
"targetCluster": {
"description": "ClustersAfterFailover records the cluster where the application is running after failover.",
"type": "string"
}
}
},
"com.github.karmada-io.karmada.pkg.apis.work.v1alpha2.GracefulEvictionTask": {
"description": "GracefulEvictionTask represents a graceful eviction task.",
"type": "object",
Expand Down Expand Up @@ -20332,6 +20369,14 @@
"$ref": "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.Condition"
}
},
"failoverHistory": {
"description": "FailoverHistory represents the history of the failover of the resource",
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/com.github.karmada-io.karmada.pkg.apis.work.v1alpha2.FailoverHistoryItem"
}
},
"lastScheduledTime": {
"description": "LastScheduledTime representing the latest timestamp when scheduler successfully finished a scheduling. It is represented in RFC3339 form (like '2006-01-02T15:04:05Z') and is in UTC.",
"$ref": "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.Time"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1339,6 +1339,42 @@ spec:
- type
type: object
type: array
failoverHistory:
description: FailoverHistory represents the history of the failover
of the resource
items:
description: FailoverHistoryItem represents either a failover event
in the history.
properties:
failoverTime:
description: StartTime is the timestamp of when the failover
occurred.
format: date-time
type: string
originalCluster:
description: ClustersBeforeFailover records the cluster where
the application was running prior to failover.
type: string
preservedLabelState:
additionalProperties:
type: string
description: |-
PreservedLabelState represents the application state information collected from the original cluster,
and it will be injected into the new cluster in the form of application labels.
type: object
reason:
description: Reason denotes the type of failover.
type: string
targetCluster:
description: ClustersAfterFailover records the cluster where
the application is running after failover.
type: string
required:
- failoverTime
- originalCluster
- reason
type: object
type: array
lastScheduledTime:
description: |-
LastScheduledTime representing the latest timestamp when scheduler successfully finished a scheduling.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1339,6 +1339,42 @@ spec:
- type
type: object
type: array
failoverHistory:
description: FailoverHistory represents the history of the failover
of the resource
items:
description: FailoverHistoryItem represents either a failover event
in the history.
properties:
failoverTime:
description: StartTime is the timestamp of when the failover
occurred.
format: date-time
type: string
originalCluster:
description: ClustersBeforeFailover records the cluster where
the application was running prior to failover.
type: string
preservedLabelState:
additionalProperties:
type: string
description: |-
PreservedLabelState represents the application state information collected from the original cluster,
and it will be injected into the new cluster in the form of application labels.
type: object
reason:
description: Reason denotes the type of failover.
type: string
targetCluster:
description: ClustersAfterFailover records the cluster where
the application is running after failover.
type: string
required:
- failoverTime
- originalCluster
- reason
type: object
type: array
lastScheduledTime:
description: |-
LastScheduledTime representing the latest timestamp when scheduler successfully finished a scheduling.
Expand Down
5 changes: 5 additions & 0 deletions pkg/apis/policy/v1alpha1/propagation_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,9 @@ type FailoverBehavior struct {
// If this value is nil, failover is disabled.
// +optional
// Cluster *ClusterFailoverBehavior `json:"cluster,omitempty"`

// TODO: Consider moving StatePreservation out of Application so that it can
// also be shared with the Cluster failover scenario.
}

// ApplicationFailoverBehavior indicates application failover behaviors.
Expand Down Expand Up @@ -318,6 +321,8 @@ type ApplicationFailoverBehavior struct {
// Value must be positive integer.
// +optional
GracePeriodSeconds *int32 `json:"gracePeriodSeconds,omitempty"`

// TODO: Add StatePreservation attribute for stateful failover use-cases
}

// DecisionConditions represents the decision conditions of performing the failover process.
Expand Down
39 changes: 39 additions & 0 deletions pkg/apis/work/v1alpha2/binding_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,10 @@ type ResourceBindingStatus struct {
// AggregatedStatus represents status list of the resource running in each member cluster.
// +optional
AggregatedStatus []AggregatedStatusItem `json:"aggregatedStatus,omitempty"`

// FailoverHistory represents the history of the failover of the resource
// +optional
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// + optional
// +optional

FailoverHistory []FailoverHistoryItem `json:"failoverHistory,omitempty"`
}

// AggregatedStatusItem represents status of the resource running in a member cluster.
Expand Down Expand Up @@ -362,6 +366,41 @@ type AggregatedStatusItem struct {
Health ResourceHealth `json:"health,omitempty"`
}

// FailoverHistoryItem represents a single failover event in the failover history.
type FailoverHistoryItem struct {
// Reason denotes the type of failover.
// +required
Reason FailoverReason `json:"reason"`

// StartTime is the timestamp of when the failover occurred.
// +required
StartTime metav1.Time `json:"failoverTime"`

// ClusterBeforeFailover records the cluster where the application was running prior to failover.
// +required
ClusterBeforeFailover string `json:"originalCluster"`

// ClusterAfterFailover records the cluster where the application is running after failover.
// +optional
ClusterAfterFailover string `json:"targetCluster,omitempty"`

// PreservedLabelState represents the application state information collected from the original cluster,
// and it will be injected into the new cluster in the form of application labels.
// +optional
PreservedLabelState map[string]string `json:"preservedLabelState,omitempty"`
}

// FailoverReason represents the reason for a failover, as recorded in
// FailoverHistoryItem.Reason.
type FailoverReason string

const (
// ClusterFailover indicates that the failover is due to cluster issues.
ClusterFailover FailoverReason = "ClusterFailover"

// ApplicationFailover indicates that the failover is due to application issues,
// handled by health interpretation.
ApplicationFailover FailoverReason = "ApplicationFailover"
)

// Conditions definition
const (
// Scheduled represents the condition that the ResourceBinding or ClusterResourceBinding has been scheduled.
Expand Down
6 changes: 6 additions & 0 deletions pkg/apis/work/v1alpha2/well_known_constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,12 @@ const (
// Additional options will be added here in the future.
DeletionProtectionLabelKey = "resourcetemplate.karmada.io/deletion-protected"
DeletionProtectionAlways = "Always"

// ResourceBindingFailoverLabel is the label attached to the rescheduled workload when a resource is failed over
// by Karmada. The value denotes the type of failover that occurred, either cluster or application.
// This can be useful if applications are stateful and need to know when they have been failed over by Karmada,
// as opposed to being scheduled fresh.
ResourceBindingFailoverLabel = "resourcebinding.karmada.io/failover-type"
)

// Define eviction reasons.
Expand Down
31 changes: 31 additions & 0 deletions pkg/apis/work/v1alpha2/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
configv1alpha1 "github.com/karmada-io/karmada/pkg/apis/config/v1alpha1"
policyv1alpha1 "github.com/karmada-io/karmada/pkg/apis/policy/v1alpha1"
workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
controllerUtils "github.com/karmada-io/karmada/pkg/controllers/utils"
"github.com/karmada-io/karmada/pkg/features"
"github.com/karmada-io/karmada/pkg/resourceinterpreter"
"github.com/karmada-io/karmada/pkg/sharedcli/ratelimiterflag"
Expand Down Expand Up @@ -154,6 +155,9 @@ func (c *RBApplicationFailoverController) syncBinding(ctx context.Context, bindi

func (c *RBApplicationFailoverController) evictBinding(binding *workv1alpha2.ResourceBinding, clusters []string) error {
for _, cluster := range clusters {
if err := controllerUtils.UpdateFailoverStatus(c.Client, binding, cluster, workv1alpha2.EvictionReasonApplicationFailure); err != nil {
klog.Errorf("Failed to update status with failover information. Error: %v", err)
}
switch binding.Spec.Failover.Application.PurgeMode {
case policyv1alpha1.Graciously:
if features.FeatureGate.Enabled(features.GracefulEviction) {
Expand Down
28 changes: 28 additions & 0 deletions pkg/controllers/binding/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,17 @@ func mergeTargetClusters(targetClusters []workv1alpha2.TargetCluster, requiredBy
func mergeLabel(workload *unstructured.Unstructured, binding metav1.Object, scope apiextensionsv1.ResourceScope) map[string]string {
var workLabel = make(map[string]string)
if scope == apiextensionsv1.NamespaceScoped {
namespaceBindingObj := binding.(*workv1alpha2.ResourceBinding)
failoverReason := checkFailoverHistory(namespaceBindingObj)
if failoverReason != "" {
if failoverReason == workv1alpha2.EvictionReasonApplicationFailure {
util.MergeLabel(workload, workv1alpha2.ResourceBindingFailoverLabel, "application")
workLabel[workv1alpha2.ResourceBindingFailoverLabel] = "application"
} else if failoverReason == workv1alpha2.EvictionReasonTaintUntolerated {
util.MergeLabel(workload, workv1alpha2.ResourceBindingFailoverLabel, "cluster")
workLabel[workv1alpha2.ResourceBindingFailoverLabel] = "cluster"
}
}
bindingID := util.GetLabelValue(binding.GetLabels(), workv1alpha2.ResourceBindingPermanentIDLabel)
util.MergeLabel(workload, workv1alpha2.ResourceBindingPermanentIDLabel, bindingID)
workLabel[workv1alpha2.ResourceBindingPermanentIDLabel] = bindingID
Expand All @@ -175,6 +186,23 @@ func mergeLabel(workload *unstructured.Unstructured, binding metav1.Object, scop
return workLabel
}

// checkFailoverHistory inspects the most recent entry of the binding's
// failover history and translates its FailoverReason into the matching
// eviction reason. The caller uses the result to decide whether, and with
// which value, the failover label is attached to the cloned workload.
//
// It returns an empty string when the binding is nil, when no failover has
// been recorded, or when the latest entry carries an unrecognized reason.
func checkFailoverHistory(resourceBinding *workv1alpha2.ResourceBinding) string {
	// Guard against a nil binding so callers get "no failover" instead of a panic.
	if resourceBinding == nil {
		return ""
	}

	history := resourceBinding.Status.FailoverHistory
	if len(history) == 0 {
		return ""
	}

	// Only the latest failover is considered. Compare against the typed
	// FailoverReason constants rather than raw string literals so the check
	// cannot silently drift from the API definition.
	switch history[len(history)-1].Reason {
	case workv1alpha2.ClusterFailover:
		return workv1alpha2.EvictionReasonTaintUntolerated
	case workv1alpha2.ApplicationFailover:
		return workv1alpha2.EvictionReasonApplicationFailure
	default:
		return ""
	}
}

func mergeAnnotations(workload *unstructured.Unstructured, binding metav1.Object, scope apiextensionsv1.ResourceScope) map[string]string {
annotations := make(map[string]string)
if workload.GetGeneration() > 0 {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/binding/common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ func Test_mergeLabel(t *testing.T) {
},
},
},
binding: &workv1alpha2.ClusterResourceBinding{
binding: &workv1alpha2.ResourceBinding{
ObjectMeta: metav1.ObjectMeta{
Name: bindingName,
Namespace: namespace,
Expand Down
5 changes: 5 additions & 0 deletions pkg/controllers/cluster/taint_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (

clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
controllerUtils "github.com/karmada-io/karmada/pkg/controllers/utils"
"github.com/karmada-io/karmada/pkg/features"
"github.com/karmada-io/karmada/pkg/util"
"github.com/karmada-io/karmada/pkg/util/fedinformer/keys"
Expand Down Expand Up @@ -170,6 +171,10 @@ func (tc *NoExecuteTaintManager) syncBindingEviction(key util.QueueKey) error {
// Case 2: Need eviction after toleration time. If time is up, do eviction right now.
// Case 3: Tolerate forever, we do nothing.
if needEviction || tolerationTime == 0 {
err := controllerUtils.UpdateFailoverStatus(tc.Client, binding, cluster, workv1alpha2.EvictionReasonTaintUntolerated)
if err != nil {
klog.Errorf("Failed to update status with failover information. Error: %v", err)
}
// update final result to evict the target cluster
if features.FeatureGate.Enabled(features.GracefulEviction) {
binding.Spec.GracefulEvictCluster(cluster, workv1alpha2.NewTaskOptions(workv1alpha2.WithProducer(workv1alpha2.EvictionProducerTaintManager), workv1alpha2.WithReason(workv1alpha2.EvictionReasonTaintUntolerated)))
Expand Down
Loading