Add HA mode for service-mirror (#11047)

In certain scenarios, the service-mirror may act as a single point of failure. Linkerd's multicluster extension supports an `--ha` mode to increase reliability by adding more replicas; however, it is currently supported only in the gateway.

To avoid the service-mirror being a single point of failure, this change introduces an `--ha` flag for `linkerd multicluster link`. The HA flag will use a set of value overrides that will:

* Configure the service-mirror with affinity and PDB policies to ensure replicas are spread across hosts to protect against (in)voluntary disruptions;
* Configure the service-mirror to run with 3 replicas;
* Configure the service-mirror deployment's rolling strategy to ensure at least one replica is available.

Additionally, with the introduction of leader election, `linkerd mc gateways` displays redundant information since metrics are collected from each pod. This change adds a small lookup table of current Lease claimants. Metrics are extracted only from claimants.

---------

Signed-off-by: Matei David <[email protected]>
linkerd · Jul 17, 2023 · d0e837d · d0e837d
1 parent 748d99e
commit d0e837d
Show file tree

Hide file tree

Showing 9 changed files with 91 additions and 9 deletions.
diff --git a/multicluster/charts/linkerd-multicluster-link/README.md b/multicluster/charts/linkerd-multicluster-link/README.md
@@ -30,12 +30,14 @@ Kubernetes: `>=1.21.0-0`
 | controllerImageVersion | string | `"linkerdVersionValue"` | Tag for the Service Mirror container Docker image |
 | enableHeadlessServices | bool | `false` | Toggle support for mirroring headless services |
 | enablePSP | bool | `false` | Create RoleBindings to associate ServiceAccount of target cluster Service Mirror to the control plane PSP resource. This requires that `enabledPSP` is set to true on the extension and control plane install. Note PSP has been deprecated since k8s v1.21 |
+| enablePodAntiAffinity | bool | `false` | Enables Pod Anti Affinity logic to balance the placement of replicas across hosts and zones for High Availability. Enable this only when you have multiple replicas of components. |
 | gateway.probe.port | int | `4191` | The port used for liveliness probing |
 | logFormat | string | `"plain"` | Log format (`plain` or `json`) |
 | logLevel | string | `"info"` | Log level for the Multicluster components |
 | nodeSelector | object | `{}` | Node selectors for the Service mirror pod |
 | podAnnotations | object | `{}` | Additional annotations to add to all pods |
 | podLabels | object | `{}` | Additional labels to add to all pods |
+| replicas | int | `1` | Number of service mirror replicas to run |
 | resources | object | `{}` | Resources for the Service mirror container |
 | serviceMirrorRetryLimit | int | `3` | Number of times update from the remote cluster is allowed to be requeued (retried) |
 | serviceMirrorUID | int | `2103` | User id under which the Service Mirror shall be ran |

diff --git a/multicluster/charts/linkerd-multicluster-link/templates/service-mirror.yaml b/multicluster/charts/linkerd-multicluster-link/templates/service-mirror.yaml
@@ -1,4 +1,3 @@
----
 kind: ClusterRole
 apiVersion: rbac.authorization.k8s.io/v1
 metadata:
@@ -97,11 +96,16 @@ metadata:
   name: linkerd-service-mirror-{{.Values.targetClusterName}}
   namespace: {{ .Release.Namespace }}
 spec:
-  replicas: 1
+  replicas: {{ .Values.replicas }}
   selector:
     matchLabels:
       component: linkerd-service-mirror
       mirror.linkerd.io/cluster-name: {{.Values.targetClusterName}}
+  {{- if .Values.enablePodAntiAffinity }}
+  strategy:
+    rollingUpdate:
+      maxUnavailable: 1
+  {{- end }}
   template:
     metadata:
       annotations:
@@ -115,6 +119,10 @@ spec:
         mirror.linkerd.io/cluster-name: {{.Values.targetClusterName}}
         {{- with .Values.podLabels }}{{ toYaml . | trim | nindent 8 }}{{- end }}
     spec:
+    {{- if .Values.enablePodAntiAffinity -}}
+    {{- $local := dict "label" "mirror.linkerd.io/cluster-name" "component" .Values.targetClusterName -}}
+    {{- include "linkerd.pod-affinity" $local | nindent 6 -}}
+    {{- end }}
       containers:
       - args:
         - service-mirror
@@ -154,3 +162,21 @@ spec:
       {{- with .Values.tolerations }}
       tolerations: {{ toYaml . | nindent 6 }}
       {{- end }}
+{{- if .Values.enablePodAntiAffinity }}
+---
+kind: PodDisruptionBudget
+apiVersion: policy/v1
+metadata:
+  name: linkerd-service-mirror-{{.Values.targetClusterName}}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    component: linkerd-service-mirror
+  annotations:
+    {{ include "partials.annotations.created-by" . }}
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      component: linkerd-service-mirror
+      mirror.linkerd.io/cluster-name: {{.Values.targetClusterName}}
+{{- end}}
diff --git a/multicluster/charts/linkerd-multicluster-link/values-ha.yaml b/multicluster/charts/linkerd-multicluster-link/values-ha.yaml
@@ -0,0 +1,2 @@
+enablePodAntiAffinity: true
+replicas: 3
diff --git a/multicluster/charts/linkerd-multicluster-link/values.yaml b/multicluster/charts/linkerd-multicluster-link/values.yaml
@@ -11,6 +11,10 @@ podLabels: {}
 commonLabels: {}
 # -- Toggle support for mirroring headless services
 enableHeadlessServices: false
+# -- Enables Pod Anti Affinity logic to balance the placement of replicas
+# across hosts and zones for High Availability.
+# Enable this only when you have multiple replicas of components.
+enablePodAntiAffinity: false
 gateway:
   probe:
     # -- The port used for liveliness probing
@@ -21,6 +25,8 @@ logLevel: info
 logFormat: plain
 # -- Node selectors for the Service mirror pod
 nodeSelector: {}
+# -- Number of service mirror replicas to run
+replicas: 1
 # -- Resources for the Service mirror container
 resources: {}
 # -- Number of times update from the remote cluster is allowed to be requeued

diff --git a/multicluster/cmd/check.go b/multicluster/cmd/check.go
@@ -579,9 +579,21 @@ func (hc *healthChecker) checkIfGatewayMirrorsHaveEndpoints(ctx context.Context,
 			continue
 		}
 
+		lease, err := hc.KubeAPIClient().CoordinationV1().Leases(multiclusterNs.Name).Get(ctx, fmt.Sprintf("service-mirror-write-%s", link.TargetClusterName), metav1.GetOptions{})
+		if err != nil {
+			errors = append(errors, fmt.Errorf("failed to get the service-mirror component Lease for target cluster %s: %w", link.TargetClusterName, err))
+			continue
+		}
+
+		// Build a simple lookup table to retrieve Lease object claimant.
+		// Metrics should only be pulled from claimants as they are the ones
+		// running probes.
+		leaders := make(map[string]struct{})
+		leaders[*lease.Spec.HolderIdentity] = struct{}{}
+
 		// Get and parse the gateway metrics so that we can extract liveness
 		// information.
-		gatewayMetrics := getGatewayMetrics(hc.KubeAPIClient(), pods.Items, wait)
+		gatewayMetrics := getGatewayMetrics(hc.KubeAPIClient(), pods.Items, leaders, wait)
 		if len(gatewayMetrics) != 1 {
 			errors = append(errors, fmt.Errorf("expected exactly one gateway metric for target cluster %s; got %d", link.TargetClusterName, len(gatewayMetrics)))
 			continue

diff --git a/multicluster/cmd/gateways.go b/multicluster/cmd/gateways.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"strings"
 	"sync/atomic"
 	"time"
 
@@ -77,8 +78,27 @@ func newGatewaysCommand() *cobra.Command {
 				os.Exit(1)
 			}
 
+			leases, err := k8sAPI.CoordinationV1().Leases(multiclusterNs.Name).List(cmd.Context(), metav1.ListOptions{})
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "failed to list pods in namespace %s: %s", multiclusterNs.Name, err)
+				os.Exit(1)
+			}
+			// Build a simple lookup table to retrieve Lease object claimants.
+			// Metrics should only be pulled from claimants as they are the ones
+			// running probes.
+			leaders := make(map[string]struct{})
+			for _, lease := range leases.Items {
+				// If the Lease is not used by the service-mirror, or if it does
+				// not have a claimant, then ignore it
+				if !strings.Contains(lease.Name, "service-mirror-write") || lease.Spec.HolderIdentity == nil {
+					continue
+				}
+
+				leaders[*lease.Spec.HolderIdentity] = struct{}{}
+			}
+
 			var statuses []gatewayStatus
-			gatewayMetrics := getGatewayMetrics(k8sAPI, pods.Items, opts.wait)
+			gatewayMetrics := getGatewayMetrics(k8sAPI, pods.Items, leaders, opts.wait)
 			for _, gateway := range gatewayMetrics {
 				if gateway.err != nil {
 					fmt.Fprintf(os.Stderr, "Failed to get gateway status for %s: %s\n", gateway.clusterName, gateway.err)
@@ -159,11 +179,15 @@ func newGatewaysCommand() *cobra.Command {
 	return cmd
 }
 
-func getGatewayMetrics(k8sAPI *k8s.KubernetesAPI, pods []corev1.Pod, wait time.Duration) []gatewayMetrics {
+func getGatewayMetrics(k8sAPI *k8s.KubernetesAPI, pods []corev1.Pod, leaders map[string]struct{}, wait time.Duration) []gatewayMetrics {
 	var metrics []gatewayMetrics
 	metricsChan := make(chan gatewayMetrics)
 	var activeRoutines int32
 	for _, pod := range pods {
+		if _, found := leaders[pod.Name]; !found {
+			continue
+		}
+
 		atomic.AddInt32(&activeRoutines, 1)
 		go func(p corev1.Pod) {
 			defer atomic.AddInt32(&activeRoutines, -1)

diff --git a/multicluster/cmd/link.go b/multicluster/cmd/link.go
@@ -47,6 +47,7 @@ type (
 		selector                string
 		gatewayAddresses        string
 		gatewayPort             uint32
+		ha                      bool
 	}
 )
 
@@ -255,6 +256,16 @@ A full list of configurable values can be found at https://github.com/linkerd/li
 				return err
 			}
 
+			if opts.ha {
+				if valuesOverrides, err = charts.OverrideFromFile(
+					valuesOverrides,
+					static.Templates,
+					helmMulticlusterLinkDefaultChartName,
+					"values-ha.yaml",
+				); err != nil {
+					return err
+				}
+			}
 			serviceMirrorOut, err := renderServiceMirror(values, valuesOverrides, opts.namespace)
 			if err != nil {
 				return err
@@ -287,6 +298,7 @@ A full list of configurable values can be found at https://github.com/linkerd/li
 	cmd.Flags().StringVarP(&opts.selector, "selector", "l", opts.selector, "Selector (label query) to filter which services in the target cluster to mirror")
 	cmd.Flags().StringVar(&opts.gatewayAddresses, "gateway-addresses", opts.gatewayAddresses, "If specified, overwrites gateway addresses when gateway service is not type LoadBalancer (comma separated list)")
 	cmd.Flags().Uint32Var(&opts.gatewayPort, "gateway-port", opts.gatewayPort, "If specified, overwrites gateway port when gateway service is not type LoadBalancer")
+	cmd.Flags().BoolVar(&opts.ha, "ha", opts.ha, "Enable HA configuration for the service-mirror deployment (default false)")
 
 	pkgcmd.ConfigureNamespaceFlagCompletion(
 		cmd, []string{"namespace", "gateway-namespace"},
@@ -384,6 +396,7 @@ func newLinkOptionsWithDefault() (*linkOptions, error) {
 		selector:                fmt.Sprintf("%s=%s", k8s.DefaultExportedServiceSelector, "true"),
 		gatewayAddresses:        "",
 		gatewayPort:             0,
+		ha:                      false,
 	}, nil
 }
 

diff --git a/multicluster/cmd/service-mirror/main.go b/multicluster/cmd/service-mirror/main.go
@@ -183,10 +183,6 @@ func Main(args []string) {
 		LeaseMeta: metav1.ObjectMeta{
 			Name:      fmt.Sprintf("service-mirror-write-%s", linkName),
 			Namespace: *namespace,
-			Labels: map[string]string{
-				"component":                      "linkerd-service-mirror",
-				"mirror.linkerd.io/cluster-name": linkName,
-			},
 		},
 		Client: k8sAPI.CoordinationV1(),
 		LockConfig: resourcelock.ResourceLockConfig{

diff --git a/multicluster/values/values.go b/multicluster/values/values.go
@@ -31,6 +31,7 @@ type Values struct {
 	LogFormat                      string   `json:"logFormat"`
 	ServiceMirrorRetryLimit        uint32   `json:"serviceMirrorRetryLimit"`
 	ServiceMirrorUID               int64    `json:"serviceMirrorUID"`
+	Replicas                       uint32   `json:"replicas"`
 	RemoteMirrorServiceAccount     bool     `json:"remoteMirrorServiceAccount"`
 	RemoteMirrorServiceAccountName string   `json:"remoteMirrorServiceAccountName"`
 	TargetClusterName              string   `json:"targetClusterName"`