Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add HA mode for service-mirror #11047

Merged
merged 8 commits into from
Jul 17, 2023
2 changes: 2 additions & 0 deletions multicluster/charts/linkerd-multicluster-link/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,14 @@ Kubernetes: `>=1.21.0-0`
| controllerImageVersion | string | `"linkerdVersionValue"` | Tag for the Service Mirror container Docker image |
| enableHeadlessServices | bool | `false` | Toggle support for mirroring headless services |
| enablePSP | bool | `false` | Create RoleBindings to associate ServiceAccount of target cluster Service Mirror to the control plane PSP resource. This requires that `enablePSP` is set to true on the extension and control plane install. Note PSP has been deprecated since k8s v1.21 |
| enablePodAntiAffinity | bool | `false` | Enables Pod Anti Affinity logic to balance the placement of replicas across hosts and zones for High Availability. Enable this only when you have multiple replicas of components. |
| gateway.probe.port | int | `4191` | The port used for liveness probing |
| logFormat | string | `"plain"` | Log format (`plain` or `json`) |
| logLevel | string | `"info"` | Log level for the Multicluster components |
| nodeSelector | object | `{}` | Node selectors for the Service mirror pod |
| podAnnotations | object | `{}` | Additional annotations to add to all pods |
| podLabels | object | `{}` | Additional labels to add to all pods |
| replicas | int | `1` | Number of service mirror replicas to run |
| resources | object | `{}` | Resources for the Service mirror container |
| serviceMirrorRetryLimit | int | `3` | Number of times update from the remote cluster is allowed to be requeued (retried) |
| serviceMirrorUID | int | `2103` | User id under which the Service Mirror shall be run |
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
Expand Down Expand Up @@ -97,11 +96,16 @@ metadata:
name: linkerd-service-mirror-{{.Values.targetClusterName}}
namespace: {{ .Release.Namespace }}
spec:
replicas: 1
replicas: {{ .Values.replicas }}
selector:
matchLabels:
component: linkerd-service-mirror
mirror.linkerd.io/cluster-name: {{.Values.targetClusterName}}
{{- if .Values.enablePodAntiAffinity }}
strategy:
rollingUpdate:
maxUnavailable: 1
{{- end }}
template:
metadata:
annotations:
Expand Down Expand Up @@ -154,3 +158,21 @@ spec:
{{- with .Values.tolerations }}
tolerations: {{ toYaml . | nindent 6 }}
{{- end }}
{{- if .Values.enablePodAntiAffinity }}
---
# PodDisruptionBudget for the service-mirror pods of this target cluster.
# Only rendered when enablePodAntiAffinity is true, i.e. when the chart is
# expected to run more than one service-mirror replica.
kind: PodDisruptionBudget
apiVersion: policy/v1
metadata:
name: linkerd-service-mirror-{{.Values.targetClusterName}}
namespace: {{ .Release.Namespace }}
labels:
component: linkerd-service-mirror
annotations:
{{ include "partials.annotations.created-by" . }}
spec:
# Allow at most one matching pod to be unavailable at a time.
maxUnavailable: 1
# Same label pair the service-mirror Deployment uses in its selector, so
# the budget covers exactly the mirror pods for this target cluster.
selector:
matchLabels:
component: linkerd-service-mirror
mirror.linkerd.io/cluster-name: {{.Values.targetClusterName}}
{{- end}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
enablePodAntiAffinity: true
replicas: 3
6 changes: 6 additions & 0 deletions multicluster/charts/linkerd-multicluster-link/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ podLabels: {}
commonLabels: {}
# -- Toggle support for mirroring headless services
enableHeadlessServices: false
# -- Enables Pod Anti Affinity logic to balance the placement of replicas
# across hosts and zones for High Availability.
# Enable this only when you have multiple replicas of components.
enablePodAntiAffinity: false
gateway:
probe:
# -- The port used for liveness probing
Expand All @@ -21,6 +25,8 @@ logLevel: info
logFormat: plain
# -- Node selectors for the Service mirror pod
nodeSelector: {}
# -- Number of service mirror replicas to run
replicas: 1
# -- Resources for the Service mirror container
resources: {}
# -- Number of times update from the remote cluster is allowed to be requeued
Expand Down
14 changes: 13 additions & 1 deletion multicluster/cmd/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -579,9 +579,21 @@ func (hc *healthChecker) checkIfGatewayMirrorsHaveEndpoints(ctx context.Context,
continue
}

lease, err := hc.KubeAPIClient().CoordinationV1().Leases(multiclusterNs.Name).Get(ctx, fmt.Sprintf("service-mirror-write-%s", link.TargetClusterName), metav1.GetOptions{})
if err != nil {
errors = append(errors, fmt.Errorf("failed to get the service-mirror component Leases for target cluster %s: %w", link.TargetClusterName, err))
mateiidavid marked this conversation as resolved.
Show resolved Hide resolved
continue
}

// Build a simple lookup table to retrieve Lease object claimants.
mateiidavid marked this conversation as resolved.
Show resolved Hide resolved
// Metrics should only be pulled from claimants as they are the ones
// running probes.
leaders := make(map[string]struct{})
leaders[*lease.Spec.HolderIdentity] = struct{}{}

// Get and parse the gateway metrics so that we can extract liveness
// information.
gatewayMetrics := getGatewayMetrics(hc.KubeAPIClient(), pods.Items, wait)
gatewayMetrics := getGatewayMetrics(hc.KubeAPIClient(), pods.Items, leaders, wait)
if len(gatewayMetrics) != 1 {
errors = append(errors, fmt.Errorf("expected exactly one gateway metric for target cluster %s; got %d", link.TargetClusterName, len(gatewayMetrics)))
continue
Expand Down
21 changes: 19 additions & 2 deletions multicluster/cmd/gateways.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,21 @@ func newGatewaysCommand() *cobra.Command {
os.Exit(1)
}

leases, err := k8sAPI.CoordinationV1().Leases(multiclusterNs.Name).List(cmd.Context(), metav1.ListOptions{})
mateiidavid marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
fmt.Fprintf(os.Stderr, "failed to list pods in namespace %s: %s", multiclusterNs.Name, err)
os.Exit(1)
}
// Build a simple lookup table to retrieve Lease object claimants.
// Metrics should only be pulled from claimants as they are the ones
// running probes.
leaders := make(map[string]struct{})
for _, lease := range leases.Items {
leaders[*lease.Spec.HolderIdentity] = struct{}{}
}

var statuses []gatewayStatus
gatewayMetrics := getGatewayMetrics(k8sAPI, pods.Items, opts.wait)
gatewayMetrics := getGatewayMetrics(k8sAPI, pods.Items, leaders, opts.wait)
for _, gateway := range gatewayMetrics {
if gateway.err != nil {
fmt.Fprintf(os.Stderr, "Failed to get gateway status for %s: %s\n", gateway.clusterName, gateway.err)
Expand Down Expand Up @@ -159,11 +172,15 @@ func newGatewaysCommand() *cobra.Command {
return cmd
}

func getGatewayMetrics(k8sAPI *k8s.KubernetesAPI, pods []corev1.Pod, wait time.Duration) []gatewayMetrics {
func getGatewayMetrics(k8sAPI *k8s.KubernetesAPI, pods []corev1.Pod, leaders map[string]struct{}, wait time.Duration) []gatewayMetrics {
var metrics []gatewayMetrics
metricsChan := make(chan gatewayMetrics)
var activeRoutines int32
for _, pod := range pods {
if _, found := leaders[pod.Name]; !found {
continue
}

atomic.AddInt32(&activeRoutines, 1)
go func(p corev1.Pod) {
defer atomic.AddInt32(&activeRoutines, -1)
Expand Down
13 changes: 13 additions & 0 deletions multicluster/cmd/link.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ type (
selector string
gatewayAddresses string
gatewayPort uint32
ha bool
}
)

Expand Down Expand Up @@ -255,6 +256,16 @@ A full list of configurable values can be found at https://github.com/linkerd/li
return err
}

if opts.ha {
if valuesOverrides, err = charts.OverrideFromFile(
valuesOverrides,
static.Templates,
helmMulticlusterLinkDefaultChartName,
"values-ha.yaml",
); err != nil {
return err
}
}
serviceMirrorOut, err := renderServiceMirror(values, valuesOverrides, opts.namespace)
if err != nil {
return err
Expand Down Expand Up @@ -287,6 +298,7 @@ A full list of configurable values can be found at https://github.com/linkerd/li
cmd.Flags().StringVarP(&opts.selector, "selector", "l", opts.selector, "Selector (label query) to filter which services in the target cluster to mirror")
cmd.Flags().StringVar(&opts.gatewayAddresses, "gateway-addresses", opts.gatewayAddresses, "If specified, overwrites gateway addresses when gateway service is not type LoadBalancer (comma separated list)")
cmd.Flags().Uint32Var(&opts.gatewayPort, "gateway-port", opts.gatewayPort, "If specified, overwrites gateway port when gateway service is not type LoadBalancer")
cmd.Flags().BoolVar(&opts.ha, "ha", opts.ha, "Enable HA configuration for the service-mirror deployment (default false)")

pkgcmd.ConfigureNamespaceFlagCompletion(
cmd, []string{"namespace", "gateway-namespace"},
Expand Down Expand Up @@ -384,6 +396,7 @@ func newLinkOptionsWithDefault() (*linkOptions, error) {
selector: fmt.Sprintf("%s=%s", k8s.DefaultExportedServiceSelector, "true"),
gatewayAddresses: "",
gatewayPort: 0,
ha: false,
}, nil
}

Expand Down
4 changes: 0 additions & 4 deletions multicluster/cmd/service-mirror/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,6 @@ func Main(args []string) {
LeaseMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("service-mirror-write-%s", linkName),
Namespace: *namespace,
Labels: map[string]string{
"component": "linkerd-service-mirror",
"mirror.linkerd.io/cluster-name": linkName,
},
},
Client: k8sAPI.CoordinationV1(),
LockConfig: resourcelock.ResourceLockConfig{
Expand Down
2 changes: 1 addition & 1 deletion multicluster/values/values.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ type Values struct {
LogFormat string `json:"logFormat"`
ServiceMirrorRetryLimit uint32 `json:"serviceMirrorRetryLimit"`
ServiceMirrorUID int64 `json:"serviceMirrorUID"`
Replicas uint32 `json:"replicas"`
RemoteMirrorServiceAccount bool `json:"remoteMirrorServiceAccount"`
RemoteMirrorServiceAccountName string `json:"remoteMirrorServiceAccountName"`
TargetClusterName string `json:"targetClusterName"`
Expand All @@ -40,7 +41,6 @@ type Values struct {
// Gateway contains all options related to the Gateway Service
type Gateway struct {
Enabled bool `json:"enabled"`
Replicas uint32 `json:"replicas"`
alpeb marked this conversation as resolved.
Show resolved Hide resolved
Name string `json:"name"`
Port uint32 `json:"port"`
NodePort uint32 `json:"nodePort"`
Expand Down