Skip to content

Commit

Permalink
chore: add custom metrics to sgController
Browse files Browse the repository at this point in the history
  • Loading branch information
Rdpaula committed Jul 15, 2024
1 parent 3f5230b commit cc6f5a9
Show file tree
Hide file tree
Showing 6 changed files with 250 additions and 18 deletions.
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ require (
github.com/onsi/ginkgo/v2 v2.11.0
github.com/onsi/gomega v1.27.10
github.com/pkg/errors v0.9.1
github.com/topfreegames/kubernetes-kops-operator v0.11.0-rc
github.com/topfreegames/kubernetes-kops-operator v0.13.1-alpha
go.uber.org/zap v1.25.0
k8s.io/api v0.28.1
k8s.io/apimachinery v0.28.1
Expand Down Expand Up @@ -95,8 +95,8 @@ require (
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/sftp v1.13.6 // indirect
github.com/prometheus/client_golang v1.16.0 // indirect
github.com/prometheus/client_model v0.4.0 // indirect
github.com/prometheus/client_golang v1.16.0
github.com/prometheus/client_model v0.4.0
github.com/prometheus/common v0.44.0 // indirect
github.com/prometheus/procfs v0.10.1 // indirect
github.com/spf13/afero v1.9.5 // indirect
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -360,8 +360,8 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/topfreegames/kubernetes-kops-operator v0.11.0-rc h1:L4WaAXgb+FM+D+oVJnJgxutOiwWTbCS+1uhpIhLzTqA=
github.com/topfreegames/kubernetes-kops-operator v0.11.0-rc/go.mod h1:PnaRBjLXOYvuTRDZEBppcE0xLAVvz8NH/5gwi//SUIc=
github.com/topfreegames/kubernetes-kops-operator v0.13.1-alpha h1:j2a0J3owdH5gB2c0AW42JcjA/Fi5a1L1mJBz0yAknE4=
github.com/topfreegames/kubernetes-kops-operator v0.13.1-alpha/go.mod h1:+FwqSNyOaJ3bRMI07c2DrVAJD+Ky8rIE1yaVsw5Du9E=
github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
Expand Down
56 changes: 48 additions & 8 deletions internal/controller/ec2.aws/securitygroup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import (
crossec2v1beta1 "github.com/crossplane-contrib/provider-aws/apis/ec2/v1beta1"
"github.com/go-logr/logr"
"github.com/hashicorp/go-multierror"
custommetrics "github.com/topfreegames/kubernetes-crossplane-infrastructure-operator/metrics"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
Expand Down Expand Up @@ -116,7 +117,7 @@ func DefaultReconciler(mgr manager.Manager) *SecurityGroupReconciler {
//+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;patch
//+kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;create;update

func (c *SecurityGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, rerr error) {
func (c *SecurityGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reconciliationErr error) {
r := &SecurityGroupReconciliation{
SecurityGroupReconciler: *c,
start: time.Now(),
Expand All @@ -141,25 +142,31 @@ func (c *SecurityGroupReconciler) Reconcile(ctx context.Context, req ctrl.Reques
r.log.Info(fmt.Sprintf("starting reconcile loop for %s", r.sg.ObjectMeta.GetName()))

defer func() {
environment := r.getEnvironment(ctx)

securityGroupHelper := r.sg.DeepCopy()
if err := r.Update(ctx, r.sg); err != nil {
r.Recorder.Eventf(r.sg, corev1.EventTypeWarning, "FailedToUpdate", "failed to update security group %s: %s", r.sg.Name, err)
if rerr == nil {
rerr = err
if reconciliationErr == nil {
reconciliationErr = err
}
}

if securityGroupHelper.ObjectMeta.DeletionTimestamp.IsZero() {
r.sg.Status = securityGroupHelper.Status
if err := r.Status().Update(ctx, r.sg); err != nil {
r.Recorder.Eventf(r.sg, corev1.EventTypeWarning, "FailedToUpdateStatus", "failed to update security group status %s: %s", r.sg.Name, err)
if rerr == nil {
rerr = err
if reconciliationErr == nil {
reconciliationErr = err
}
}
}

r.log.Info(fmt.Sprintf("finished reconcile loop for %s", r.sg.ObjectMeta.GetName()))
if reconciliationErr != nil {
custommetrics.ReconciliationConsecutiveErrorsTotal.WithLabelValues("securitygroup", r.sg.Name, environment).Inc()
} else {
custommetrics.ReconciliationConsecutiveErrorsTotal.WithLabelValues("securitygroup", r.sg.Name, environment).Set(0)
}
r.log.Info(fmt.Sprintf("finished reconcile loop for %s", r.sg.Name))
}()

err := r.retrieveInfraRefInfo(ctx)
Expand All @@ -181,7 +188,7 @@ func (r *SecurityGroupReconciliation) retrieveKCPNetworkInfo(ctx context.Context
Name: name,
Namespace: namespace,
}
if err := r.Client.Get(ctx, key, kcp); err != nil {
if err := r.Get(ctx, key, kcp); err != nil {
return nil, nil, nil, err
}

Expand Down Expand Up @@ -967,3 +974,36 @@ func (r *SecurityGroupReconciliation) addKCPFinalizer(ctx context.Context, kcp k
}
}
}

func (r *SecurityGroupReconciliation) getEnvironment(ctx context.Context) string {
ref := r.sg.Spec.InfrastructureRef[0]
var clusterObject clusterv1beta1.Cluster
switch ref.Kind {
case "KopsMachinePool":
kmp := kinfrastructurev1alpha1.KopsMachinePool{}
key := client.ObjectKey{
Name: ref.Name,
Namespace: ref.Namespace,
}
if err := r.Client.Get(ctx, key, &kmp); err != nil {
return ""
}
clusterName := &kmp.Spec.ClusterName
key = client.ObjectKey{
Name: *clusterName,
Namespace: kmp.ObjectMeta.Namespace,
}
if err := r.Client.Get(ctx, key, &clusterObject); err != nil {
return ""
}
case "KopsControlPlane":
key := client.ObjectKey{
Name: ref.Name,
Namespace: ref.Namespace,
}
if err := r.Client.Get(ctx, key, &clusterObject); err != nil {
return ""
}
}
return clusterObject.Labels["environment"]
}
175 changes: 175 additions & 0 deletions internal/controller/ec2.aws/securitygroup_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,11 @@ import (
crossec2v1beta1 "github.com/crossplane-contrib/provider-aws/apis/ec2/v1beta1"
crossplanev1 "github.com/crossplane/crossplane-runtime/apis/common/v1"
"github.com/hashicorp/go-multierror"
dto "github.com/prometheus/client_model/go"
oceanaws "github.com/spotinst/spotinst-sdk-go/service/ocean/providers/aws"
clustermeshv1alpha1 "github.com/topfreegames/kubernetes-crossplane-infrastructure-operator/api/clustermesh.infrastructure/v1alpha1"
securitygroupv1alpha2 "github.com/topfreegames/kubernetes-crossplane-infrastructure-operator/api/ec2.aws/v1alpha2"
custommetrics "github.com/topfreegames/kubernetes-crossplane-infrastructure-operator/metrics"
"github.com/topfreegames/kubernetes-crossplane-infrastructure-operator/pkg/aws/autoscaling"
fakeasg "github.com/topfreegames/kubernetes-crossplane-infrastructure-operator/pkg/aws/autoscaling/fake"
"github.com/topfreegames/kubernetes-crossplane-infrastructure-operator/pkg/aws/ec2"
Expand All @@ -50,6 +52,7 @@ import (
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/record"
kopsapi "k8s.io/kops/pkg/apis/kops"
"k8s.io/kops/util/pkg/vfs"
clusterv1beta1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util/conditions"
ctrl "sigs.k8s.io/controller-runtime"
Expand Down Expand Up @@ -304,6 +307,9 @@ var (
ObjectMeta: metav1.ObjectMeta{
Namespace: metav1.NamespaceDefault,
Name: "test-cluster",
Labels: map[string]string{
"environment": "testing",
},
},
Spec: clusterv1beta1.ClusterSpec{
ControlPlaneRef: &corev1.ObjectReference{
Expand Down Expand Up @@ -4463,6 +4469,175 @@ func TestSecurityGroupStatus(t *testing.T) {
}
}

func TestCustomMetrics(t *testing.T) {
testCases := []struct {
description string
expectedResult float64
reconciliationStatusResult []string
}{
{
description: "should be zero on successful reconciliation",
expectedResult: 0.0,
reconciliationStatusResult: []string{"succeed"},
},
{
description: "should get incremented on reconcile errors",
expectedResult: 1.0,
reconciliationStatusResult: []string{"fail"},
},
{
description: "should get incremented on consecutive reconcile errors",
expectedResult: 2.0,
reconciliationStatusResult: []string{"fail", "fail"},
},
{
description: "should be zero after a successful reconciliation",
expectedResult: 0.0,
reconciliationStatusResult: []string{"fail", "succeed"},
},
}
RegisterFailHandler(Fail)
g := NewWithT(t)

custommetrics.ReconciliationConsecutiveErrorsTotal.Reset()
vfs.Context.ResetMemfsContext(true)
err := clusterv1beta1.AddToScheme(scheme.Scheme)
Expect(err).NotTo(HaveOccurred())

err = crossec2v1beta1.SchemeBuilder.AddToScheme(scheme.Scheme)
Expect(err).NotTo(HaveOccurred())

err = securitygroupv1alpha2.AddToScheme(scheme.Scheme)
Expect(err).NotTo(HaveOccurred())

err = kinfrastructurev1alpha1.AddToScheme(scheme.Scheme)
Expect(err).NotTo(HaveOccurred())

err = kcontrolplanev1alpha1.AddToScheme(scheme.Scheme)
Expect(err).NotTo(HaveOccurred())

fakeASGClient := &fakeasg.MockAutoScalingClient{}
fakeASGClient.MockDescribeAutoScalingGroups = func(ctx context.Context, params *awsautoscaling.DescribeAutoScalingGroupsInput, optFns []func(*awsautoscaling.Options)) (*awsautoscaling.DescribeAutoScalingGroupsOutput, error) {
return &awsautoscaling.DescribeAutoScalingGroupsOutput{
AutoScalingGroups: []autoscalingtypes.AutoScalingGroup{
{
AutoScalingGroupName: aws.String("test-asg"),
LaunchTemplate: &autoscalingtypes.LaunchTemplateSpecification{
LaunchTemplateId: aws.String("lt-xxxx"),
Version: aws.String("1"),
},
},
},
}, nil
}
fakeEC2Client := &fakeec2.MockEC2Client{}
fakeEC2Client.MockDescribeSecurityGroups = func(ctx context.Context, params *awsec2.DescribeSecurityGroupsInput, optFns []func(*awsec2.Options)) (*awsec2.DescribeSecurityGroupsOutput, error) {
return &awsec2.DescribeSecurityGroupsOutput{}, nil
}
fakeEC2Client.MockDescribeVpcs = func(ctx context.Context, input *awsec2.DescribeVpcsInput, opts []func(*awsec2.Options)) (*awsec2.DescribeVpcsOutput, error) {
return &awsec2.DescribeVpcsOutput{
Vpcs: []ec2types.Vpc{
{
VpcId: aws.String("x.x.x.x"),
},
},
}, nil
}
fakeEC2Client.MockDescribeLaunchTemplateVersions = func(ctx context.Context, params *awsec2.DescribeLaunchTemplateVersionsInput, optFns []func(*awsec2.Options)) (*awsec2.DescribeLaunchTemplateVersionsOutput, error) {
return &awsec2.DescribeLaunchTemplateVersionsOutput{
LaunchTemplateVersions: []ec2types.LaunchTemplateVersion{
{
LaunchTemplateId: params.LaunchTemplateId,
LaunchTemplateData: &ec2types.ResponseLaunchTemplateData{
NetworkInterfaces: []ec2types.LaunchTemplateInstanceNetworkInterfaceSpecification{
{
Groups: []string{
"sg-xxxx",
},
},
},
},
},
},
}, nil
}
fakeEC2Client.MockCreateLaunchTemplateVersion = func(ctx context.Context, params *awsec2.CreateLaunchTemplateVersionInput, optFns []func(*awsec2.Options)) (*awsec2.CreateLaunchTemplateVersionOutput, error) {
return &awsec2.CreateLaunchTemplateVersionOutput{
LaunchTemplateVersion: &ec2types.LaunchTemplateVersion{
VersionNumber: aws.Int64(1),
},
}, nil
}
fakeEC2Client.MockDescribeLaunchTemplates = func(ctx context.Context, params *awsec2.DescribeLaunchTemplatesInput, optFns []func(*awsec2.Options)) (*awsec2.DescribeLaunchTemplatesOutput, error) {
return &awsec2.DescribeLaunchTemplatesOutput{
LaunchTemplates: []ec2types.LaunchTemplate{
{
LaunchTemplateId: aws.String("lt-xxxx"),
LaunchTemplateName: aws.String("test-launch-template"),
Tags: []ec2types.Tag{
{
Key: aws.String("tag:KubernetesCluster"),
Value: aws.String("test-cluster"),
},
{
Key: aws.String("tag:Name"),
Value: aws.String("test-launch-template"),
},
},
},
},
}, nil
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
custommetrics.ReconciliationConsecutiveErrorsTotal.Reset()

fakeClient := fake.NewClientBuilder().WithObjects(kcp, cluster, defaultSecret, karpenterKMP, sg, csg).WithScheme(scheme.Scheme).WithStatusSubresource(sg).Build()

reconciler := &SecurityGroupReconciler{
Client: fakeClient,
NewEC2ClientFactory: func(cfg aws.Config) ec2.EC2Client {
return fakeEC2Client
},
NewAutoScalingClientFactory: func(cfg aws.Config) autoscaling.AutoScalingClient {
return fakeASGClient
},
}

for _, reconciliationStatus := range tc.reconciliationStatusResult {
reconciler.Recorder = record.NewFakeRecorder(10)
// This is just to force a reconciliation error so we can increment the metric
if reconciliationStatus == "fail" {
fakeEC2Client.MockDescribeInstances = func(ctx context.Context, input *awsec2.DescribeInstancesInput, opts []func(*awsec2.Options)) (*awsec2.DescribeInstancesOutput, error) {
return nil, errors.New("error")
}
} else {
fakeEC2Client.MockDescribeInstances = func(ctx context.Context, input *awsec2.DescribeInstancesInput, opts []func(*awsec2.Options)) (*awsec2.DescribeInstancesOutput, error) {
return &awsec2.DescribeInstancesOutput{}, nil
}
}
_, _ = reconciler.Reconcile(context.TODO(), ctrl.Request{
NamespacedName: client.ObjectKey{
Name: sg.Name,
},
})

}

var reconciliationConsecutiveErrorsTotal dto.Metric
g.Expect(func() error {
g.Expect(custommetrics.ReconciliationConsecutiveErrorsTotal.WithLabelValues("securitygroup", sg.Name, "testing").Write(&reconciliationConsecutiveErrorsTotal)).To(Succeed())
metricValue := reconciliationConsecutiveErrorsTotal.GetGauge().GetValue()
if metricValue != tc.expectedResult {
return fmt.Errorf("metric value differs from expected: %f != %f", metricValue, tc.expectedResult)
}
return nil
}()).Should(Succeed())

})
}
}

func assertConditions(g *WithT, from conditions.Getter, conditions ...*clusterv1beta1.Condition) {
for _, condition := range conditions {
assertCondition(g, from, condition)
Expand Down
22 changes: 22 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package metrics

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

var (

// TODO: We could add the reason of the error as a label as well
ReconciliationConsecutiveErrorsTotal = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "custom_reconciliation_consecutive_errors_total",
Help: "Total number of consecutive reconciliation errors labeled by controller, cluster and environment",
}, []string{"controller", "sg_name", "cluster_environment"},
)
)

func init() {
// Register custom metrics with the global prometheus registry
metrics.Registry.MustRegister(ReconciliationConsecutiveErrorsTotal)
}
5 changes: 0 additions & 5 deletions pkg/kops/kops.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,7 @@ func GetRegionFromKopsControlPlane(ctx context.Context, kcp *kcontrolplanev1alph
return nil, err
}

if err != nil {
return nil, err
}

return region, nil

}

func awsConfigForCredential(ctx context.Context, region string, accessKey string, secretAccessKey string) (aws.Config, error) {
Expand Down

0 comments on commit cc6f5a9

Please sign in to comment.