Skip to content

Commit

Permalink
chore: Additional upstream metrics Part1
Browse files Browse the repository at this point in the history
  • Loading branch information
jigisha620 committed Sep 17, 2024
1 parent 71f7aef commit 448ac20
Show file tree
Hide file tree
Showing 9 changed files with 74 additions and 6 deletions.
21 changes: 18 additions & 3 deletions pkg/controllers/metrics/pod/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,15 @@ var (
Objectives: metrics.SummaryObjectives(),
},
)
// podBoundDurationSeconds is a histogram, labelled with labelNames(), of the seconds
// between a pod's creation timestamp and the last transition time of its PodScheduled
// condition. It is observed while the pod is still Pending and PodScheduled is True.
// NOTE(review): no Buckets are configured, so client_golang falls back to
// prometheus.DefBuckets, whose largest finite bucket is 10s; slower binds are only
// visible in the +Inf bucket. Consider explicit buckets sized for scheduling latency.
podBoundDurationSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "karpenter",
Subsystem: metrics.PodSubsystem,
Name: "bound_duration_seconds",
Help: "The time from pod creation until the pod is bound.",
},
labelNames(),
)
)

// Controller for the resource
Expand All @@ -82,7 +91,7 @@ type Controller struct {
}

func init() {
crmetrics.Registry.MustRegister(podState, podStartupDurationSeconds)
crmetrics.Registry.MustRegister(podState, podStartupDurationSeconds, podBoundDurationSeconds)
}

func labelNames() []string {
Expand Down Expand Up @@ -132,13 +141,19 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco
Labels: labels,
},
})
c.recordPodStartupMetric(pod)
c.recordPodStartupMetric(pod, labels)
return reconcile.Result{}, nil
}

func (c *Controller) recordPodStartupMetric(pod *corev1.Pod) {
func (c *Controller) recordPodStartupMetric(pod *corev1.Pod, labels prometheus.Labels) {
key := client.ObjectKeyFromObject(pod).String()
if pod.Status.Phase == phasePending {
cond, ok := lo.Find(pod.Status.Conditions, func(c corev1.PodCondition) bool {
return c.Type == corev1.PodScheduled
})
if ok && cond.Status == corev1.ConditionTrue {
podBoundDurationSeconds.With(labels).Observe(cond.LastTransitionTime.Sub(pod.CreationTimestamp.Time).Seconds())
}
c.pendingPods.Insert(key)
return
}
Expand Down
14 changes: 14 additions & 0 deletions pkg/controllers/metrics/pod/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import (
"context"
"testing"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -84,6 +86,18 @@ var _ = Describe("Pod Metrics", func() {
})
Expect(found).To(BeTrue())
})
// Reconciling a Pending pod whose PodScheduled condition is True must emit a
// karpenter_pods_bound_duration_seconds sample labelled with the pod's name and namespace.
It("should update the pod bound_duration_seconds metric", func() {
	p := test.Pod()
	p.Status.Phase = corev1.PodPending
	// A True PodScheduled condition is what marks the pod as bound for this metric.
	p.Status.Conditions = []corev1.PodCondition{{Type: corev1.PodScheduled, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now()}}
	ExpectApplied(ctx, env.Client, p)
	ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p))
	_, found := FindMetricWithLabelValues("karpenter_pods_bound_duration_seconds", map[string]string{
		"name":      p.GetName(),
		"namespace": p.GetNamespace(),
	})
	Expect(found).To(BeTrue())
})
It("should delete the pod state metric on pod delete", func() {
p := test.Pod()
ExpectApplied(ctx, env.Client, p)
Expand Down
3 changes: 3 additions & 0 deletions pkg/controllers/node/termination/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,9 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile

return reconcile.Result{RequeueAfter: 1 * time.Second}, nil
}
NodesDrainedTotal.With(prometheus.Labels{
metrics.NodePoolLabel: node.Labels[v1.NodePoolLabelKey],
}).Inc()
// In order for Pods associated with PersistentVolumes to smoothly migrate from the terminating Node, we wait
// for VolumeAttachments of drain-able Pods to be cleaned up before terminating Node and removing its finalizer.
// However, if TerminationGracePeriod is configured for Node, and we are past that period, we will skip waiting.
Expand Down
12 changes: 11 additions & 1 deletion pkg/controllers/node/termination/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ import (
func init() {
crmetrics.Registry.MustRegister(
TerminationDurationSeconds,
NodeLifetimeDurationSeconds)
NodeLifetimeDurationSeconds,
NodesDrainedTotal)
}

const dayDuration = time.Hour * 24
Expand All @@ -44,6 +45,15 @@ var (
},
[]string{metrics.NodePoolLabel},
)
// NodesDrainedTotal counts node drains recorded by the termination controller,
// labelled with the value of the node's NodePool label.
// NOTE(review): the increment in finalize is not guarded by any "already drained"
// state visible here; if finalize runs again for the same node after the drain step
// has passed (for example on a later requeue), confirm the node is not counted twice.
NodesDrainedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metrics.Namespace,
Subsystem: metrics.NodeSubsystem,
Name: "drained_total",
Help: "The total number of nodes drained by Karpenter",
},
[]string{metrics.NodePoolLabel},
)
NodeLifetimeDurationSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: metrics.Namespace,
Expand Down
2 changes: 2 additions & 0 deletions pkg/controllers/node/termination/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ var _ = Describe("Termination", func() {
metrics.NodesTerminatedTotal.Reset()
termination.TerminationDurationSeconds.Reset()
termination.NodeLifetimeDurationSeconds.Reset()
termination.NodesDrainedTotal.Reset()
})

Context("Reconciliation", func() {
Expand Down Expand Up @@ -841,6 +842,7 @@ var _ = Describe("Termination", func() {
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectMetricCounterValue(termination.NodesDrainedTotal, 1, map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]})
ExpectObjectReconciled(ctx, env.Client, terminationController, node)

m, ok := FindMetricWithLabelValues("karpenter_nodes_terminated_total", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]})
Expand Down
2 changes: 2 additions & 0 deletions pkg/controllers/provisioning/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ func (p *Provisioner) GetPendingPods(ctx context.Context) ([]*corev1.Pod, error)
pods = lo.Reject(pods, func(po *corev1.Pod, _ int) bool {
if err := p.Validate(ctx, po); err != nil {
log.FromContext(ctx).WithValues("Pod", klog.KRef(po.Namespace, po.Name)).V(1).Info(fmt.Sprintf("ignoring pod, %s", err))
metrics.IgnoredPodTotal.Inc()
return true
}
return false
Expand Down Expand Up @@ -346,6 +347,7 @@ func (p *Provisioner) Schedule(ctx context.Context) (scheduler.Results, error) {
results := s.Solve(ctx, pods).TruncateInstanceTypes(scheduler.MaxInstanceTypes)
if len(results.NewNodeClaims) > 0 {
log.FromContext(ctx).WithValues("Pods", pretty.Slice(lo.Map(pods, func(p *corev1.Pod, _ int) string { return klog.KRef(p.Namespace, p.Name).String() }), 5), "duration", time.Since(start)).Info("found provisionable pod(s)")
scheduler.PodsNominatedTotal.With(prometheus.Labels{scheduler.ControllerLabel: injection.GetControllerName(ctx)}).Add(float64(len(pods)))
}
results.Record(ctx, p.recorder, p.cluster)
return results, nil
Expand Down
13 changes: 12 additions & 1 deletion pkg/controllers/provisioning/scheduling/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
)

func init() {
crmetrics.Registry.MustRegister(SchedulingDurationSeconds, QueueDepth)
crmetrics.Registry.MustRegister(SchedulingDurationSeconds, QueueDepth, PodsNominatedTotal)
}

const (
Expand Down Expand Up @@ -58,4 +58,15 @@ var (
schedulingIDLabel,
},
)
// PodsNominatedTotal counts pods handled by scheduling passes, labelled with the name
// of the controller that ran the pass.
// Provisioner.Schedule adds the number of pods handed to the solver, and only when the
// results contain at least one new NodeClaim.
PodsNominatedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metrics.Namespace,
Subsystem: schedulerSubsystem,
Name: "pods_nominated_total",
Help: "The number of pods that the scheduler has processed and made a decision for.",
},
[]string{
ControllerLabel,
},
)
)
4 changes: 4 additions & 0 deletions pkg/controllers/provisioning/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ import (
"testing"
"time"

schedulingMetrics "sigs.k8s.io/karpenter/pkg/controllers/provisioning/scheduling"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/samber/lo"
Expand Down Expand Up @@ -97,13 +99,15 @@ var _ = AfterEach(func() {
ExpectCleanedUp(ctx, env.Client)
cloudProvider.Reset()
cluster.Reset()
schedulingMetrics.PodsNominatedTotal.Reset()
})

var _ = Describe("Provisioning", func() {
It("should provision nodes", func() {
ExpectApplied(ctx, env.Client, test.NodePool())
pod := test.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
ExpectMetricCounterValue(schedulingMetrics.PodsNominatedTotal, 1, nil)
nodes := &corev1.NodeList{}
Expect(env.Client.List(ctx, nodes)).To(Succeed())
Expect(len(nodes.Items)).To(Equal(1))
Expand Down
9 changes: 8 additions & 1 deletion pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,16 @@ var (
NodePoolLabel,
},
)
// IgnoredPodTotal counts pods that the provisioner skipped because they failed
// validation when collecting pending pods.
// The visible call site only ever calls Inc(), and Prometheus reserves the "_total"
// suffix for counters, so the metric is declared as a Counter rather than a Gauge.
IgnoredPodTotal = prometheus.NewCounter(
	prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "ignored_pod_total",
		Help:      "Number of pods ignored during scheduling by Karpenter",
	},
)
)

func init() {
crmetrics.Registry.MustRegister(NodeClaimsCreatedTotal, NodeClaimsTerminatedTotal, NodeClaimsDisruptedTotal,
NodesCreatedTotal, NodesTerminatedTotal)
NodesCreatedTotal, NodesTerminatedTotal, IgnoredPodTotal)
}

0 comments on commit 448ac20

Please sign in to comment.