From 3eced4d3bf85bef6aaa7d29da90a63b0f44fb0a8 Mon Sep 17 00:00:00 2001 From: Andrea Panattoni Date: Wed, 10 Jul 2024 16:09:20 +0200 Subject: [PATCH] metrics: Add PrometheusRule for namespaced metrics PrometheusRules allow recording pre-defined queries. Use `sriov_kubepoddevice` metric to add `pod|namespace` pair to the sriov metrics. Signed-off-by: Andrea Panattoni --- .../metrics-prometheus-rule.yaml | 38 +++++ deploy/role.yaml | 1 + .../templates/role.yaml | 1 + .../tests/test_exporter_metrics.go | 99 +++++++++++- ...monitoring.coreos.com_prometheusrules.yaml | 142 ++++++++++++++++++ 5 files changed, 275 insertions(+), 6 deletions(-) create mode 100644 bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml create mode 100644 test/util/crds/monitoring.coreos.com_prometheusrules.yaml diff --git a/bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml b/bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml new file mode 100644 index 000000000..5f4fe6903 --- /dev/null +++ b/bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml @@ -0,0 +1,38 @@ +--- +{{ if .IsPrometheusOperatorInstalled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: sriov-vf-rules + namespace: {{.Namespace}} +spec: + groups: + - name: sriov-network-metrics-operator.rules + interval: 10s + rules: + - expr: | + sriov_vf_tx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_tx_packets + - expr: | + sriov_vf_rx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_rx_packets + - expr: | + sriov_vf_tx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_tx_bytes + - expr: | + sriov_vf_rx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_rx_bytes + - expr: | + sriov_vf_tx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_tx_dropped + - expr: | + sriov_vf_rx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_rx_dropped + - expr: | + sriov_vf_rx_broadcast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_rx_broadcast + - expr: | + sriov_vf_rx_multicast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_rx_multicast +{{ end }} + diff --git a/deploy/role.yaml b/deploy/role.yaml index a24f13729..d03c47e21 100644 --- a/deploy/role.yaml +++ b/deploy/role.yaml @@ -29,6 +29,7 @@ rules: - monitoring.coreos.com resources: - servicemonitors + - prometheusrules verbs: - get - create diff --git a/deployment/sriov-network-operator-chart/templates/role.yaml b/deployment/sriov-network-operator-chart/templates/role.yaml index 29cf80cce..28c5ff175 100644 --- a/deployment/sriov-network-operator-chart/templates/role.yaml +++ b/deployment/sriov-network-operator-chart/templates/role.yaml @@ -32,6 +32,7 @@ rules: - monitoring.coreos.com resources: - servicemonitors + - prometheusrules verbs: - get - create diff --git a/test/conformance/tests/test_exporter_metrics.go b/test/conformance/tests/test_exporter_metrics.go index 21fe81572..a22445300 100644 --- a/test/conformance/tests/test_exporter_metrics.go +++ b/test/conformance/tests/test_exporter_metrics.go @@ -2,18 +2,22 @@ package tests import ( "context" + "encoding/json" "fmt" + "net/url" "strings" - "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils" + sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/cluster" "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/discovery" "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/namespaces" "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/network" "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/pod" + runtimeclient "sigs.k8s.io/controller-runtime/pkg/client" dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -23,6 +27,8 @@ import ( ) var _ = Describe("[sriov] Metrics Exporter", Ordered, func() { + var node string + var nic *sriovv1.InterfaceExt BeforeAll(func() { if cluster.VirtualCluster() { @@ -44,18 +50,16 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() { By("Enabling `metricsExporter` feature flag") setFeatureFlag("metricsExporter", true) - By("Adding ") - err = utils.AddLabelToNamespace(context.Background(), operatorNamespace, "openshift.io/cluster-monitoring", "true", clients) + By("Adding monitoring label to " + operatorNamespace) + err = addLabelToNamespace(context.Background(), operatorNamespace, "openshift.io/cluster-monitoring", "true", clients) Expect(err).ToNot(HaveOccurred()) WaitForSRIOVStable() - }) - It("collects metrics regarding receiving traffic via VF", func() { sriovInfos, err := cluster.DiscoverSriov(clients, operatorNamespace) Expect(err).ToNot(HaveOccurred()) - node, nic, err := sriovInfos.FindOneSriovNodeAndDevice() + node, nic, err = sriovInfos.FindOneSriovNodeAndDevice() Expect(err).ToNot(HaveOccurred()) By("Using device " + nic.Name + " on node " + node) @@ -66,7 +70,13 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() { Expect(err).ToNot(HaveOccurred()) waitForNetAttachDef("test-me-network", namespaces.Test) + DeferCleanup(namespaces.Clean, operatorNamespace, namespaces.Test, clients, discovery.Enabled()) + }) + + It("collects metrics regarding receiving traffic via VF", func() { + pod := createTestPod(node, []string{"test-me-network"}) + DeferCleanup(namespaces.CleanPods, namespaces.Test, clients) ips, err := network.GetSriovNicIPs(pod, "net1") Expect(err).ToNot(HaveOccurred()) @@ -89,6 +99,28 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() { Expect(finalRxPackets).Should(BeNumerically(">", initialRxPackets+3)) }) + It("PrometheusRule should provide namespaced metrics", func() { + pod := createTestPod(node, []string{"test-me-network"}) + DeferCleanup(namespaces.CleanPods, namespaces.Test, clients) + + namespacedMetricNames := []string{ + "network:sriov_vf_rx_bytes", + "network:sriov_vf_tx_bytes", + "network:sriov_vf_rx_packets", + "network:sriov_vf_tx_packets", + "network:sriov_vf_rx_dropped", + "network:sriov_vf_tx_dropped", + "network:sriov_vf_rx_broadcast", + "network:sriov_vf_rx_multicast", + } + + Eventually(func(g Gomega) { + for _, metricName := range namespacedMetricNames { + values := runPromQLQuery(fmt.Sprintf(`%s{namespace="%s",pod="%s"}`, metricName, pod.Namespace, pod.Name)) + g.Expect(values).ToNot(BeEmpty(), "no value for metric %s", metricName) + } + }, "40s", "1s").Should(Succeed()) + }) }) func getMetricsForNode(nodeName string) map[string]*dto.MetricFamily { @@ -186,3 +218,58 @@ func areLabelsMatching(labels []*dto.LabelPair, labelsToMatch map[string]string) return true } + +func runPromQLQuery(query string) model.Vector { + prometheusPods, err := clients.Pods("").List(context.Background(), metav1.ListOptions{ + LabelSelector: "app.kubernetes.io/component=prometheus", + }) + ExpectWithOffset(1, err).ToNot(HaveOccurred()) + ExpectWithOffset(1, prometheusPods.Items).ToNot(HaveLen(0), "At least one Prometheus operator pod expected") + + prometheusPod := prometheusPods.Items[0] + + url := fmt.Sprintf("localhost:9090/api/v1/query?%s", (url.Values{"query": []string{query}}).Encode()) + command := []string{"curl", url} + stdout, stderr, err := pod.ExecCommand(clients, &prometheusPod, command...) + ExpectWithOffset(1, err).ToNot(HaveOccurred(), + "promQL query failed: [%s/%s] command: [%v]\nstdout: %s\nstderr: %s", prometheusPod.Namespace, prometheusPod.Name, command, stdout, stderr) + + result := struct { + Status string `json:"status"` + Data struct { + ResultType string `json:"resultType"` + Result model.Vector `json:"result"` + } `json:"data"` + }{} + + json.Unmarshal([]byte(stdout), &result) + ExpectWithOffset(1, err).ToNot(HaveOccurred()) + ExpectWithOffset(1, result.Status).To(Equal("success"), "cURL for [%s] failed: %s", url, stdout) + + return result.Data.Result +} + +func addLabelToNamespace(ctx context.Context, namespaceName, key, value string) error { + ns := &corev1.Namespace{} + err := clients.Get(ctx, runtimeclient.ObjectKey{Name: namespaceName}, ns) + if err != nil { + return fmt.Errorf("failed to get namespace [%s]: %v", namespaceName, err) + } + + if ns.Labels == nil { + ns.Labels = make(map[string]string) + } + + if ns.Labels[key] == value { + return nil + } + + ns.Labels[key] = value + + err = clients.Update(ctx, ns) + if err != nil { + return fmt.Errorf("failed to update namespace [%s] with label [%s: %s]: %v", namespaceName, key, value, err) + } + + return nil +} diff --git a/test/util/crds/monitoring.coreos.com_prometheusrules.yaml b/test/util/crds/monitoring.coreos.com_prometheusrules.yaml new file mode 100644 index 000000000..6c16e8396 --- /dev/null +++ b/test/util/crds/monitoring.coreos.com_prometheusrules.yaml @@ -0,0 +1,142 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.15.0 + operator.prometheus.io/version: 0.75.1 + name: prometheusrules.monitoring.coreos.com +spec: + group: monitoring.coreos.com + names: + categories: + - prometheus-operator + kind: PrometheusRule + listKind: PrometheusRuleList + plural: prometheusrules + shortNames: + - promrule + singular: prometheusrule + scope: Namespaced + versions: + - name: v1 + schema: + openAPIV3Schema: + description: |- + The `PrometheusRule` custom resource definition (CRD) defines [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) and [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) rules to be evaluated by `Prometheus` or `ThanosRuler` objects. + + + `Prometheus` and `ThanosRuler` objects select `PrometheusRule` objects using label and namespace selectors. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Specification of desired alerting rule definitions for Prometheus. + properties: + groups: + description: Content of Prometheus rule file + items: + description: RuleGroup is a list of sequentially evaluated recording + and alerting rules. + properties: + interval: + description: Interval determines how often rules in the group + are evaluated. + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + limit: + description: |- + Limit the number of alerts an alerting rule and series a recording + rule can produce. + Limit is supported starting with Prometheus >= 2.31 and Thanos Ruler >= 0.24. + type: integer + name: + description: Name of the rule group. + minLength: 1 + type: string + partial_response_strategy: + description: |- + PartialResponseStrategy is only used by ThanosRuler and will + be ignored by Prometheus instances. + More info: https://github.com/thanos-io/thanos/blob/main/docs/components/rule.md#partial-response + pattern: ^(?i)(abort|warn)?$ + type: string + rules: + description: List of alerting and recording rules. + items: + description: |- + Rule describes an alerting or recording rule + See Prometheus documentation: [alerting](https://www.prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) or [recording](https://www.prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules) rule + properties: + alert: + description: |- + Name of the alert. Must be a valid label value. + Only one of `record` and `alert` must be set. + type: string + annotations: + additionalProperties: + type: string + description: |- + Annotations to add to each alert. + Only valid for alerting rules. + type: object + expr: + anyOf: + - type: integer + - type: string + description: PromQL expression to evaluate. + x-kubernetes-int-or-string: true + for: + description: Alerts are considered firing once they have + been returned for this long. + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + keep_firing_for: + description: KeepFiringFor defines how long an alert will + continue firing after the condition that triggered it + has cleared. + minLength: 1 + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + labels: + additionalProperties: + type: string + description: Labels to add or overwrite. + type: object + record: + description: |- + Name of the time series to output to. Must be a valid metric name. + Only one of `record` and `alert` must be set. + type: string + required: + - expr + type: object + type: array + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true