Skip to content

Commit

Permalink
metrics: Add PrometheusRule for namespaced metrics
Browse files Browse the repository at this point in the history
PrometheusRules allow recording pre-defined queries. Use
`sriov_kubepoddevice` metric to add `pod|namespace` pair
to the sriov metrics.

Signed-off-by: Andrea Panattoni <[email protected]>
  • Loading branch information
zeeke committed Jul 19, 2024
1 parent d48fd61 commit 3eced4d
Show file tree
Hide file tree
Showing 5 changed files with 275 additions and 6 deletions.
38 changes: 38 additions & 0 deletions bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
---
{{ if .IsPrometheusOperatorInstalled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: sriov-vf-rules
namespace: {{.Namespace}}
spec:
groups:
- name: sriov-network-metrics-operator.rules
interval: 10s
rules:
- expr: |
sriov_vf_tx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_packets
- expr: |
sriov_vf_rx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_packets
- expr: |
sriov_vf_tx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_bytes
- expr: |
sriov_vf_rx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_bytes
- expr: |
sriov_vf_tx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_dropped
- expr: |
sriov_vf_rx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_dropped
- expr: |
sriov_vf_rx_broadcast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_broadcast
- expr: |
sriov_vf_rx_multicast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_multicast
{{ end }}

1 change: 1 addition & 0 deletions deploy/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ rules:
- monitoring.coreos.com
resources:
- servicemonitors
- prometheusrules
verbs:
- get
- create
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ rules:
- monitoring.coreos.com
resources:
- servicemonitors
- prometheusrules
verbs:
- get
- create
Expand Down
99 changes: 93 additions & 6 deletions test/conformance/tests/test_exporter_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,22 @@ package tests

import (
"context"
"encoding/json"
"fmt"
"net/url"
"strings"

"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils"
sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/cluster"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/discovery"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/namespaces"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/network"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/pod"
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"

dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"github.com/prometheus/common/model"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -23,6 +27,8 @@ import (
)

var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
var node string
var nic *sriovv1.InterfaceExt

BeforeAll(func() {
if cluster.VirtualCluster() {
Expand All @@ -44,18 +50,16 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
By("Enabling `metricsExporter` feature flag")
setFeatureFlag("metricsExporter", true)

By("Adding ")
err = utils.AddLabelToNamespace(context.Background(), operatorNamespace, "openshift.io/cluster-monitoring", "true", clients)
By("Adding monitoring label to " + operatorNamespace)
err = addLabelToNamespace(context.Background(), operatorNamespace, "openshift.io/cluster-monitoring", "true", clients)

Check failure on line 54 in test/conformance/tests/test_exporter_metrics.go

View workflow job for this annotation

GitHub Actions / Golangci-lint

too many arguments in call to addLabelToNamespace

Check failure on line 54 in test/conformance/tests/test_exporter_metrics.go

View workflow job for this annotation

GitHub Actions / Golangci-lint

too many arguments in call to addLabelToNamespace

Check failure on line 54 in test/conformance/tests/test_exporter_metrics.go

View workflow job for this annotation

GitHub Actions / build

too many arguments in call to addLabelToNamespace

Check failure on line 54 in test/conformance/tests/test_exporter_metrics.go

View workflow job for this annotation

GitHub Actions / build

too many arguments in call to addLabelToNamespace
Expect(err).ToNot(HaveOccurred())

WaitForSRIOVStable()
})

It("collects metrics regarding receiving traffic via VF", func() {
sriovInfos, err := cluster.DiscoverSriov(clients, operatorNamespace)
Expect(err).ToNot(HaveOccurred())

node, nic, err := sriovInfos.FindOneSriovNodeAndDevice()
node, nic, err = sriovInfos.FindOneSriovNodeAndDevice()
Expect(err).ToNot(HaveOccurred())
By("Using device " + nic.Name + " on node " + node)

Expand All @@ -66,7 +70,13 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
Expect(err).ToNot(HaveOccurred())
waitForNetAttachDef("test-me-network", namespaces.Test)

DeferCleanup(namespaces.Clean, operatorNamespace, namespaces.Test, clients, discovery.Enabled())
})

It("collects metrics regarding receiving traffic via VF", func() {

pod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)

ips, err := network.GetSriovNicIPs(pod, "net1")
Expect(err).ToNot(HaveOccurred())
Expand All @@ -89,6 +99,28 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
Expect(finalRxPackets).Should(BeNumerically(">", initialRxPackets+3))
})

It("PrometheusRule should provide namespaced metrics", func() {
pod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)

namespacedMetricNames := []string{
"network:sriov_vf_rx_bytes",
"network:sriov_vf_tx_bytes",
"network:sriov_vf_rx_packets",
"network:sriov_vf_tx_packets",
"network:sriov_vf_rx_dropped",
"network:sriov_vf_tx_dropped",
"network:sriov_vf_rx_broadcast",
"network:sriov_vf_rx_multicast",
}

Eventually(func(g Gomega) {
for _, metricName := range namespacedMetricNames {
values := runPromQLQuery(fmt.Sprintf(`%s{namespace="%s",pod="%s"}`, metricName, pod.Namespace, pod.Name))
g.Expect(values).ToNot(BeEmpty(), "no value for metric %s", metricName)
}
}, "40s", "1s").Should(Succeed())
})
})

func getMetricsForNode(nodeName string) map[string]*dto.MetricFamily {
Expand Down Expand Up @@ -186,3 +218,58 @@ func areLabelsMatching(labels []*dto.LabelPair, labelsToMatch map[string]string)

return true
}

func runPromQLQuery(query string) model.Vector {
prometheusPods, err := clients.Pods("").List(context.Background(), metav1.ListOptions{
LabelSelector: "app.kubernetes.io/component=prometheus",
})
ExpectWithOffset(1, err).ToNot(HaveOccurred())
ExpectWithOffset(1, prometheusPods.Items).ToNot(HaveLen(0), "At least one Prometheus operator pod expected")

prometheusPod := prometheusPods.Items[0]

url := fmt.Sprintf("localhost:9090/api/v1/query?%s", (url.Values{"query": []string{query}}).Encode())
command := []string{"curl", url}
stdout, stderr, err := pod.ExecCommand(clients, &prometheusPod, command...)
ExpectWithOffset(1, err).ToNot(HaveOccurred(),
"promQL query failed: [%s/%s] command: [%v]\nstdout: %s\nstderr: %s", prometheusPod.Namespace, prometheusPod.Name, command, stdout, stderr)

result := struct {
Status string `json:"status"`
Data struct {
ResultType string `json:"resultType"`
Result model.Vector `json:"result"`
} `json:"data"`
}{}

json.Unmarshal([]byte(stdout), &result)
ExpectWithOffset(1, err).ToNot(HaveOccurred())
ExpectWithOffset(1, result.Status).To(Equal("success"), "cURL for [%s] failed: %s", url, stdout)

return result.Data.Result
}

func addLabelToNamespace(ctx context.Context, namespaceName, key, value string) error {
ns := &corev1.Namespace{}
err := clients.Get(ctx, runtimeclient.ObjectKey{Name: namespaceName}, ns)
if err != nil {
return fmt.Errorf("failed to get namespace [%s]: %v", namespaceName, err)
}

if ns.Labels == nil {
ns.Labels = make(map[string]string)
}

if ns.Labels[key] == value {
return nil
}

ns.Labels[key] = value

err = clients.Update(ctx, ns)
if err != nil {
return fmt.Errorf("failed to update namespace [%s] with label [%s: %s]: %v", namespaceName, key, value, err)
}

return nil
}
142 changes: 142 additions & 0 deletions test/util/crds/monitoring.coreos.com_prometheusrules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.15.0
operator.prometheus.io/version: 0.75.1
name: prometheusrules.monitoring.coreos.com
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: PrometheusRule
listKind: PrometheusRuleList
plural: prometheusrules
shortNames:
- promrule
singular: prometheusrule
scope: Namespaced
versions:
- name: v1
schema:
openAPIV3Schema:
description: |-
The `PrometheusRule` custom resource definition (CRD) defines [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) and [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) rules to be evaluated by `Prometheus` or `ThanosRuler` objects.
`Prometheus` and `ThanosRuler` objects select `PrometheusRule` objects using label and namespace selectors.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Specification of desired alerting rule definitions for Prometheus.
properties:
groups:
description: Content of Prometheus rule file
items:
description: RuleGroup is a list of sequentially evaluated recording
and alerting rules.
properties:
interval:
description: Interval determines how often rules in the group
are evaluated.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
limit:
description: |-
Limit the number of alerts an alerting rule and series a recording
rule can produce.
Limit is supported starting with Prometheus >= 2.31 and Thanos Ruler >= 0.24.
type: integer
name:
description: Name of the rule group.
minLength: 1
type: string
partial_response_strategy:
description: |-
PartialResponseStrategy is only used by ThanosRuler and will
be ignored by Prometheus instances.
More info: https://github.com/thanos-io/thanos/blob/main/docs/components/rule.md#partial-response
pattern: ^(?i)(abort|warn)?$
type: string
rules:
description: List of alerting and recording rules.
items:
description: |-
Rule describes an alerting or recording rule
See Prometheus documentation: [alerting](https://www.prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) or [recording](https://www.prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules) rule
properties:
alert:
description: |-
Name of the alert. Must be a valid label value.
Only one of `record` and `alert` must be set.
type: string
annotations:
additionalProperties:
type: string
description: |-
Annotations to add to each alert.
Only valid for alerting rules.
type: object
expr:
anyOf:
- type: integer
- type: string
description: PromQL expression to evaluate.
x-kubernetes-int-or-string: true
for:
description: Alerts are considered firing once they have
been returned for this long.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
keep_firing_for:
description: KeepFiringFor defines how long an alert will
continue firing after the condition that triggered it
has cleared.
minLength: 1
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
labels:
additionalProperties:
type: string
description: Labels to add or overwrite.
type: object
record:
description: |-
Name of the time series to output to. Must be a valid metric name.
Only one of `record` and `alert` must be set.
type: string
required:
- expr
type: object
type: array
required:
- name
type: object
type: array
x-kubernetes-list-map-keys:
- name
x-kubernetes-list-type: map
type: object
required:
- spec
type: object
served: true
storage: true

0 comments on commit 3eced4d

Please sign in to comment.