Skip to content

Commit

Permalink
metrics: Add PrometheusRule for namespaced metrics
Browse files Browse the repository at this point in the history
PrometheusRules allow recording pre-defined queries. Use
`sriov_kubepoddevice` metric to add `pod|namespace` pair
to the sriov metrics.

Feature is enabled via the `METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULE`
environment variable.

Signed-off-by: Andrea Panattoni <[email protected]>
  • Loading branch information
zeeke committed Aug 20, 2024
1 parent ac35440 commit 9493e50
Show file tree
Hide file tree
Showing 12 changed files with 265 additions and 3 deletions.
38 changes: 38 additions & 0 deletions bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
---
{{ if and .IsPrometheusOperatorInstalled .PrometheusOperatorDeployRules }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: sriov-vf-rules
namespace: {{.Namespace}}
spec:
groups:
- name: sriov-network-metrics-operator.rules
interval: 30s
rules:
- expr: |
sriov_vf_tx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_packets
- expr: |
sriov_vf_rx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_packets
- expr: |
sriov_vf_tx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_bytes
- expr: |
sriov_vf_rx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_bytes
- expr: |
sriov_vf_tx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_dropped
- expr: |
sriov_vf_rx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_dropped
- expr: |
sriov_vf_rx_broadcast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_broadcast
- expr: |
sriov_vf_rx_multicast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_multicast
{{ end }}

1 change: 1 addition & 0 deletions controllers/sriovoperatorconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ func (r *SriovOperatorConfigReconciler) syncMetricsExporter(ctx context.Context,
data.Data["IsOpenshift"] = r.PlatformHelper.IsOpenshiftCluster()

data.Data["IsPrometheusOperatorInstalled"] = strings.ToLower(os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED")) == trueString
data.Data["PrometheusOperatorDeployRules"] = strings.ToLower(os.Getenv("METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES")) == trueString
data.Data["PrometheusOperatorServiceAccount"] = os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT")
data.Data["PrometheusOperatorNamespace"] = os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE")

Expand Down
10 changes: 10 additions & 0 deletions controllers/sriovoperatorconfig_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,8 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() {
It("should deploy extra configuration when the Prometheus operator is installed", func() {
DeferCleanup(os.Setenv, "METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED", os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED"))
os.Setenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED", "true")
DeferCleanup(os.Setenv, "METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES", os.Getenv("METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES"))
os.Setenv("METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES", "true")

err := util.WaitForNamespacedObject(&rbacv1.Role{}, k8sClient, testNamespace, "prometheus-k8s", util.RetryInterval, util.APITimeout)
Expect(err).ToNot(HaveOccurred())
Expand All @@ -382,6 +384,14 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() {
Version: "v1",
},
client.ObjectKey{Namespace: testNamespace, Name: "sriov-network-metrics-exporter"})

assertResourceExists(
schema.GroupVersionKind{
Group: "monitoring.coreos.com",
Kind: "PrometheusRule",
Version: "v1",
},
client.ObjectKey{Namespace: testNamespace, Name: "sriov-vf-rules"})
})
})
})
Expand Down
2 changes: 2 additions & 0 deletions deploy/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ spec:
value: $METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED
value: "$METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED"
- name: METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES
value: "$METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES"
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT
value: $METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE
Expand Down
1 change: 1 addition & 0 deletions deploy/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ rules:
- monitoring.coreos.com
resources:
- servicemonitors
- prometheusrules
verbs:
- get
- create
Expand Down
1 change: 1 addition & 0 deletions deployment/sriov-network-operator-chart/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ We have introduced the following Chart parameters.
| `operator.metricsExporter.prometheusOperator.enabled` | bool | false | Wheter the operator shoud configure Prometheus resources or not (e.g. `ServiceMonitors`). |
| `operator.metricsExporter.prometheusOperator.serviceAccount` | string | `prometheus-k8s` | The service account used by the Prometheus Operator. This is used to give Prometheus the permission to list resource in the SR-IOV operator namespace |
| `operator.metricsExporter.prometheusOperator.namespace` | string | `monitoring` | The namespace where the Prometheus Operator is installed. Setting this variable makes the operator deploy `monitoring.coreos.com` resources. |
| `operator.metricsExporter.prometheusOperator.deployRules` | boold | false | Whether the operator should deploy `PrometheusRules` to scrape namesapced version of metrics. |

#### Admission Controllers parameters

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ spec:
{{- if .Values.operator.metricsExporter.prometheusOperator.enabled }}
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED
value: {{ .Values.operator.metricsExporter.prometheusOperator.enabled | quote}}
- name: METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES
value: {{ .Values.operator.metricsExporter.prometheusOperator.deployRules | quote}}
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT
value: {{ .Values.operator.metricsExporter.prometheusOperator.serviceAccount }}
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ rules:
- monitoring.coreos.com
resources:
- servicemonitors
- prometheusrules
verbs:
- get
- create
Expand Down
1 change: 1 addition & 0 deletions deployment/sriov-network-operator-chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ operator:
enabled: false
serviceAccount: "prometheus-k8s"
namespace: "monitoring"
deployRules: false
admissionControllers:
enabled: false
certificates:
Expand Down
1 change: 1 addition & 0 deletions hack/run-e2e-conformance-virtual-ocp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ export DEV_MODE=TRUE
export CLUSTER_HAS_EMULATED_PF=TRUE
export OPERATOR_LEADER_ELECTION_ENABLE=true
export METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED=true
export METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULE=true
export METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT=${METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT:-"prometheus-k8s"}
export METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE=${METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE:-"openshfit-monitoring"}

Expand Down
68 changes: 65 additions & 3 deletions test/conformance/tests/test_exporter_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ package tests

import (
"context"
"encoding/json"
"fmt"
"net/url"
"strings"

sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/cluster"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/discovery"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/namespaces"
Expand All @@ -13,6 +16,7 @@ import (

dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"github.com/prometheus/common/model"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -22,6 +26,8 @@ import (
)

var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
var node string
var nic *sriovv1.InterfaceExt

BeforeAll(func() {
if cluster.VirtualCluster() {
Expand All @@ -48,13 +54,11 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
Expect(err).ToNot(HaveOccurred())

WaitForSRIOVStable()
})

It("collects metrics regarding receiving traffic via VF", func() {
sriovInfos, err := cluster.DiscoverSriov(clients, operatorNamespace)
Expect(err).ToNot(HaveOccurred())

node, nic, err := sriovInfos.FindOneSriovNodeAndDevice()
node, nic, err = sriovInfos.FindOneSriovNodeAndDevice()
Expect(err).ToNot(HaveOccurred())
By("Using device " + nic.Name + " on node " + node)

Expand All @@ -65,7 +69,13 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
Expect(err).ToNot(HaveOccurred())
waitForNetAttachDef("test-me-network", namespaces.Test)

DeferCleanup(namespaces.Clean, operatorNamespace, namespaces.Test, clients, discovery.Enabled())
})

It("collects metrics regarding receiving traffic via VF", func() {

pod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)

ips, err := network.GetSriovNicIPs(pod, "net1")
Expect(err).ToNot(HaveOccurred())
Expand All @@ -88,6 +98,28 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
Expect(finalRxPackets).Should(BeNumerically(">", initialRxPackets))
})

It("PrometheusRule should provide namespaced metrics", func() {
pod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)

namespacedMetricNames := []string{
"network:sriov_vf_rx_bytes",
"network:sriov_vf_tx_bytes",
"network:sriov_vf_rx_packets",
"network:sriov_vf_tx_packets",
"network:sriov_vf_rx_dropped",
"network:sriov_vf_tx_dropped",
"network:sriov_vf_rx_broadcast",
"network:sriov_vf_rx_multicast",
}

Eventually(func(g Gomega) {
for _, metricName := range namespacedMetricNames {
values := runPromQLQuery(fmt.Sprintf(`%s{namespace="%s",pod="%s"}`, metricName, pod.Namespace, pod.Name))
g.Expect(values).ToNot(BeEmpty(), "no value for metric %s", metricName)
}
}, "40s", "1s").Should(Succeed())
})
})

func getMetricsForNode(nodeName string) map[string]*dto.MetricFamily {
Expand Down Expand Up @@ -185,3 +217,33 @@ func areLabelsMatching(labels []*dto.LabelPair, labelsToMatch map[string]string)

return true
}

func runPromQLQuery(query string) model.Vector {
prometheusPods, err := clients.Pods("").List(context.Background(), metav1.ListOptions{
LabelSelector: "app.kubernetes.io/component=prometheus",
})
ExpectWithOffset(1, err).ToNot(HaveOccurred())
ExpectWithOffset(1, prometheusPods.Items).ToNot(HaveLen(0), "At least one Prometheus operator pod expected")

prometheusPod := prometheusPods.Items[0]

url := fmt.Sprintf("localhost:9090/api/v1/query?%s", (url.Values{"query": []string{query}}).Encode())
command := []string{"curl", url}
stdout, stderr, err := pod.ExecCommand(clients, &prometheusPod, command...)
ExpectWithOffset(1, err).ToNot(HaveOccurred(),
"promQL query failed: [%s/%s] command: [%v]\nstdout: %s\nstderr: %s", prometheusPod.Namespace, prometheusPod.Name, command, stdout, stderr)

result := struct {
Status string `json:"status"`
Data struct {
ResultType string `json:"resultType"`
Result model.Vector `json:"result"`
} `json:"data"`
}{}

json.Unmarshal([]byte(stdout), &result)
ExpectWithOffset(1, err).ToNot(HaveOccurred())
ExpectWithOffset(1, result.Status).To(Equal("success"), "cURL for [%s] failed: %s", url, stdout)

return result.Data.Result
}
142 changes: 142 additions & 0 deletions test/util/crds/monitoring.coreos.com_prometheusrules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.15.0
operator.prometheus.io/version: 0.75.1
name: prometheusrules.monitoring.coreos.com
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: PrometheusRule
listKind: PrometheusRuleList
plural: prometheusrules
shortNames:
- promrule
singular: prometheusrule
scope: Namespaced
versions:
- name: v1
schema:
openAPIV3Schema:
description: |-
The `PrometheusRule` custom resource definition (CRD) defines [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) and [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) rules to be evaluated by `Prometheus` or `ThanosRuler` objects.
`Prometheus` and `ThanosRuler` objects select `PrometheusRule` objects using label and namespace selectors.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Specification of desired alerting rule definitions for Prometheus.
properties:
groups:
description: Content of Prometheus rule file
items:
description: RuleGroup is a list of sequentially evaluated recording
and alerting rules.
properties:
interval:
description: Interval determines how often rules in the group
are evaluated.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
limit:
description: |-
Limit the number of alerts an alerting rule and series a recording
rule can produce.
Limit is supported starting with Prometheus >= 2.31 and Thanos Ruler >= 0.24.
type: integer
name:
description: Name of the rule group.
minLength: 1
type: string
partial_response_strategy:
description: |-
PartialResponseStrategy is only used by ThanosRuler and will
be ignored by Prometheus instances.
More info: https://github.com/thanos-io/thanos/blob/main/docs/components/rule.md#partial-response
pattern: ^(?i)(abort|warn)?$
type: string
rules:
description: List of alerting and recording rules.
items:
description: |-
Rule describes an alerting or recording rule
See Prometheus documentation: [alerting](https://www.prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) or [recording](https://www.prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules) rule
properties:
alert:
description: |-
Name of the alert. Must be a valid label value.
Only one of `record` and `alert` must be set.
type: string
annotations:
additionalProperties:
type: string
description: |-
Annotations to add to each alert.
Only valid for alerting rules.
type: object
expr:
anyOf:
- type: integer
- type: string
description: PromQL expression to evaluate.
x-kubernetes-int-or-string: true
for:
description: Alerts are considered firing once they have
been returned for this long.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
keep_firing_for:
description: KeepFiringFor defines how long an alert will
continue firing after the condition that triggered it
has cleared.
minLength: 1
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
labels:
additionalProperties:
type: string
description: Labels to add or overwrite.
type: object
record:
description: |-
Name of the time series to output to. Must be a valid metric name.
Only one of `record` and `alert` must be set.
type: string
required:
- expr
type: object
type: array
required:
- name
type: object
type: array
x-kubernetes-list-map-keys:
- name
x-kubernetes-list-type: map
type: object
required:
- spec
type: object
served: true
storage: true

0 comments on commit 9493e50

Please sign in to comment.