Skip to content

Commit

Permalink
metrics: Add node label to sriov_* metrics
Browse files Browse the repository at this point in the history
It might happen that two SR-IOV pods, deployed on different node, are using devices
with the same PCI address. In such cases, the query suggested [1] by the sriov-network-metrics-exporter produces the error:

```

Error loading values found duplicate series for the match group {pciAddr="0000:3b:02.4"} on the right hand-side of the operation:
    [
        {
            __name__="sriov_kubepoddevice",
            container="test",
            dev_type="openshift.io/intelnetdevice",
            endpoint="sriov-network-metrics",
            instance="10.1.98.60:9110",
            job="sriov-network-metrics-exporter-service",
            namespace="cnf-4916",
            pciAddr="0000:3b:02.4",
            pod="pod-cnfdr22.telco5g.eng.rdu2.redhat.com",
            prometheus="openshift-monitoring/k8s",
            service="sriov-network-metrics-exporter-service"
        }, {
            __name__="sriov_kubepoddevice",
            container="test",
            dev_type="openshift.io/intelnetdevice",
            endpoint="sriov-network-metrics",
            instance="10.1.98.230:9110",
            job="sriov-network-metrics-exporter-service",
            namespace="cnf-4916",
            pciAddr="0000:3b:02.4",
            pod="pod-dhcp-98-230.telco5g.eng.rdu2.redhat.com",
            prometheus="openshift-monitoring/k8s",
            service="sriov-network-metrics-exporter-service"
        }
    ];many-to-many matching not allowed: matching labels must be unique on one side
```

Configure the ServiceMonitor resource to add a `node` label to all metrics.
The right query to get metrics, as updated in the PrometheusRule, will be:

```
sriov_vf_tx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
```

Also remove `pod`,  `namespace` and `container` label from the `sriov_vf_*` metrics, as they were
wrongly set to `sriov-network-metrics-exporter-zj2n9`, `openshift-sriov-network-operator`, `kube-rbac-proxy`

[1] https://github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/blob/0f6a784f377ede87b95f31e569116ceb9775b5b9/README.md?plain=1#L38

Signed-off-by: Andrea Panattoni <[email protected]>
  • Loading branch information
zeeke committed Sep 22, 2024
1 parent f17bb2a commit 3ff1b85
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 31 deletions.
16 changes: 8 additions & 8 deletions bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,28 @@ spec:
interval: 30s
rules:
- expr: |
sriov_vf_tx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_tx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_packets
- expr: |
sriov_vf_rx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_rx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_packets
- expr: |
sriov_vf_tx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_tx_bytes * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_bytes
- expr: |
sriov_vf_rx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_rx_bytes * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_bytes
- expr: |
sriov_vf_tx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_tx_dropped * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_dropped
- expr: |
sriov_vf_rx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_rx_dropped * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_dropped
- expr: |
sriov_vf_rx_broadcast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_rx_broadcast * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_broadcast
- expr: |
sriov_vf_rx_multicast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_rx_multicast * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_multicast
{{ end }}

11 changes: 11 additions & 0 deletions bindata/manifests/metrics-exporter/metrics-prometheus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,17 @@ spec:
bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token"
scheme: "https"
honorLabels: true
relabelings:
- action: replace
sourceLabels:
- __meta_kubernetes_endpoint_node_name
targetLabel: node
- action: labeldrop
regex: pod
- action: labeldrop
regex: container
- action: labeldrop
regex: namespace
tlsConfig:
serverName: sriov-network-metrics-exporter-service.{{.Namespace}}.svc
caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
Expand Down
95 changes: 72 additions & 23 deletions test/conformance/tests/test_exporter_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,18 @@ import (
"github.com/prometheus/common/model"

corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
var _ = Describe("[sriov] Metrics Exporter", Ordered, ContinueOnFailure, func() {
var node string
var nic *sriovv1.InterfaceExt

BeforeAll(func() {
if cluster.VirtualCluster() {
Skip("IGB driver does not support VF statistics")
}

err := namespaces.Create(namespaces.Test, clients)
Expect(err).ToNot(HaveOccurred())

Expand Down Expand Up @@ -73,6 +70,9 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
})

It("collects metrics regarding receiving traffic via VF", func() {
if cluster.VirtualCluster() {
Skip("IGB driver does not support VF statistics")
}

pod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)
Expand All @@ -98,27 +98,76 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
Expect(finalRxPackets).Should(BeNumerically(">", initialRxPackets))
})

It("PrometheusRule should provide namespaced metrics", func() {
pod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)
Context("When Prometheus operator is available", func() {
BeforeEach(func() {
_, err := clients.ServiceMonitors(operatorNamespace).List(context.Background(), metav1.ListOptions{})
if k8serrors.IsNotFound(err) {
Skip("Prometheus operator not available in the cluster")
}
})

namespacedMetricNames := []string{
"network:sriov_vf_rx_bytes",
"network:sriov_vf_tx_bytes",
"network:sriov_vf_rx_packets",
"network:sriov_vf_tx_packets",
"network:sriov_vf_rx_dropped",
"network:sriov_vf_tx_dropped",
"network:sriov_vf_rx_broadcast",
"network:sriov_vf_rx_multicast",
}
It("PrometheusRule should provide namespaced metrics", func() {
pod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)

namespacedMetricNames := []string{
"network:sriov_vf_rx_bytes",
"network:sriov_vf_tx_bytes",
"network:sriov_vf_rx_packets",
"network:sriov_vf_tx_packets",
"network:sriov_vf_rx_dropped",
"network:sriov_vf_tx_dropped",
"network:sriov_vf_rx_broadcast",
"network:sriov_vf_rx_multicast",
}

Eventually(func(g Gomega) {
for _, metricName := range namespacedMetricNames {
values := runPromQLQuery(fmt.Sprintf(`%s{namespace="%s",pod="%s"}`, metricName, pod.Namespace, pod.Name))
g.Expect(values).ToNot(BeEmpty(), "no value for metric %s", metricName)
Eventually(func(g Gomega) {
for _, metricName := range namespacedMetricNames {
values := runPromQLQuery(fmt.Sprintf(`%s{namespace="%s",pod="%s"}`, metricName, pod.Namespace, pod.Name))
g.Expect(values).ToNot(BeEmpty(), "no value for metric %s", metricName)
}
}, "90s", "1s").Should(Succeed())
})

It("Metrics should have the correct labels", func() {
pod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)

metricsName := []string{
"sriov_vf_rx_bytes",
"sriov_vf_tx_bytes",
"sriov_vf_rx_packets",
"sriov_vf_tx_packets",
"sriov_vf_rx_dropped",
"sriov_vf_tx_dropped",
"sriov_vf_rx_broadcast",
"sriov_vf_rx_multicast",
}
}, "40s", "1s").Should(Succeed())

Eventually(func(g Gomega) {
for _, metricName := range metricsName {
samples := runPromQLQuery(metricName)
g.Expect(samples).ToNot(BeEmpty(), "no value for metric %s", metricName)
g.Expect(samples[0].Metric).To(And(
HaveKey(model.LabelName("pciAddr")),
HaveKey(model.LabelName("node")),
HaveKey(model.LabelName("pf")),
HaveKey(model.LabelName("vf")),
))
}
}, "90s", "1s").Should(Succeed())

// sriov_kubepoddevice has a different sets of label than statistics metrics
samples := runPromQLQuery(fmt.Sprintf(`sriov_kubepoddevice{namespace="%s",pod="%s"}`, pod.Namespace, pod.Name))
Expect(samples).ToNot(BeEmpty(), "no value for metric sriov_kubepoddevice")
Expect(samples[0].Metric).To(And(
HaveKey(model.LabelName("pciAddr")),
HaveKeyWithValue(model.LabelName("node"), model.LabelValue(pod.Spec.NodeName)),
HaveKeyWithValue(model.LabelName("dev_type"), model.LabelValue("openshift.io/metricsResource")),
HaveKeyWithValue(model.LabelName("namespace"), model.LabelValue(pod.Namespace)),
HaveKeyWithValue(model.LabelName("pod"), model.LabelValue(pod.Name)),
))
})
})
})

Expand Down
20 changes: 20 additions & 0 deletions test/util/k8sreporter/reporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ import (

sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/namespaces"

monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
rbacv1 "k8s.io/api/rbac/v1"
)

func New(reportPath string) (*kniK8sReporter.KubernetesReporter, error) {
Expand All @@ -18,6 +21,17 @@ func New(reportPath string) (*kniK8sReporter.KubernetesReporter, error) {
if err != nil {
return err
}

err = monitoringv1.AddToScheme(s)
if err != nil {
return err
}

err = rbacv1.AddToScheme(s)
if err != nil {
return err
}

return nil
}

Expand All @@ -38,6 +52,8 @@ func New(reportPath string) (*kniK8sReporter.KubernetesReporter, error) {
return true
case multusNamespace != "" && ns == multusNamespace:
return true
case ns == "openshift-monitoring":
return true
}
return false
}
Expand All @@ -47,6 +63,10 @@ func New(reportPath string) (*kniK8sReporter.KubernetesReporter, error) {
{Cr: &sriovv1.SriovNetworkNodePolicyList{}},
{Cr: &sriovv1.SriovNetworkList{}},
{Cr: &sriovv1.SriovOperatorConfigList{}},
{Cr: &monitoringv1.ServiceMonitorList{}, Namespace: &operatorNamespace},
{Cr: &monitoringv1.PrometheusRuleList{}, Namespace: &operatorNamespace},
{Cr: &rbacv1.RoleList{}, Namespace: &operatorNamespace},
{Cr: &rbacv1.RoleBindingList{}, Namespace: &operatorNamespace},
}

err := os.Mkdir(reportPath, 0755)
Expand Down

0 comments on commit 3ff1b85

Please sign in to comment.