Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[metrics 4/x] Metrics exporter rules #732

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
---
{{ if and .IsPrometheusOperatorInstalled .PrometheusOperatorDeployRules }}
# PrometheusRule rendered only when both IsPrometheusOperatorInstalled and
# PrometheusOperatorDeployRules are true.
#
# Every rule below records a `network:sriov_vf_*` series by multiplying the raw
# `sriov_vf_*` metric with `sriov_kubepoddevice`, matched on the `pciAddr` label,
# and copying the `pod`, `namespace` and `dev_type` labels from
# `sriov_kubepoddevice` onto the result (group_left).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: sriov-vf-rules
namespace: {{.Namespace}}
spec:
groups:
- name: sriov-network-metrics-operator.rules
interval: 30s
rules:
- expr: |
sriov_vf_tx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_packets
- expr: |
sriov_vf_rx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_packets
- expr: |
sriov_vf_tx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_bytes
- expr: |
sriov_vf_rx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_bytes
- expr: |
sriov_vf_tx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_dropped
- expr: |
sriov_vf_rx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_dropped
- expr: |
sriov_vf_rx_broadcast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_broadcast
- expr: |
sriov_vf_rx_multicast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_multicast
{{ end }}

1 change: 1 addition & 0 deletions controllers/sriovoperatorconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ func (r *SriovOperatorConfigReconciler) syncMetricsExporter(ctx context.Context,
data.Data["IsOpenshift"] = r.PlatformHelper.IsOpenshiftCluster()

data.Data["IsPrometheusOperatorInstalled"] = strings.ToLower(os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED")) == trueString
data.Data["PrometheusOperatorDeployRules"] = strings.ToLower(os.Getenv("METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES")) == trueString
data.Data["PrometheusOperatorServiceAccount"] = os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT")
data.Data["PrometheusOperatorNamespace"] = os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE")

Expand Down
10 changes: 10 additions & 0 deletions controllers/sriovoperatorconfig_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,8 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() {
It("should deploy extra configuration when the Prometheus operator is installed", func() {
DeferCleanup(os.Setenv, "METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED", os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED"))
os.Setenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED", "true")
DeferCleanup(os.Setenv, "METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES", os.Getenv("METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES"))
os.Setenv("METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES", "true")

err := util.WaitForNamespacedObject(&rbacv1.Role{}, k8sClient, testNamespace, "prometheus-k8s", util.RetryInterval, util.APITimeout)
Expect(err).ToNot(HaveOccurred())
Expand All @@ -382,6 +384,14 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() {
Version: "v1",
},
client.ObjectKey{Namespace: testNamespace, Name: "sriov-network-metrics-exporter"})

assertResourceExists(
schema.GroupVersionKind{
Group: "monitoring.coreos.com",
Kind: "PrometheusRule",
Version: "v1",
},
client.ObjectKey{Namespace: testNamespace, Name: "sriov-vf-rules"})
})
})
})
Expand Down
2 changes: 2 additions & 0 deletions deploy/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ spec:
value: $METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED
value: "$METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED"
- name: METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES
value: "$METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES"
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT
value: $METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE
Expand Down
3 changes: 3 additions & 0 deletions deploy/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,12 @@ rules:
- monitoring.coreos.com
resources:
- servicemonitors
- prometheusrules
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we support deletion of prometheus objects ?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For when sno is redeployed with prometheus disabled (e.g in helm chart)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good point. we have to support object deletion as well. I handle it in

verbs:
- get
- create
- update
- delete
- apiGroups:
- apps
resourceNames:
Expand Down
1 change: 1 addition & 0 deletions deployment/sriov-network-operator-chart/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ We have introduced the following Chart parameters.
| `operator.metricsExporter.prometheusOperator.enabled` | bool | false | Whether the operator should configure Prometheus resources or not (e.g. `ServiceMonitors`). |
| `operator.metricsExporter.prometheusOperator.serviceAccount` | string | `prometheus-k8s` | The service account used by the Prometheus Operator. This is used to give Prometheus the permission to list resources in the SR-IOV operator namespace. |
| `operator.metricsExporter.prometheusOperator.namespace` | string | `monitoring` | The namespace where the Prometheus Operator is installed. Setting this variable makes the operator deploy `monitoring.coreos.com` resources. |
| `operator.metricsExporter.prometheusOperator.deployRules` | bool | false | Whether the operator should deploy `PrometheusRules` that record namespaced versions of the metrics (labelled with the consuming pod and its namespace). |

#### Admission Controllers parameters

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ spec:
{{- if .Values.operator.metricsExporter.prometheusOperator.enabled }}
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED
value: {{ .Values.operator.metricsExporter.prometheusOperator.enabled | quote}}
- name: METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES
value: {{ .Values.operator.metricsExporter.prometheusOperator.deployRules | quote}}
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT
value: {{ .Values.operator.metricsExporter.prometheusOperator.serviceAccount }}
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE
Expand Down
3 changes: 3 additions & 0 deletions deployment/sriov-network-operator-chart/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,12 @@ rules:
- monitoring.coreos.com
resources:
- servicemonitors
- prometheusrules
verbs:
- get
- create
- update
- delete
- apiGroups:
- apps
resourceNames:
Expand Down
1 change: 1 addition & 0 deletions deployment/sriov-network-operator-chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ operator:
enabled: false
serviceAccount: "prometheus-k8s"
namespace: "monitoring"
deployRules: false
admissionControllers:
enabled: false
certificates:
Expand Down
1 change: 1 addition & 0 deletions hack/run-e2e-conformance-virtual-ocp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ export DEV_MODE=TRUE
export CLUSTER_HAS_EMULATED_PF=TRUE
export OPERATOR_LEADER_ELECTION_ENABLE=true
# Metrics exporter / Prometheus Operator integration.
# The variable names must match the ones the operator reads from its
# environment (see deploy/operator.yaml): the rules switch is
# METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES (plural).
export METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED=true
export METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES=true
# Both values below can be overridden by the caller's environment.
export METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT=${METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT:-"prometheus-k8s"}
export METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE=${METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE:-"openshift-monitoring"}

Expand Down
68 changes: 65 additions & 3 deletions test/conformance/tests/test_exporter_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ package tests

import (
"context"
"encoding/json"
"fmt"
"net/url"
"strings"

sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/cluster"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/discovery"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/namespaces"
Expand All @@ -13,6 +16,7 @@ import (

dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"github.com/prometheus/common/model"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -22,6 +26,8 @@ import (
)

var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
var node string
var nic *sriovv1.InterfaceExt

BeforeAll(func() {
if cluster.VirtualCluster() {
Expand All @@ -48,13 +54,11 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
Expect(err).ToNot(HaveOccurred())

WaitForSRIOVStable()
})

It("collects metrics regarding receiving traffic via VF", func() {
sriovInfos, err := cluster.DiscoverSriov(clients, operatorNamespace)
Expect(err).ToNot(HaveOccurred())

node, nic, err := sriovInfos.FindOneSriovNodeAndDevice()
node, nic, err = sriovInfos.FindOneSriovNodeAndDevice()
Expect(err).ToNot(HaveOccurred())
By("Using device " + nic.Name + " on node " + node)

Expand All @@ -65,7 +69,13 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
Expect(err).ToNot(HaveOccurred())
waitForNetAttachDef("test-me-network", namespaces.Test)

DeferCleanup(namespaces.Clean, operatorNamespace, namespaces.Test, clients, discovery.Enabled())
})

It("collects metrics regarding receiving traffic via VF", func() {

pod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)

ips, err := network.GetSriovNicIPs(pod, "net1")
Expect(err).ToNot(HaveOccurred())
Expand All @@ -88,6 +98,28 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
Expect(finalRxPackets).Should(BeNumerically(">", initialRxPackets))
})

It("PrometheusRule should provide namespaced metrics", func() {
// Start a pod attached to the test network on the selected node; the
// recorded series are looked up by this pod's namespace and name.
testPod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)

// Series names queried below, one per recorded `network:sriov_vf_*` metric.
recordedSeries := []string{
"network:sriov_vf_rx_bytes",
"network:sriov_vf_tx_bytes",
"network:sriov_vf_rx_packets",
"network:sriov_vf_tx_packets",
"network:sriov_vf_rx_dropped",
"network:sriov_vf_tx_dropped",
"network:sriov_vf_rx_broadcast",
"network:sriov_vf_rx_multicast",
}

// Retry until every series returns at least one sample for the test pod.
allSeriesPresent := func(g Gomega) {
for _, series := range recordedSeries {
selector := fmt.Sprintf(`%s{namespace="%s",pod="%s"}`, series, testPod.Namespace, testPod.Name)
samples := runPromQLQuery(selector)
g.Expect(samples).ToNot(BeEmpty(), "no value for metric %s", series)
}
}
Eventually(allSeriesPresent, "40s", "1s").Should(Succeed())
})
})

func getMetricsForNode(nodeName string) map[string]*dto.MetricFamily {
Expand Down Expand Up @@ -185,3 +217,33 @@ func areLabelsMatching(labels []*dto.LabelPair, labelsToMatch map[string]string)

return true
}

// runPromQLQuery runs the given PromQL expression as an instant query and
// returns the resulting vector.
//
// The query is sent with curl, executed inside the first pod returned when
// listing pods labelled app.kubernetes.io/component=prometheus (listed with an
// empty namespace argument, presumably meaning all namespaces), against
// localhost:9090/api/v1/query.
//
// Every failure (no pod found, command error, unparsable response, or a
// response status other than "success") is reported through
// ExpectWithOffset(1, ...).
func runPromQLQuery(query string) model.Vector {
prometheusPods, err := clients.Pods("").List(context.Background(), metav1.ListOptions{
LabelSelector: "app.kubernetes.io/component=prometheus",
})
ExpectWithOffset(1, err).ToNot(HaveOccurred())
ExpectWithOffset(1, prometheusPods.Items).ToNot(HaveLen(0), "At least one Prometheus operator pod expected")

prometheusPod := prometheusPods.Items[0]

// Named queryURL (not url) so the net/url package is not shadowed.
queryURL := fmt.Sprintf("localhost:9090/api/v1/query?%s", (url.Values{"query": []string{query}}).Encode())
command := []string{"curl", queryURL}
stdout, stderr, err := pod.ExecCommand(clients, &prometheusPod, command...)
ExpectWithOffset(1, err).ToNot(HaveOccurred(),
"promQL query failed: [%s/%s] command: [%v]\nstdout: %s\nstderr: %s", prometheusPod.Namespace, prometheusPod.Name, command, stdout, stderr)

result := struct {
Status string `json:"status"`
Data struct {
ResultType string `json:"resultType"`
Result model.Vector `json:"result"`
} `json:"data"`
}{}

// Capture the decoding error: the assertion below must check the outcome of
// Unmarshal, not the stale error left over from ExecCommand.
err = json.Unmarshal([]byte(stdout), &result)
ExpectWithOffset(1, err).ToNot(HaveOccurred(), "cannot parse response of cURL for [%s]: %s", queryURL, stdout)
ExpectWithOffset(1, result.Status).To(Equal("success"), "cURL for [%s] failed: %s", queryURL, stdout)

return result.Data.Result
}
8 changes: 8 additions & 0 deletions test/conformance/tests/test_sriov_operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,14 @@ var _ = Describe("[sriov] operator", func() {
g.Expect(err).ToNot(HaveOccurred())
}).Should(Succeed())
})

It("should remove ServiceMonitor when the feature is turned off", func() {
setFeatureFlag("metricsExporter", false)

// Succeeds once fetching the exporter's ServiceMonitor yields a NotFound error.
serviceMonitorIsGone := func(g Gomega) {
_, getErr := clients.ServiceMonitors(operatorNamespace).Get(
context.Background(), "sriov-network-metrics-exporter", metav1.GetOptions{})
g.Expect(k8serrors.IsNotFound(getErr)).To(BeTrue())
}
Eventually(serviceMonitorIsGone).Should(Succeed())
})
})
})

Expand Down
Loading
Loading