From 42ff92f19ebba2a3a2609a308fc25f0dc4a44ca2 Mon Sep 17 00:00:00 2001 From: Israel Blancas Date: Tue, 25 Jul 2023 19:12:15 +0200 Subject: [PATCH] Create ServiceMonitors to gather metrics from the OTEL Operands (#1874) * Allow the creation of ServiceMonitors to gather metrics from the OpenTelemetry Collector instances Signed-off-by: Israel Blancas * Add missing changelog Signed-off-by: Israel Blancas * Fix unprotected statement Signed-off-by: Israel Blancas * Fix lint issues Signed-off-by: Israel Blancas * Apply changes requested in code review Signed-off-by: Israel Blancas * Add missing generated files Signed-off-by: Israel Blancas * Change the way to enable the feature flag Signed-off-by: Israel Blancas * Change the way to enable the feature flag Signed-off-by: Israel Blancas * Fix merge * Fix enable feature flag * Change the name of the option and move the E2E tests to their own folder * Fix unit test * Fix docs * Fix CRD field * Fix CRD field * Add from version to feature gate Signed-off-by: Israel Blancas * Move the E2E tests to their own section for the CI Signed-off-by: Israel Blancas --------- Signed-off-by: Israel Blancas --- .../1768-gather-metrics-from-collectors.yaml | 16 ++ .github/workflows/e2e.yaml | 2 +- Makefile | 14 ++ apis/v1alpha1/opentelemetrycollector_types.go | 29 +++ apis/v1alpha1/zz_generated.deepcopy.go | 32 ++++ ...emetry-operator.clusterserviceversion.yaml | 24 +++ ...ntelemetry.io_opentelemetrycollectors.yaml | 13 ++ ...ntelemetry.io_opentelemetrycollectors.yaml | 13 ++ ...emetry-operator.clusterserviceversion.yaml | 12 ++ config/rbac/role.yaml | 12 ++ .../opentelemetrycollector_controller.go | 11 ++ docs/api.md | 61 +++++++ go.mod | 5 +- go.sum | 10 +- hack/install-prometheus-operator.sh | 7 + kuttl-test-prometheuscr.yaml | 20 +++ main.go | 2 + pkg/collector/reconcile/servicemonitor.go | 170 ++++++++++++++++++ .../reconcile/servicemonitor_test.go | 138 ++++++++++++++ pkg/collector/reconcile/suite_test.go | 11 +- pkg/collector/testdata/sm_crd.go | 38 ++++ pkg/featuregate/featuregate.go | 8 + pkg/naming/main.go | 5 + .../00-assert.yaml | 10 ++ .../00-install.yaml | 28 +++ 25 files changed, 683 insertions(+), 8 deletions(-) create mode 100755 .chloggen/1768-gather-metrics-from-collectors.yaml create mode 100755 hack/install-prometheus-operator.sh create mode 100644 kuttl-test-prometheuscr.yaml create mode 100644 pkg/collector/reconcile/servicemonitor.go create mode 100644 pkg/collector/reconcile/servicemonitor_test.go create mode 100644 pkg/collector/testdata/sm_crd.go create mode 100644 tests/e2e-prometheuscr/gather-metrics-from-operands/00-assert.yaml create mode 100644 tests/e2e-prometheuscr/gather-metrics-from-operands/00-install.yaml diff --git a/.chloggen/1768-gather-metrics-from-collectors.yaml b/.chloggen/1768-gather-metrics-from-collectors.yaml new file mode 100755 index 0000000000..27892054e7 --- /dev/null +++ b/.chloggen/1768-gather-metrics-from-collectors.yaml @@ -0,0 +1,16 @@ +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. operator, target allocator, github action) +component: operator + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Add the ability to the operator to create Service Monitors for the OpenTelemetry Collectors in order to gather the metrics they are generating + +# One or more tracking issues related to the change +issues: [1768] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index ac42217715..e790b3f2b1 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -29,10 +29,10 @@ jobs: - "1.27" group: - e2e e2e-upgrade + - e2e-prometheuscr - e2e-autoscale steps: - - name: Set up Go uses: actions/setup-go@v4 with: diff --git a/Makefile b/Makefile index f6585b5edc..553db2e2db 100644 --- a/Makefile +++ b/Makefile @@ -176,6 +176,11 @@ generate: controller-gen api-docs e2e: $(KUTTL) test +# end-to-end-test for PrometheusCR E2E tests +.PHONY: e2e-prometheuscr +e2e-prometheuscr: + $(KUTTL) test --config kuttl-test-prometheuscr.yaml + # end-to-end-test for testing upgrading .PHONY: e2e-upgrade e2e-upgrade: undeploy @@ -200,6 +205,11 @@ e2e-log-operator: prepare-e2e: kuttl set-image-controller container container-target-allocator container-operator-opamp-bridge start-kind cert-manager install-metrics-server load-image-all deploy TARGETALLOCATOR_IMG=$(TARGETALLOCATOR_IMG) SED_BIN="$(SED)" ./hack/modify-test-images.sh +.PHONY: enable-prometheus-feature-flag +enable-prometheus-feature-flag: + $(SED) -i "s#--feature-gates=+operator.autoinstrumentation.go#--feature-gates=+operator.autoinstrumentation.go,+operator.observability.prometheus#g" config/default/manager_auth_proxy_patch.yaml + + .PHONY: scorecard-tests scorecard-tests: operator-sdk $(OPERATOR_SDK) scorecard -w=5m bundle || (echo "scorecard test failed" && exit 1) @@ -238,6 +248,10 @@ endif install-metrics-server: ./hack/install-metrics-server.sh +.PHONY: install-prometheus-operator +install-prometheus-operator: + ./hack/install-prometheus-operator.sh + .PHONY: load-image-all load-image-all: load-image-operator load-image-target-allocator load-image-operator-opamp-bridge diff --git a/apis/v1alpha1/opentelemetrycollector_types.go b/apis/v1alpha1/opentelemetrycollector_types.go index 7c14e54b0e..443dd3f216 100644 --- a/apis/v1alpha1/opentelemetrycollector_types.go +++ b/apis/v1alpha1/opentelemetrycollector_types.go @@ -190,6 +190,14 @@ type OpenTelemetryCollectorSpec struct { // https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ // +optional InitContainers []v1.Container `json:"initContainers,omitempty"` + + // ObservabilitySpec defines how telemetry data gets handled. + // + // +optional + // +kubebuilder:validation:Optional + // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Observability" + Observability ObservabilitySpec `json:"observability,omitempty"` + // TopologySpreadConstraints embedded kubernetes pod configuration option, // controls how pods are spread across your cluster among failure-domains // such as regions, zones, nodes, and other user-defined topology domains @@ -371,6 +379,27 @@ type AutoscalerSpec struct { TargetMemoryUtilization *int32 `json:"targetMemoryUtilization,omitempty"` } +// MetricsConfigSpec defines a metrics config. +type MetricsConfigSpec struct { + // EnableMetrics specifies if ServiceMonitors should be created for the OpenTelemetry Collector. + // The operator.observability.prometheus feature gate must be enabled to use this feature. + // + // +optional + // +kubebuilder:validation:Optional + // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Create ServiceMonitors for OpenTelemetry Collector" + EnableMetrics bool `json:"enableMetrics,omitempty"` +} + +// ObservabilitySpec defines how telemetry data gets handled. +type ObservabilitySpec struct { + // Metrics defines the metrics configuration for operands. + // + // +optional + // +kubebuilder:validation:Optional + // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Metrics Config" + Metrics MetricsConfigSpec `json:"metrics,omitempty"` +} + // Probe defines the OpenTelemetry's pod probe config. Only Liveness probe is supported currently. type Probe struct { // Number of seconds after the container has started before liveness probes are initiated. diff --git a/apis/v1alpha1/zz_generated.deepcopy.go b/apis/v1alpha1/zz_generated.deepcopy.go index 433a1b516d..800934266d 100644 --- a/apis/v1alpha1/zz_generated.deepcopy.go +++ b/apis/v1alpha1/zz_generated.deepcopy.go @@ -354,6 +354,21 @@ func (in *MetricSpec) DeepCopy() *MetricSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MetricsConfigSpec) DeepCopyInto(out *MetricsConfigSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricsConfigSpec. +func (in *MetricsConfigSpec) DeepCopy() *MetricsConfigSpec { + if in == nil { + return nil + } + out := new(MetricsConfigSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Nginx) DeepCopyInto(out *Nginx) { *out = *in @@ -407,6 +422,22 @@ func (in *NodeJS) DeepCopy() *NodeJS { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ObservabilitySpec) DeepCopyInto(out *ObservabilitySpec) { + *out = *in + out.Metrics = in.Metrics +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ObservabilitySpec. +func (in *ObservabilitySpec) DeepCopy() *ObservabilitySpec { + if in == nil { + return nil + } + out := new(ObservabilitySpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *OpenShiftRoute) DeepCopyInto(out *OpenShiftRoute) { *out = *in @@ -614,6 +645,7 @@ func (in *OpenTelemetryCollectorSpec) DeepCopyInto(out *OpenTelemetryCollectorSp (*in)[i].DeepCopyInto(&(*out)[i]) } } + out.Observability = in.Observability if in.TopologySpreadConstraints != nil { in, out := &in.TopologySpreadConstraints, &out.TopologySpreadConstraints *out = make([]v1.TopologySpreadConstraint, len(*in)) diff --git a/bundle/manifests/opentelemetry-operator.clusterserviceversion.yaml b/bundle/manifests/opentelemetry-operator.clusterserviceversion.yaml index 1c40158fa0..e402c3d1cc 100644 --- a/bundle/manifests/opentelemetry-operator.clusterserviceversion.yaml +++ b/bundle/manifests/opentelemetry-operator.clusterserviceversion.yaml @@ -76,6 +76,18 @@ spec: - kind: StatefulSets name: "" version: apps/v1 + specDescriptors: + - description: ObservabilitySpec defines how telemetry data gets handled. + displayName: Observability + path: observability + - description: Metrics defines the metrics configuration for operands. + displayName: Metrics Config + path: observability.metrics + - description: EnableMetrics specifies if ServiceMonitors should be created + for the OpenTelemetry Collector. The operator.observability.prometheus feature + gate must be enabled to use this feature. + displayName: Create ServiceMonitors for OpenTelemetry Collector + path: observability.metrics.enableMetrics version: v1alpha1 description: |- OpenTelemetry is a collection of tools, APIs, and SDKs. You use it to instrument, generate, collect, and export telemetry data (metrics, logs, and traces) for analysis in order to understand your software's performance and behavior. @@ -209,6 +221,18 @@ spec: - get - list - update + - apiGroups: + - monitoring.coreos.com + resources: + - servicemonitors + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - networking.k8s.io resources: diff --git a/bundle/manifests/opentelemetry.io_opentelemetrycollectors.yaml b/bundle/manifests/opentelemetry.io_opentelemetrycollectors.yaml index a7673696f2..1a18ddb382 100644 --- a/bundle/manifests/opentelemetry.io_opentelemetrycollectors.yaml +++ b/bundle/manifests/opentelemetry.io_opentelemetrycollectors.yaml @@ -2949,6 +2949,19 @@ spec: This is only relevant to daemonset, statefulset, and deployment mode type: object + observability: + description: ObservabilitySpec defines how telemetry data gets handled. + properties: + metrics: + description: Metrics defines the metrics configuration for operands. + properties: + enableMetrics: + description: EnableMetrics specifies if ServiceMonitors should + be created for the OpenTelemetry Collector. The operator.observability.prometheus + feature gate must be enabled to use this feature. + type: boolean + type: object + type: object podAnnotations: additionalProperties: type: string diff --git a/config/crd/bases/opentelemetry.io_opentelemetrycollectors.yaml b/config/crd/bases/opentelemetry.io_opentelemetrycollectors.yaml index 98fd85fd0c..45c701979a 100644 --- a/config/crd/bases/opentelemetry.io_opentelemetrycollectors.yaml +++ b/config/crd/bases/opentelemetry.io_opentelemetrycollectors.yaml @@ -2946,6 +2946,19 @@ spec: This is only relevant to daemonset, statefulset, and deployment mode type: object + observability: + description: ObservabilitySpec defines how telemetry data gets handled. + properties: + metrics: + description: Metrics defines the metrics configuration for operands. + properties: + enableMetrics: + description: EnableMetrics specifies if ServiceMonitors should + be created for the OpenTelemetry Collector. The operator.observability.prometheus + feature gate must be enabled to use this feature. + type: boolean + type: object + type: object podAnnotations: additionalProperties: type: string diff --git a/config/manifests/bases/opentelemetry-operator.clusterserviceversion.yaml b/config/manifests/bases/opentelemetry-operator.clusterserviceversion.yaml index b36f42e8e6..8b814d4a91 100644 --- a/config/manifests/bases/opentelemetry-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/opentelemetry-operator.clusterserviceversion.yaml @@ -56,6 +56,18 @@ spec: - kind: StatefulSets name: "" version: apps/v1 + specDescriptors: + - description: ObservabilitySpec defines how telemetry data gets handled. + displayName: Observability + path: observability + - description: Metrics defines the metrics configuration for operands. + displayName: Metrics Config + path: observability.metrics + - description: EnableMetrics specifies if ServiceMonitors should be created + for the OpenTelemetry Collector. The operator.observability.prometheus feature + gate must be enabled to use this feature. + displayName: Create ServiceMonitors for OpenTelemetry Collector + path: observability.metrics.enableMetrics version: v1alpha1 description: |- OpenTelemetry is a collection of tools, APIs, and SDKs. You use it to instrument, generate, collect, and export telemetry data (metrics, logs, and traces) for analysis in order to understand your software's performance and behavior. diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index b907ad801b..33dca01315 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -119,6 +119,18 @@ rules: - get - list - update +- apiGroups: + - monitoring.coreos.com + resources: + - servicemonitors + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - networking.k8s.io resources: diff --git a/controllers/opentelemetrycollector_controller.go b/controllers/opentelemetrycollector_controller.go index efce335de0..75b84a50b7 100644 --- a/controllers/opentelemetrycollector_controller.go +++ b/controllers/opentelemetrycollector_controller.go @@ -21,6 +21,7 @@ import ( "sync" "github.com/go-logr/logr" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" appsv1 "k8s.io/api/apps/v1" autoscalingv2 "k8s.io/api/autoscaling/v2" autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2" @@ -35,6 +36,7 @@ import ( "github.com/open-telemetry/opentelemetry-operator/internal/config" "github.com/open-telemetry/opentelemetry-operator/pkg/autodetect" "github.com/open-telemetry/opentelemetry-operator/pkg/collector/reconcile" + "github.com/open-telemetry/opentelemetry-operator/pkg/featuregate" ) // OpenTelemetryCollectorReconciler reconciles a OpenTelemetryCollector object. @@ -164,6 +166,11 @@ func NewReconciler(p Params) *OpenTelemetryCollectorReconciler { "ingresses", true, }, + { + reconcile.ServiceMonitors, + "service monitors", + true, + }, { reconcile.Self, "opentelemetry", @@ -251,6 +258,10 @@ func (r *OpenTelemetryCollectorReconciler) SetupWithManager(mgr ctrl.Manager) er Owns(&appsv1.DaemonSet{}). Owns(&appsv1.StatefulSet{}) + if featuregate.PrometheusOperatorIsAvailable.IsEnabled() { + builder.Owns(&monitoringv1.ServiceMonitor{}) + } + autoscalingVersion := r.config.AutoscalingVersion() if autoscalingVersion == autodetect.AutoscalingVersionV2 { builder = builder.Owns(&autoscalingv2.HorizontalPodAutoscaler{}) diff --git a/docs/api.md b/docs/api.md index 71f4048161..d1db878bf4 100644 --- a/docs/api.md +++ b/docs/api.md @@ -3736,6 +3736,13 @@ OpenTelemetryCollectorSpec defines the desired state of OpenTelemetryCollector. NodeSelector to schedule OpenTelemetry Collector pods. This is only relevant to daemonset, statefulset, and deployment mode
false + + observability + object + + ObservabilitySpec defines how telemetry data gets handled.
+ + false podAnnotations map[string]string @@ -8985,6 +8992,60 @@ Liveness config for the OpenTelemetry Collector except the probe handler which i +### OpenTelemetryCollector.spec.observability +[↩ Parent](#opentelemetrycollectorspec) + + + +ObservabilitySpec defines how telemetry data gets handled. + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
metricsobject + Metrics defines the metrics configuration for operands.
+
false
+ + +### OpenTelemetryCollector.spec.observability.metrics +[↩ Parent](#opentelemetrycollectorspecobservability) + + + +Metrics defines the metrics configuration for operands. + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
enableMetricsboolean + EnableMetrics specifies if ServiceMonitors should be created for the OpenTelemetry Collector. The operator.observability.prometheus feature gate must be enabled to use this feature.
+
false
+ + ### OpenTelemetryCollector.spec.podSecurityContext [↩ Parent](#opentelemetrycollectorspec) diff --git a/go.mod b/go.mod index f7c1380afe..d7194f36ac 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/go-logr/logr v1.2.4 github.com/mitchellh/mapstructure v1.5.0 github.com/openshift/api v3.9.0+incompatible + github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.66.0 github.com/prometheus/prometheus v0.45.0 github.com/spf13/pflag v1.0.5 github.com/stretchr/testify v1.8.4 @@ -140,7 +141,7 @@ require ( golang.org/x/oauth2 v0.8.0 // indirect golang.org/x/sys v0.8.0 // indirect golang.org/x/term v0.8.0 // indirect - golang.org/x/text v0.9.0 // indirect + golang.org/x/text v0.10.0 // indirect golang.org/x/time v0.3.0 // indirect golang.org/x/tools v0.9.3 // indirect gomodules.xyz/jsonpatch/v2 v2.3.0 // indirect @@ -154,7 +155,7 @@ require ( gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/klog/v2 v2.100.1 // indirect k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f // indirect - k8s.io/utils v0.0.0-20230308161112-d77c459e9343 // indirect + k8s.io/utils v0.0.0-20230505201702-9f6742963106 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect sigs.k8s.io/yaml v1.3.0 // indirect diff --git a/go.sum b/go.sum index 19faea5d2b..b5d0b02072 100644 --- a/go.sum +++ b/go.sum @@ -440,6 +440,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI= github.com/posener/complete v1.2.3/go.mod h1:WZIdtGGp+qx0sLrYKtIRAruyNpv6hFCicSgv7Sy7s/s= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.66.0 h1:PPW01FLVjJHMNcbAL1DDD9EZceSQKMOU/VpK0irrxrI= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.66.0/go.mod h1:KZHvrby65G+rA4V/vMTUXDV22TI+GgLIrCigYClpjzk= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.4.0/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= @@ -731,8 +733,8 @@ golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= -golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.10.0 h1:UpjohKhiEgNc0CSauXmwYftY1+LlaC75SJwh0SgCX58= +golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -932,8 +934,8 @@ k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f h1:2kWPakN3i/k81b0gvD5C5F k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f/go.mod h1:byini6yhqGC14c3ebc/QwanvYwhuMWF6yz2F8uwW8eg= k8s.io/kubectl v0.27.4 h1:RV1TQLIbtL34+vIM+W7HaS3KfAbqvy9lWn6pWB9els4= k8s.io/kubectl v0.27.4/go.mod h1:qtc1s3BouB9KixJkriZMQqTsXMc+OAni6FeKAhq7q14= -k8s.io/utils v0.0.0-20230308161112-d77c459e9343 h1:m7tbIjXGcGIAtpmQr7/NAi7RsWoW3E7Zcm4jI1HicTc= -k8s.io/utils v0.0.0-20230308161112-d77c459e9343/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20230505201702-9f6742963106 h1:EObNQ3TW2D+WptiYXlApGNLVy0zm/JIBVY9i+M4wpAU= +k8s.io/utils v0.0.0-20230505201702-9f6742963106/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= diff --git a/hack/install-prometheus-operator.sh b/hack/install-prometheus-operator.sh new file mode 100755 index 0000000000..3809e9f15c --- /dev/null +++ b/hack/install-prometheus-operator.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +if [[ "$(kubectl api-resources --api-group=monitoring.coreos.com -o name)" ]]; then + echo "Prometheus CRDs are there" +else + kubectl create -f https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.66.0/bundle.yaml +fi diff --git a/kuttl-test-prometheuscr.yaml b/kuttl-test-prometheuscr.yaml new file mode 100644 index 0000000000..266e5ccd89 --- /dev/null +++ b/kuttl-test-prometheuscr.yaml @@ -0,0 +1,20 @@ +# Make sure that the OT operator after upgrading itself, can upgrade the OT collectors without error. +# The test is based on the version v0.49.0, a breaking change was introduced from PR +# https://github.com/open-telemetry/opentelemetry-operator/pull/797, which added a version label "app.kubernetes.io/version", +# The version label would change between OT operator upgrade, and since at the time, the collector pod selector was the same +# as this labels, resulted in selector being modified during reconciliation which caused error due to the selector is immutable. +# Please be aware of that the collector labels are changeable in various ways, so this issue may happen in any operator < v0.52.0 +# which changed the selector to be a static set of labels. +# The fix for this issue including: +# https://github.com/open-telemetry/opentelemetry-operator/issues/840, make the selector be a static set of labels; +# https://github.com/open-telemetry/opentelemetry-operator/issues/1117, delete the old collector to let the operator +# create a new one when the selector changed. +apiVersion: kuttl.dev/v1beta1 +kind: TestSuite +commands: + - command: make undeploy + - command: make enable-prometheus-feature-flag deploy install-prometheus-operator + - command: go run hack/check-operator-ready.go +testDirs: + - ./tests/e2e-prometheuscr/ +timeout: 300 diff --git a/main.go b/main.go index 9964d3d2a2..c7538a3068 100644 --- a/main.go +++ b/main.go @@ -25,6 +25,7 @@ import ( "time" routev1 "github.com/openshift/api/route/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/spf13/pflag" colfeaturegate "go.opentelemetry.io/collector/featuregate" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -71,6 +72,7 @@ func init() { utilruntime.Must(otelv1alpha1.AddToScheme(scheme)) utilruntime.Must(routev1.AddToScheme(scheme)) + utilruntime.Must(monitoringv1.AddToScheme(scheme)) // +kubebuilder:scaffold:scheme } diff --git a/pkg/collector/reconcile/servicemonitor.go b/pkg/collector/reconcile/servicemonitor.go new file mode 100644 index 0000000000..7da2f275be --- /dev/null +++ b/pkg/collector/reconcile/servicemonitor.go @@ -0,0 +1,170 @@ +// Copyright The OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package reconcile + +import ( + "context" + "fmt" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + + "github.com/open-telemetry/opentelemetry-operator/pkg/featuregate" + "github.com/open-telemetry/opentelemetry-operator/pkg/naming" +) + +// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors,verbs=get;list;watch;create;update;patch;delete + +// ServiceMonitors reconciles the service monitor(s) required for the instance in the current context. +func ServiceMonitors(ctx context.Context, params Params) error { + if !params.Instance.Spec.Observability.Metrics.EnableMetrics || !featuregate.PrometheusOperatorIsAvailable.IsEnabled() { + return nil + } + + desired := desiredServiceMonitors(ctx, params) + + // first, handle the create/update parts + if err := expectedServiceMonitors(ctx, params, desired); err != nil { + return fmt.Errorf("failed to reconcile the expected service monitors: %w", err) + } + + // then, delete the extra objects + if err := deleteServiceMonitors(ctx, params, desired); err != nil { + return fmt.Errorf("failed to reconcile the service monitors to be deleted: %w", err) + } + + return nil +} + +func desiredServiceMonitors(_ context.Context, params Params) []monitoringv1.ServiceMonitor { + col := params.Instance + return []monitoringv1.ServiceMonitor{ + { + ObjectMeta: metav1.ObjectMeta{ + Namespace: col.Namespace, + Name: naming.ServiceMonitor(col), + Labels: map[string]string{ + "app.kubernetes.io/name": naming.ServiceMonitor(params.Instance), + "app.kubernetes.io/instance": fmt.Sprintf("%s.%s", params.Instance.Namespace, params.Instance.Name), + "app.kubernetes.io/managed-by": "opentelemetry-operator", + }, + }, + Spec: monitoringv1.ServiceMonitorSpec{ + Endpoints: []monitoringv1.Endpoint{{ + Port: "monitoring", + }}, + NamespaceSelector: monitoringv1.NamespaceSelector{ + MatchNames: []string{col.Namespace}, + }, + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app.kubernetes.io/managed-by": "opentelemetry-operator", + }, + }, + }, + }, + } +} + +func expectedServiceMonitors(ctx context.Context, params Params, expected []monitoringv1.ServiceMonitor) error { + for _, obj := range expected { + desired := obj + + if err := controllerutil.SetControllerReference(¶ms.Instance, &desired, params.Scheme); err != nil { + return fmt.Errorf("failed to set controller reference: %w", err) + } + + existing := &monitoringv1.ServiceMonitor{} + nns := types.NamespacedName{Namespace: desired.Namespace, Name: desired.Name} + err := params.Client.Get(ctx, nns, existing) + if err != nil && k8serrors.IsNotFound(err) { + if err = params.Client.Create(ctx, &desired); err != nil { + return fmt.Errorf("failed to create: %w", err) + } + params.Log.V(2).Info("created", "servicemonitor.name", desired.Name, "servicemonitor.namespace", desired.Namespace) + continue + } else if err != nil { + return fmt.Errorf("failed to get: %w", err) + } + + // it exists already, merge the two if the end result isn't identical to the existing one + updated := existing.DeepCopy() + if updated.Annotations == nil { + updated.Annotations = map[string]string{} + } + if updated.Labels == nil { + updated.Labels = map[string]string{} + } + updated.ObjectMeta.OwnerReferences = desired.ObjectMeta.OwnerReferences + updated.Spec.Endpoints = desired.Spec.Endpoints + updated.Spec.NamespaceSelector = desired.Spec.NamespaceSelector + updated.Spec.Selector = desired.Spec.Selector + + for k, v := range desired.ObjectMeta.Annotations { + updated.ObjectMeta.Annotations[k] = v + } + for k, v := range desired.ObjectMeta.Labels { + updated.ObjectMeta.Labels[k] = v + } + + patch := client.MergeFrom(existing) + + if err := params.Client.Patch(ctx, updated, patch); err != nil { + return fmt.Errorf("failed to apply changes: %w", err) + } + + params.Log.V(2).Info("applied", "servicemonitor.name", desired.Name, "servicemonitor.namespace", desired.Namespace) + } + return nil +} + +func deleteServiceMonitors(ctx context.Context, params Params, expected []monitoringv1.ServiceMonitor) error { + opts := []client.ListOption{ + client.InNamespace(params.Instance.Namespace), + client.MatchingLabels(map[string]string{ + "app.kubernetes.io/instance": fmt.Sprintf("%s.%s", params.Instance.Namespace, params.Instance.Name), + "app.kubernetes.io/managed-by": "opentelemetry-operator", + }), + } + + list := &monitoringv1.ServiceMonitorList{} + if err := params.Client.List(ctx, list, opts...); err != nil { + return fmt.Errorf("failed to list: %w", err) + } + + for i := range list.Items { + existing := list.Items[i] + del := true + for _, keep := range expected { + if keep.Name == existing.Name && keep.Namespace == existing.Namespace { + del = false + break + } + } + + if del { + if err := params.Client.Delete(ctx, existing); err != nil { + return fmt.Errorf("failed to delete: %w", err) + } + params.Log.V(2).Info("deleted", "servicemonitor.name", existing.Name, "servicemonitor.namespace", existing.Namespace) + } + } + + return nil +} diff --git a/pkg/collector/reconcile/servicemonitor_test.go b/pkg/collector/reconcile/servicemonitor_test.go new file mode 100644 index 0000000000..d3ae598da7 --- /dev/null +++ b/pkg/collector/reconcile/servicemonitor_test.go @@ -0,0 +1,138 @@ +// Copyright The OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package reconcile + +import ( + "context" + "testing" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + colfeaturegate "go.opentelemetry.io/collector/featuregate" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + "github.com/open-telemetry/opentelemetry-operator/pkg/featuregate" +) + +func TestDesiredServiceMonitors(t *testing.T) { + params := params() + + actual := desiredServiceMonitors(context.Background(), params) + assert.NotNil(t, actual) +} + +func TestExpectedServiceMonitors(t *testing.T) { + originalVal := featuregate.PrometheusOperatorIsAvailable.IsEnabled() + require.NoError(t, colfeaturegate.GlobalRegistry().Set(featuregate.PrometheusOperatorIsAvailable.ID(), false)) + t.Cleanup(func() { + require.NoError(t, colfeaturegate.GlobalRegistry().Set(featuregate.PrometheusOperatorIsAvailable.ID(), originalVal)) + }) + + t.Run("should create the service monitor", func(t *testing.T) { + p := params() + p.Instance.Spec.Observability.Metrics.EnableMetrics = true + + err := expectedServiceMonitors( + context.Background(), + p, + []monitoringv1.ServiceMonitor{servicemonitor("test-collector")}, + ) + assert.NoError(t, err) + + exists, err := populateObjectIfExists(t, &monitoringv1.ServiceMonitor{}, types.NamespacedName{Namespace: "default", Name: "test-collector"}) + + assert.NoError(t, err) + assert.True(t, exists) + + }) +} + +func TestDeleteServiceMonitors(t *testing.T) { + t.Run("should delete excess service monitors", func(t *testing.T) { + name := "sm-to-delete" + deleteServiceMonitor := servicemonitor(name) + createObjectIfNotExists(t, name, &deleteServiceMonitor) + + exists, err := populateObjectIfExists(t, &monitoringv1.ServiceMonitor{}, types.NamespacedName{Namespace: "default", Name: name}) + assert.NoError(t, err) + assert.True(t, exists) + + desired := desiredServiceMonitors(context.Background(), params()) + err = deleteServiceMonitors(context.Background(), params(), desired) + assert.NoError(t, err) + + exists, err = populateObjectIfExists(t, &v1.Service{}, types.NamespacedName{Namespace: "default", Name: name}) + assert.NoError(t, err) + assert.False(t, exists) + }) +} + +func servicemonitor(name string) monitoringv1.ServiceMonitor { + return monitoringv1.ServiceMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "default", + }, + Spec: monitoringv1.ServiceMonitorSpec{ + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app.kubernetes.io/managed-by": "opentelemetry-operator", + }, + }, + NamespaceSelector: monitoringv1.NamespaceSelector{ + MatchNames: []string{"default"}, + }, + Endpoints: []monitoringv1.Endpoint{ + { + Port: "monitoring", + }, + }, + }, + } +} + +func TestServiceMonitors(t *testing.T) { + t.Run("not enabled", func(t *testing.T) { + ctx := context.Background() + err := ServiceMonitors(ctx, params()) + assert.Nil(t, err) + }) + + t.Run("enabled but featuregate not enabled", func(t *testing.T) { + ctx := context.Background() + p := params() + p.Instance.Spec.Observability.Metrics.EnableMetrics = true + err := ServiceMonitors(ctx, p) + assert.Nil(t, err) + }) + + t.Run("enabled and featuregate enabled", func(t *testing.T) { + originalVal := featuregate.PrometheusOperatorIsAvailable.IsEnabled() + require.NoError(t, colfeaturegate.GlobalRegistry().Set(featuregate.PrometheusOperatorIsAvailable.ID(), false)) + t.Cleanup(func() { + require.NoError(t, colfeaturegate.GlobalRegistry().Set(featuregate.PrometheusOperatorIsAvailable.ID(), originalVal)) + }) + + ctx := context.Background() + p := params() + p.Instance.Spec.Observability.Metrics.EnableMetrics = true + err := ServiceMonitors(ctx, p) + assert.Nil(t, err) + }) + +} diff --git a/pkg/collector/reconcile/suite_test.go b/pkg/collector/reconcile/suite_test.go index 8bc3efcf49..f0f37bba2a 100644 --- a/pkg/collector/reconcile/suite_test.go +++ b/pkg/collector/reconcile/suite_test.go @@ -26,6 +26,7 @@ import ( "time" routev1 "github.com/openshift/api/route/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/stretchr/testify/assert" v1 "k8s.io/api/core/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" @@ -76,7 +77,10 @@ func TestMain(m *testing.M) { testEnv = &envtest.Environment{ CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "crd", "bases")}, CRDInstallOptions: envtest.CRDInstallOptions{ - CRDs: []*apiextensionsv1.CustomResourceDefinition{testdata.OpenShiftRouteCRD}, + CRDs: []*apiextensionsv1.CustomResourceDefinition{ + testdata.OpenShiftRouteCRD, + testdata.ServiceMonitorCRD, + }, }, WebhookInstallOptions: envtest.WebhookInstallOptions{ Paths: []string{filepath.Join("..", "..", "..", "config", "webhook")}, @@ -97,6 +101,11 @@ func TestMain(m *testing.M) { fmt.Printf("failed to register scheme: %v", err) os.Exit(1) } + + if err = monitoringv1.AddToScheme(testScheme); err != nil { + fmt.Printf("failed to register scheme: %v", err) + os.Exit(1) + } // +kubebuilder:scaffold:scheme k8sClient, err = client.New(cfg, client.Options{Scheme: testScheme}) diff --git a/pkg/collector/testdata/sm_crd.go b/pkg/collector/testdata/sm_crd.go new file mode 100644 index 0000000000..cb7322dbf6 --- /dev/null +++ b/pkg/collector/testdata/sm_crd.go @@ -0,0 +1,38 @@ +package testdata + +import ( + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// ServiceMonitorCRD as go structure. +var ServiceMonitorCRD = &apiextensionsv1.CustomResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{ + Name: "servicemonitors.monitoring.coreos.com", + }, + Spec: apiextensionsv1.CustomResourceDefinitionSpec{ + Group: "monitoring.coreos.com", + Versions: []apiextensionsv1.CustomResourceDefinitionVersion{ + { + Name: "v1", + Served: true, + Storage: true, + Schema: &apiextensionsv1.CustomResourceValidation{ + OpenAPIV3Schema: &apiextensionsv1.JSONSchemaProps{ + Type: "object", + XPreserveUnknownFields: func(v bool) *bool { return &v }(true), + }, + }, + Subresources: &apiextensionsv1.CustomResourceSubresources{ + Status: &apiextensionsv1.CustomResourceSubresourceStatus{}, + }, + }, + }, + Scope: apiextensionsv1.NamespaceScoped, + Names: apiextensionsv1.CustomResourceDefinitionNames{ + Plural: "servicemonitors", + Singular: "servicemonitor", + Kind: "ServiceMonitor", + }, + }, +} diff --git a/pkg/featuregate/featuregate.go b/pkg/featuregate/featuregate.go index c2bcfd250e..3f9f95bf4c 100644 --- a/pkg/featuregate/featuregate.go +++ b/pkg/featuregate/featuregate.go @@ -70,6 +70,14 @@ var ( featuregate.WithRegisterDescription("controls whether the operator should configure the collector's targetAllocator configuration"), featuregate.WithRegisterFromVersion("v0.76.1"), ) + + // PrometheusOperatorIsAvailable is the feature gate that enables features associated to the Prometheus Operator. + PrometheusOperatorIsAvailable = featuregate.GlobalRegistry().MustRegister( + "operator.observability.prometheus", + featuregate.StageAlpha, + featuregate.WithRegisterDescription("enables features associated to the Prometheus Operator"), + featuregate.WithRegisterFromVersion("v0.82.0"), + ) ) // Flags creates a new FlagSet that represents the available featuregate flags using the supplied featuregate registry. diff --git a/pkg/naming/main.go b/pkg/naming/main.go index af53636ef6..d7117c41ce 100644 --- a/pkg/naming/main.go +++ b/pkg/naming/main.go @@ -109,6 +109,11 @@ func ServiceAccount(otelcol v1alpha1.OpenTelemetryCollector) string { return DNSName(Truncate("%s-collector", 63, otelcol.Name)) } +// ServiceMonitor builds the service account name based on the instance. +func ServiceMonitor(otelcol v1alpha1.OpenTelemetryCollector) string { + return DNSName(Truncate("%s-collector", 63, otelcol.Name)) +} + // TargetAllocatorServiceAccount returns the TargetAllocator service account resource name. func TargetAllocatorServiceAccount(otelcol v1alpha1.OpenTelemetryCollector) string { return DNSName(Truncate("%s-targetallocator", 63, otelcol.Name)) diff --git a/tests/e2e-prometheuscr/gather-metrics-from-operands/00-assert.yaml b/tests/e2e-prometheuscr/gather-metrics-from-operands/00-assert.yaml new file mode 100644 index 0000000000..586d61e942 --- /dev/null +++ b/tests/e2e-prometheuscr/gather-metrics-from-operands/00-assert.yaml @@ -0,0 +1,10 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: simplest-collector +spec: + endpoints: + - port: monitoring + selector: + matchLabels: + app.kubernetes.io/managed-by: opentelemetry-operator diff --git a/tests/e2e-prometheuscr/gather-metrics-from-operands/00-install.yaml b/tests/e2e-prometheuscr/gather-metrics-from-operands/00-install.yaml new file mode 100644 index 0000000000..dffa7a0500 --- /dev/null +++ b/tests/e2e-prometheuscr/gather-metrics-from-operands/00-install.yaml @@ -0,0 +1,28 @@ +apiVersion: opentelemetry.io/v1alpha1 +kind: OpenTelemetryCollector +metadata: + name: simplest +spec: + observability: + metrics: + enableMetrics: true + config: | + receivers: + jaeger: + protocols: + grpc: + otlp: + protocols: + grpc: + http: + processors: + + exporters: + logging: + + service: + pipelines: + traces: + receivers: [jaeger,otlp] + processors: [] + exporters: [logging]