Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(prometheus.operator): retry GetInformer when running prometheus operator components #6415

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ v0.40.0 (2024-02-27)

- Fix issue where registry was not being properly deleted. (@mattdurham)

- Add retry when attempting to run `prometheus.operator` flow components. (@onematchfox)

### Other changes

- Removed support for Windows 2012 in line with Microsoft end of life. (@mattdurham)
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ require (
k8s.io/component-base v0.28.1
k8s.io/klog/v2 v2.100.1
k8s.io/utils v0.0.0-20230726121419-3b25d923346b
sigs.k8s.io/controller-runtime v0.16.2
sigs.k8s.io/controller-runtime v0.16.2 // TODO: Remove custom rest mapper from component/prometheus/operator/common/crdmanager.go when upgrading past v0.17.0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the comment meant to be in the go.mod? 👀

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I intentionally put it here so that it will be visible in the diff when someone does upgrade this dependency

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool, thanks for clarifying!

sigs.k8s.io/yaml v1.3.0
)

Expand Down
48 changes: 43 additions & 5 deletions internal/component/prometheus/operator/common/crdmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/grafana/agent/internal/service/http"
"github.com/grafana/agent/internal/service/labelstore"
"github.com/grafana/ckit/shard"
"github.com/grafana/dskit/backoff"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/discovery"
Expand All @@ -27,6 +28,7 @@ import (
toolscache "k8s.io/client-go/tools/cache"
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"

"github.com/grafana/agent/internal/component/prometheus/operator"
"github.com/grafana/agent/internal/component/prometheus/operator/configgen"
Expand All @@ -39,6 +41,13 @@ import (
// Generous timeout period for configuring all informers
const informerSyncTimeout = 10 * time.Second

// Retry configuration for configuring informers
var informerBackoff = backoff.Config{
MinBackoff: 100 * time.Millisecond,
MaxBackoff: time.Second,
MaxRetries: 10,
}

// crdManager is all of the fields required to run a crd based component.
// on update, this entire thing should be recreated and restarted
type crdManager struct {
Expand Down Expand Up @@ -132,7 +141,7 @@ func (c *crdManager) Run(ctx context.Context) error {
if err := c.runInformers(restConfig, ctx); err != nil {
return err
}
level.Info(c.logger).Log("msg", "informers started")
level.Info(c.logger).Log("msg", "informers started")

var cachedTargets map[string][]*targetgroup.Group
// Start the target discovery loop to update the scrape manager with new targets.
Expand Down Expand Up @@ -266,6 +275,20 @@ func (c *crdManager) runInformers(restConfig *rest.Config, ctx context.Context)
if ls != labels.Nothing() {
opts.DefaultLabelSelector = ls
}

// TODO: Remove custom opts.Mapper when sigs.k8s.io/controller-runtime >= 0.17.0 as `NewDynamicRESTMapper` is the default in that version
var err error
opts.HTTPClient, err = rest.HTTPClientFor(restConfig)
if err != nil {
return err
}

opts.Mapper, err = apiutil.NewDynamicRESTMapper(restConfig, opts.HTTPClient)
if err != nil {
return fmt.Errorf("could not create RESTMapper from config: %w", err)
}
// TODO: end custom opts.Mapps

cache, err := cache.New(restConfig, opts)
if err != nil {
return err
Expand Down Expand Up @@ -305,16 +328,31 @@ func (c *crdManager) configureInformers(ctx context.Context, informers cache.Inf
return fmt.Errorf("unknown kind to configure Informers: %s", c.kind)
}

informerCtx, cancel := context.WithTimeout(ctx, informerSyncTimeout)
defer cancel()
var informer cache.Informer
var err error
bo := backoff.New(ctx, informerBackoff)

for bo.Ongoing() {
informerCtx, cancel := context.WithTimeout(ctx, informerSyncTimeout)
defer cancel()

informer, err = informers.GetInformer(informerCtx, prototype)
if err == nil {
// Successfully got the informer
break
}

informer, err := informers.GetInformer(informerCtx, prototype)
if err != nil {
if errors.Is(informerCtx.Err(), context.DeadlineExceeded) { // Check the context to prevent GetInformer returning a fake timeout
return fmt.Errorf("timeout exceeded while configuring informers. Check the connection"+
" to the Kubernetes API is stable and that the Agent has appropriate RBAC permissions for %v", prototype)
}

level.Warn(c.logger).Log("msg", "failed to get informer - will retry", "err", err)

bo.Wait()
}

if err != nil {
return err
}
const resync = 5 * time.Minute
Expand Down
Loading