Skip to content

Commit

Permalink
Retry initialization error conditions (#2979)
Browse files Browse the repository at this point in the history
When the API server is flaky (e.g. during a cluster install), it is
possible for some of the OLM initialization to fail. When this happens,
OLM gets into a bad state (e.g. a monitoring goroutine terminates)
and can't recover without a restart.

There were at least two places I found where a retry mechanism is
needed to handle initialization errors. This was as far as I peeled the
onion. It's not an exponential backoff retry, but a 1-minute retry
interval should be sufficient (no other backoffs are exponential).

The ServerVersion check retries only once, after a one-minute wait. This
required fixing a unit test to take the retry into account.

Signed-off-by: Todd Short <[email protected]>
  • Loading branch information
tmshort authored Jul 5, 2023
1 parent 12217d1 commit e908cfc
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 4 deletions.
15 changes: 12 additions & 3 deletions pkg/lib/operatorstatus/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,18 @@ func (m *monitor) Run(stopCh <-chan struct{}) {
m.logger.Infof("initializing clusteroperator resource(s) for %s", m.names)

for _, name := range m.names {
if err := m.init(name); err != nil {
m.logger.Errorf("initialization error - %v", err)
break
for {
if err := m.init(name); err != nil {
m.logger.Errorf("initialization error - %v", err)
} else {
m.logger.Infof("initialized cluster resource - %s", name)
break
}
select {
case <-time.After(defaultProbeInterval):
case <-stopCh:
return
}
}
}

Expand Down
15 changes: 15 additions & 0 deletions pkg/lib/queueinformer/queueinformer_operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"sync"
"time"

"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubestate"
"github.com/pkg/errors"
Expand All @@ -13,6 +14,10 @@ import (
"k8s.io/client-go/tools/cache"
)

// defaultServerVersionInterval is how long to wait after a failed server
// version query before querying the server version again.
const defaultServerVersionInterval = time.Minute

// ExtensibleOperator describes a Reconciler that can be extended with additional informers and queue informers
type ExtensibleOperator interface {
// RegisterQueueInformer registers the given QueueInformer with the Operator.
Expand Down Expand Up @@ -194,6 +199,16 @@ func (o *operator) start(ctx context.Context) error {
go func() {
defer close(errs)
v, err := o.serverVersion.ServerVersion()
if err == nil {
o.logger.Infof("connection established. cluster-version: %v", v)
return
}
select {
case <-time.After(defaultServerVersionInterval):
case <-ctx.Done():
return
}
v, err = o.serverVersion.ServerVersion()
if err != nil {
select {
case errs <- errors.Wrap(err, "communicating with server failed"):
Expand Down
2 changes: 1 addition & 1 deletion pkg/lib/queueinformer/queueinformer_operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ func TestOperatorRunChannelClosure(t *testing.T) {

o.Run(ctx)

timeout := time.After(time.Second)
timeout := time.After(2 * defaultServerVersionInterval)
for n, ch := range map[string]<-chan struct{}{
"ready": o.Ready(),
"done": o.Done(),
Expand Down

0 comments on commit e908cfc

Please sign in to comment.