diff --git a/changelog/fragments/1687301871-service-command-retries.yaml b/changelog/fragments/1687301871-service-command-retries.yaml deleted file mode 100644 index 162f7ecce89..00000000000 --- a/changelog/fragments/1687301871-service-command-retries.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: enhancement - -# Change summary; a 80ish characters long description of the change. -summary: Retry service commands indefinitely with exponential backoff - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -#description: - -# Affected component; a word indicating the component this changeset affects. -component: agent - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -pr: https://github.com/elastic/elastic-agent/pull/2889 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -#issue: https://github.com/owner/repo/1234 diff --git a/docs/component-specs.md b/docs/component-specs.md index c0da37f345f..ac29df86a31 100644 --- a/docs/component-specs.md +++ b/docs/component-specs.md @@ -154,8 +154,6 @@ The path to this service's logs directory. - `args` (identical to `command.args`): the command-line arguments to pass for this operation - `env` (identical to `command.env`): the environment variables to set for this operation - `timeout`: the timeout duration for this operation. -- `retry`: (optional) configuration for retrying the operation - - `init_interval`: interval before the first retry. The interval between subsequent retries will increase exponentially (with some random jitter). For example: diff --git a/internal/pkg/agent/install/uninstall.go b/internal/pkg/agent/install/uninstall.go index f4810274970..9c27bb88704 100644 --- a/internal/pkg/agent/install/uninstall.go +++ b/internal/pkg/agent/install/uninstall.go @@ -202,7 +202,7 @@ func uninstallServiceComponent(ctx context.Context, log *logp.Logger, comp compo // Do not use infinite retries when uninstalling from the command line. If the uninstall needs to be // retried the entire uninstall command can be retried. Retries may complete asynchronously with the // execution of the uninstall command, leading to bugs like https://github.com/elastic/elastic-agent/issues/3060. - return comprt.UninstallService(ctx, log, comp, false) + return comprt.UninstallService(ctx, log, comp) } func serviceComponentsFromConfig(specs component.RuntimeSpecs, cfg *config.Config) ([]component.Component, error) { diff --git a/pkg/component/runtime/service.go b/pkg/component/runtime/service.go index 08d3fb8f206..71d71890052 100644 --- a/pkg/component/runtime/service.go +++ b/pkg/component/runtime/service.go @@ -29,9 +29,7 @@ var ( ErrInvalidServiceSpec = errors.New("invalid service spec") ) -// executeServiceCommandFunc executes the given binary according to configuration in spec. If shouldRetry == true, -// the command will be retried indefinitely; otherwise, it will not be retried. -type executeServiceCommandFunc func(ctx context.Context, log *logger.Logger, binaryPath string, spec *component.ServiceOperationsCommandSpec, shouldRetry bool) error +type executeServiceCommandFunc func(ctx context.Context, log *logger.Logger, binaryPath string, spec *component.ServiceOperationsCommandSpec) error // serviceRuntime provides the command runtime for running a component as a service. type serviceRuntime struct { @@ -435,7 +433,7 @@ func (s *serviceRuntime) check(ctx context.Context) error { return ErrOperationSpecUndefined } s.log.Debugf("check if the %s is installed", s.comp.InputSpec.BinaryName) - return s.executeServiceCommandImpl(ctx, s.log, s.comp.InputSpec.BinaryPath, s.comp.InputSpec.Spec.Service.Operations.Check, false) + return s.executeServiceCommandImpl(ctx, s.log, s.comp.InputSpec.BinaryPath, s.comp.InputSpec.Spec.Service.Operations.Check) } // install executes the service install command @@ -445,26 +443,26 @@ func (s *serviceRuntime) install(ctx context.Context) error { return ErrOperationSpecUndefined } s.log.Debugf("install %s service", s.comp.InputSpec.BinaryName) - return s.executeServiceCommandImpl(ctx, s.log, s.comp.InputSpec.BinaryPath, s.comp.InputSpec.Spec.Service.Operations.Install, true) + return s.executeServiceCommandImpl(ctx, s.log, s.comp.InputSpec.BinaryPath, s.comp.InputSpec.Spec.Service.Operations.Install) } // uninstall executes the service uninstall command func (s *serviceRuntime) uninstall(ctx context.Context) error { // Always retry for internal attempts to uninstall, because they are an attempt to converge the agent's current state // with its desired state based on the agent policy. - return uninstallService(ctx, s.log, s.comp, s.executeServiceCommandImpl, true) + return uninstallService(ctx, s.log, s.comp, s.executeServiceCommandImpl) } // UninstallService uninstalls the service. When shouldRetry is true the uninstall command will be retried until it succeeds. -func UninstallService(ctx context.Context, log *logger.Logger, comp component.Component, shouldRetry bool) error { - return uninstallService(ctx, log, comp, executeServiceCommand, shouldRetry) +func UninstallService(ctx context.Context, log *logger.Logger, comp component.Component) error { + return uninstallService(ctx, log, comp, executeServiceCommand) } -func uninstallService(ctx context.Context, log *logger.Logger, comp component.Component, executeServiceCommandImpl executeServiceCommandFunc, shouldRetry bool) error { +func uninstallService(ctx context.Context, log *logger.Logger, comp component.Component, executeServiceCommandImpl executeServiceCommandFunc) error { if comp.InputSpec.Spec.Service.Operations.Uninstall == nil { log.Errorf("missing uninstall spec for %s service", comp.InputSpec.BinaryName) return ErrOperationSpecUndefined } log.Debugf("uninstall %s service", comp.InputSpec.BinaryName) - return executeServiceCommandImpl(ctx, log, comp.InputSpec.BinaryPath, comp.InputSpec.Spec.Service.Operations.Uninstall, shouldRetry) + return executeServiceCommandImpl(ctx, log, comp.InputSpec.BinaryPath, comp.InputSpec.Spec.Service.Operations.Uninstall) } diff --git a/pkg/component/runtime/service_command.go b/pkg/component/runtime/service_command.go index e0c75a2cb69..61ccda076be 100644 --- a/pkg/component/runtime/service_command.go +++ b/pkg/component/runtime/service_command.go @@ -7,19 +7,13 @@ package runtime import ( "bufio" "context" - "crypto/sha256" - "encoding/hex" "errors" "fmt" "os/exec" "path/filepath" - "sort" "strings" - "sync" "time" - "github.com/cenkalti/backoff/v4" - "github.com/dolmen-go/contextio" "github.com/elastic/elastic-agent/pkg/component" @@ -27,8 +21,6 @@ import ( "github.com/elastic/elastic-agent/pkg/core/process" ) -var serviceCmdRetrier = cmdRetrier{} - func executeCommand(ctx context.Context, log *logger.Logger, binaryPath string, args []string, env []string, timeout time.Duration) error { log = log.With("context", "command output") // Create context with timeout if the timeout is greater than 0 @@ -63,22 +55,24 @@ func executeCommand(ctx context.Context, log *logger.Logger, binaryPath string, // channel for the last error message from the stderr output errch := make(chan string, 1) ctxStderr := contextio.NewReader(ctx, proc.Stderr) - go func() { - var errText string - scanner := bufio.NewScanner(ctxStderr) - for scanner.Scan() { - line := scanner.Text() - if len(line) > 0 { - txt := strings.TrimSpace(line) - if len(txt) > 0 { - errText = txt - // Log error output line - log.Error(errText) + if ctxStderr != nil { + go func() { + var errText string + scanner := bufio.NewScanner(ctxStderr) + for scanner.Scan() { + line := scanner.Text() + if len(line) > 0 { + txt := strings.TrimSpace(line) + if len(txt) > 0 { + errText = txt + // Log error output line + log.Error(errText) + } } } - } - errch <- errText - }() + errch <- errText + }() + } procState := <-proc.Wait() if errors.Is(ctx.Err(), context.DeadlineExceeded) { @@ -98,200 +92,12 @@ func executeCommand(ctx context.Context, log *logger.Logger, binaryPath string, return err } -func executeServiceCommand(ctx context.Context, log *logger.Logger, binaryPath string, spec *component.ServiceOperationsCommandSpec, shouldRetry bool) error { +func executeServiceCommand(ctx context.Context, log *logger.Logger, binaryPath string, spec *component.ServiceOperationsCommandSpec) error { if spec == nil { log.Warnf("spec is nil, nothing to execute, binaryPath: %s", binaryPath) return nil } - - if !shouldRetry { - return executeCommand(ctx, log, binaryPath, spec.Args, envSpecToEnv(spec.Env), spec.Timeout) - } - - executeServiceCommandWithRetries( - ctx, log, binaryPath, spec, - context.Background(), 20*time.Second, 15*time.Minute, - ) - return nil -} - -func executeServiceCommandWithRetries( - cmdCtx context.Context, log *logger.Logger, binaryPath string, spec *component.ServiceOperationsCommandSpec, - retryCtx context.Context, defaultRetryInitInterval time.Duration, retryMaxInterval time.Duration, -) { - // If no initial retry interval is specified, use default value - retryInitInterval := spec.Retry.InitInterval - if retryInitInterval == 0 { - retryInitInterval = defaultRetryInitInterval - } - - serviceCmdRetrier.Start( - cmdCtx, log, - binaryPath, spec.Args, envSpecToEnv(spec.Env), spec.Timeout, - retryCtx, retryInitInterval, retryMaxInterval, - ) -} - -type cmdRetryInfo struct { - retryCancelFn context.CancelFunc - cmdCancelFn context.CancelFunc - cmdDone <-chan struct{} -} - -type cmdRetrier struct { - mu sync.RWMutex - cmds map[string]cmdRetryInfo -} - -func (cr *cmdRetrier) Start( - cmdCtx context.Context, log *logger.Logger, - binaryPath string, args []string, env []string, timeout time.Duration, - retryCtx context.Context, retrySleepInitDuration time.Duration, retrySleepMaxDuration time.Duration, -) { - cmdKey := cr.cmdKey(binaryPath, args, env) - - // Due to infinite retries, we may still be trying to (re)execute - // a command from a previous call to Start(). We should first stop - // these retries as well as the command process. - cr.Stop(cmdKey, log) - - // Track the command so we can cancel it and it's retries later. - cmdCtx, cmdCancelFn := context.WithCancel(cmdCtx) - retryCtx, retryCancelFn := context.WithCancel(retryCtx) - cmdDone := make(chan struct{}, 1) - cr.track(cmdKey, cmdCancelFn, retryCancelFn, cmdDone) - - // Execute command with retries and exponential backoff between attempts - expBackoff := backoff.NewExponentialBackOff() - expBackoff.InitialInterval = retrySleepInitDuration - expBackoff.MaxInterval = retrySleepMaxDuration - - backoffCtx := backoff.WithContext(expBackoff, retryCtx) - - // Since we will be executing the command with infinite retries, we don't - // want to block. So we execute the command with retries in its own - // goroutine. - go func() { - retryAttempt := 0 - - // Here we indefinitely retry the executeCommand call, as long as it - // returns a non-nil error. We will block here until executeCommand - // returns a nil error, indicating that the command being executed has - // successfully completed execution. - //nolint: errcheck // No point checking the error inside the goroutine. - backoff.RetryNotify( - func() error { - err := executeCommand(cmdCtx, log, binaryPath, args, env, timeout) - cmdDone <- struct{}{} - return err - }, - backoffCtx, - func(err error, retryAfter time.Duration) { - retryAttempt++ - log.Warnf( - "service command execution failed with error [%s], retrying (will be retry [%d]) after [%s]", - err.Error(), - retryAttempt, - retryAfter, - ) - <-cmdDone - }, - ) - - cr.untrack(cmdKey) - }() -} - -func (cr *cmdRetrier) Stop(cmdKey string, log *logger.Logger) { - cr.mu.Lock() - defer cr.mu.Unlock() - info, exists := cr.cmds[cmdKey] - if !exists { - log.Debugf("no retries for command key [%s] are pending; nothing to do", cmdKey) - return - } - - // Cancel the previous retries - info.retryCancelFn() - - // Cancel the previous command - info.cmdCancelFn() - - // Ensure that the previous command actually stopped running - <-info.cmdDone - - // Stop tracking - delete(cr.cmds, cmdKey) - log.Debugf("retries and command process for command key [%s] stopped", cmdKey) -} - -func (cr *cmdRetrier) track(cmdKey string, cmdCancelFn context.CancelFunc, retryCanceFn context.CancelFunc, cmdDone <-chan struct{}) { - cr.mu.Lock() - defer cr.mu.Unlock() - - // Initialize map if needed - if cr.cmds == nil { - cr.cmds = map[string]cmdRetryInfo{} - } - - cr.cmds[cmdKey] = cmdRetryInfo{ - retryCancelFn: retryCanceFn, - cmdCancelFn: cmdCancelFn, - cmdDone: cmdDone, - } -} - -func (cr *cmdRetrier) untrack(cmdKey string) { - cr.mu.Lock() - defer cr.mu.Unlock() - - delete(cr.cmds, cmdKey) -} - -// cmdKey returns a unique, deterministic integer for the combination of the given -// binaryPath, args, and env. This integer can be used to determine if the same command -// is being executed again or not. -func (cr *cmdRetrier) cmdKey(binaryPath string, args []string, env []string) string { - var sb strings.Builder - - sb.WriteString(binaryPath) - sb.WriteString("|") - - var posArgs, flagArgs []string - for i := 0; i < len(args); i++ { - arg := args[i] - if strings.HasPrefix(arg, "-") { - flagArgs = append(flagArgs, arg+" "+args[i+1]) - i++ - } else { - posArgs = append(posArgs, arg) - } - } - - sort.Strings(posArgs) - sort.Strings(flagArgs) - - for _, arg := range posArgs { - sb.WriteString(arg) - sb.WriteString("|") - } - - for _, arg := range flagArgs { - sb.WriteString(arg) - sb.WriteString("|") - } - - sort.Strings(env) - - for _, kv := range env { - sb.WriteString(kv) - sb.WriteString("|") - } - - digest := sha256.New() - digest.Write([]byte(sb.String())) - - return hex.EncodeToString(digest.Sum(nil)) + return executeCommand(ctx, log, binaryPath, spec.Args, envSpecToEnv(spec.Env), spec.Timeout) } func envSpecToEnv(envSpecs []component.CommandEnvSpec) []string { diff --git a/pkg/component/runtime/service_command_test.go b/pkg/component/runtime/service_command_test.go index 1f6dc17a628..07e419f2472 100644 --- a/pkg/component/runtime/service_command_test.go +++ b/pkg/component/runtime/service_command_test.go @@ -7,7 +7,6 @@ package runtime import ( "context" "errors" - "fmt" "os" "os/exec" "path/filepath" @@ -17,24 +16,17 @@ import ( "text/template" "time" - "go.uber.org/zap/zaptest/observer" - "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - "go.uber.org/zap/zapcore" - - "github.com/stretchr/testify/require" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent/pkg/component" "github.com/elastic/elastic-agent/pkg/core/logger" ) type progConfig struct { - ErrMessage string - ExitCode int - SleepMS int - SucceedAfter int64 // ms since unix epoch + ErrMessage string + ExitCode int + SleepMS int } const testProgramTemplate = ` @@ -47,13 +39,6 @@ import ( ) func main() { - if {{.SucceedAfter}} > 0 { - if time.Now().After(time.UnixMilli({{.SucceedAfter}})) { - fmt.Fprintln(os.Stderr, "testprog succeeded") - os.Exit(0) - } - } - if len("{{.ErrMessage}}") > 0 { fmt.Fprintf(os.Stderr, "{{.ErrMessage}}") } @@ -68,7 +53,7 @@ func main() { const testModFile = ` module prog -go 1.19 +go 1.18 ` func renderTestProg(cfg progConfig) string { @@ -126,15 +111,15 @@ func TestExecuteCommand(t *testing.T) { }, { name: "fail no error output", - cfg: progConfig{"", 1, 0, 0}, + cfg: progConfig{"", 1, 0}, }, { name: "fail with error output", - cfg: progConfig{"something failed", 2, 0, 0}, + cfg: progConfig{"something failed", 2, 0}, }, { name: "fail with timeout", - cfg: progConfig{"", 3, 5000, 0}, // executable runs for 5 seconds + cfg: progConfig{"", 3, 5000}, // executable runs for 5 seconds timeout: 100 * time.Millisecond, wantErr: context.DeadlineExceeded, }, @@ -196,285 +181,3 @@ func TestExecuteCommand(t *testing.T) { } } - -func TestExecuteServiceCommand(t *testing.T) { - // No spec - t.Run("no_spec", func(t *testing.T) { - ctx := context.Background() - log, obs := logger.NewTesting(t.Name()) - - exePath, err := prepareTestProg(ctx, log, t.TempDir(), progConfig{}) - require.NoError(t, err) - - err = executeServiceCommand(ctx, log, exePath, nil, false) - require.NoError(t, err) - - warnLogs := obs.FilterLevelExact(zapcore.WarnLevel) - require.Equal(t, 1, warnLogs.Len()) - require.Equal(t, fmt.Sprintf("spec is nil, nothing to execute, binaryPath: %s", exePath), warnLogs.TakeAll()[0].Message) - }) - - // Execution fails (without retries) - t.Run("no_retries_failed_execution", func(t *testing.T) { - ctx := context.Background() - log, obs := logger.NewTesting(t.Name()) - - exeConfig := progConfig{ - ErrMessage: "foo bar", - ExitCode: 111, - } - exePath, err := prepareTestProg(ctx, log, t.TempDir(), exeConfig) - require.NoError(t, err) - - err = executeServiceCommand(ctx, log, exePath, &component.ServiceOperationsCommandSpec{}, false) - require.EqualError(t, err, fmt.Sprintf("%s: exit status %d", exeConfig.ErrMessage, exeConfig.ExitCode)) - - require.Equal(t, 1, obs.Len()) - logs := obs.TakeAll() - require.Equal(t, zapcore.ErrorLevel, logs[0].Level) - require.Equal(t, exeConfig.ErrMessage, logs[0].Message) - }) - - // Execution succeeds (without retries) - t.Run("no_retries_successful_execution", func(t *testing.T) { - ctx := context.Background() - log, obs := logger.NewTesting(t.Name()) - - exePath, err := prepareTestProg(ctx, log, t.TempDir(), progConfig{}) - require.NoError(t, err) - - err = executeServiceCommand(ctx, log, exePath, &component.ServiceOperationsCommandSpec{}, false) - require.NoError(t, err) - - require.Equal(t, 0, obs.Len()) - }) - - // Execution succeeds on first attempt (with retries) - t.Run("with_retries_successful_execution", func(t *testing.T) { - ctx := context.Background() - log, obs := logger.NewTesting(t.Name()) - - exePath, err := prepareTestProg(ctx, log, t.TempDir(), progConfig{}) - require.NoError(t, err) - - err = executeServiceCommand(ctx, log, exePath, &component.ServiceOperationsCommandSpec{}, true) - require.NoError(t, err) - - // Remove debug-level logs as those are only being emitted from - // retrier internals. - obs = obs.Filter(func(entry observer.LoggedEntry) bool { - return entry.Level != zapcore.DebugLevel - }) - - require.Equal(t, 0, obs.Len()) - }) - - // Execution fails indefinitely and there is no retry configuration in spec - t.Run("failed_execution_no_retry_config", func(t *testing.T) { - cmdCtx := context.Background() - log, obs := logger.NewTesting(t.Name()) - - exeConfig := progConfig{ - ErrMessage: "foo bar", - ExitCode: 111, - } - exePath, err := prepareTestProg(cmdCtx, log, t.TempDir(), exeConfig) - require.NoError(t, err) - - // Since the service command is retried indefinitely, we need a way to - // stop the test within a reasonable amount of time - retryCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - - defaultRetryInitInterval := 50 * time.Millisecond - retryMaxInterval := 200 * time.Millisecond - - executeServiceCommandWithRetries( - cmdCtx, log, exePath, &component.ServiceOperationsCommandSpec{}, - retryCtx, defaultRetryInitInterval, retryMaxInterval, - ) - - <-retryCtx.Done() - checkRetryLogs(t, obs, exeConfig) - }) - - // Execution fails indefinitely but there is retry configuration in spec - t.Run("failed_execution_with_retry_config", func(t *testing.T) { - cmdCtx := context.Background() - log, obs := logger.NewTesting(t.Name()) - - exeConfig := progConfig{ - ErrMessage: "foo bar", - ExitCode: 111, - } - exePath, err := prepareTestProg(cmdCtx, log, t.TempDir(), exeConfig) - require.NoError(t, err) - - // Since the service command is retried indefinitely, we need a way to - // stop the test within a reasonable amount of time - retryCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - - defaultRetryInitInterval := 50 * time.Millisecond - retryMaxInterval := 1 * time.Second - - spec := &component.ServiceOperationsCommandSpec{ - // We deliberately set RetrySleepInitDuration to just shorter - // than the retryCtx timeout. With this we should observe: - // - the initial execution of the command (before any retries) - // - one message about the next (first) retry - // - one more execution of the command, as a result of the first retry - // - one message about the next (second) retry - Retry: component.RetryConfig{ - InitInterval: 700 * time.Millisecond, - }, - } - executeServiceCommandWithRetries( - cmdCtx, log, exePath, spec, - retryCtx, defaultRetryInitInterval, retryMaxInterval, - ) - - <-retryCtx.Done() - checkRetryLogs(t, obs, exeConfig) - }) - - // Execution fails initially but then succeeds after a few retries - t.Run("succeed_after_retry", func(t *testing.T) { - cmdCtx := context.Background() - log, obs := logger.NewTesting(t.Name()) - - const succeedCmdAfter = 3 * time.Second - now := time.Now() - exeConfig := progConfig{ - ErrMessage: "foo bar", - ExitCode: 111, - SucceedAfter: now.Add(succeedCmdAfter).UnixMilli(), - } - exePath, err := prepareTestProg(cmdCtx, log, t.TempDir(), exeConfig) - require.NoError(t, err) - - // Since the service command is retried indefinitely, we need a way to - // stop the test within a reasonable amount of time. However, we should never - // hit this timeout as the command should succeed before the timeout is reached. - retryCtx, cancel := context.WithTimeout(context.Background(), 6*time.Second) - defer cancel() - - defaultRetryInitInterval := 50 * time.Millisecond - retryMaxInterval := 1 * time.Second - - spec := &component.ServiceOperationsCommandSpec{ - Retry: component.RetryConfig{ - InitInterval: 200 * time.Millisecond, - }, - } - executeServiceCommandWithRetries( - cmdCtx, log, exePath, spec, - retryCtx, defaultRetryInitInterval, retryMaxInterval, - ) - - // Give the command time to succeed. - successMsgFilterFn := func(l observer.LoggedEntry) bool { - return strings.Contains(l.Message, "testprog succeeded") - } - require.Eventually(t, func() bool { - return obs.Filter(successMsgFilterFn).Len() == 1 - }, 5*time.Second, 1*time.Second) - - require.NoError(t, retryCtx.Err()) - - obs = obs.Filter(func(l observer.LoggedEntry) bool { - return !successMsgFilterFn(l) - }) - checkRetryLogs(t, obs, exeConfig) - }) - - // Ensure two calls to executeServiceCommandWithRetries cancels - // the previous command execution. - t.Run("previous_execution_cancels", func(t *testing.T) { - log, obs := logger.NewTesting(t.Name()) - - exeConfig := progConfig{ - ErrMessage: "foo bar", - ExitCode: 111, - } - exePath, err := prepareTestProg(context.Background(), log, t.TempDir(), exeConfig) - require.NoError(t, err) - - defaultRetryInitInterval := 50 * time.Millisecond - retryMaxInterval := 200 * time.Millisecond - - // First call - cmd1Ctx := context.Background() - retry1Ctx := context.Background() - - executeServiceCommandWithRetries( - cmd1Ctx, log, exePath, &component.ServiceOperationsCommandSpec{}, - retry1Ctx, defaultRetryInitInterval, retryMaxInterval, - ) - - debugLogs := obs.FilterLevelExact(zapcore.DebugLevel).TakeAll() - require.Len(t, debugLogs, 1) - require.Equal(t, - fmt.Sprintf( - "no retries for command key [%s] are pending; nothing to do", - serviceCmdRetrier.cmdKey(exePath, nil, nil), - ), - debugLogs[0].Message, - ) - - // Second call - cmd2Ctx := context.Background() - // Since the service command is retried indefinitely, we need a way to - // stop the test within a reasonable amount of time. However, we should never - // hit this timeout as the command should succeed before the timeout is reached. - retry2Ctx, cancel2 := context.WithTimeout(context.Background(), 4*time.Second) - defer cancel2() - - executeServiceCommandWithRetries( - cmd2Ctx, log, exePath, &component.ServiceOperationsCommandSpec{}, - retry2Ctx, defaultRetryInitInterval, retryMaxInterval, - ) - - debugLogs = obs.FilterLevelExact(zapcore.DebugLevel).TakeAll() - require.Len(t, debugLogs, 2) - require.Equal(t, - fmt.Sprintf( - "retries and command process for command key [%s] stopped", - serviceCmdRetrier.cmdKey(exePath, nil, nil), - ), - debugLogs[1].Message, - ) - - <-retry2Ctx.Done() - - checkRetryLogs(t, obs, exeConfig) - }) -} - -func checkRetryLogs(t *testing.T, obs *observer.ObservedLogs, exeConfig progConfig) { - t.Helper() - - // Remove debug-level logs as those are only being emitted from - // retrier internals. - obs = obs.Filter(func(entry observer.LoggedEntry) bool { - return entry.Level != zapcore.DebugLevel - }) - - // We expect there to be at least 2 log entries as this would indicate that - // at least one round of retries was attempted. - logs := obs.TakeAll() - require.GreaterOrEqual(t, len(logs), 2) - - for i, l := range logs { - if i%2 == 0 { - require.Equal(t, zapcore.ErrorLevel, l.Level) - require.Equal(t, exeConfig.ErrMessage, l.Message) - } else { - require.Equal(t, zapcore.WarnLevel, l.Level) - require.Contains(t, l.Message, fmt.Sprintf( - "service command execution failed with error [%s: exit status %d], retrying (will be retry [%d]) after", - exeConfig.ErrMessage, exeConfig.ExitCode, (i/2)+1, - )) - } - } -} diff --git a/pkg/component/spec.go b/pkg/component/spec.go index daeddf7beb5..dda00f4c223 100644 --- a/pkg/component/spec.go +++ b/pkg/component/spec.go @@ -154,9 +154,4 @@ type ServiceOperationsCommandSpec struct { Args []string `config:"args,omitempty" yaml:"args,omitempty"` Env []CommandEnvSpec `config:"env,omitempty" yaml:"env,omitempty"` Timeout time.Duration `config:"timeout,omitempty" yaml:"timeout,omitempty"` - Retry RetryConfig -} - -type RetryConfig struct { - InitInterval time.Duration `config:"init_interval,omitempty" yaml:"init_interval,omitempty"` }