diff --git a/changelog/fragments/1692141038-upgrade-watcher-logging.yaml b/changelog/fragments/1692141038-upgrade-watcher-logging.yaml new file mode 100644 index 00000000000..3212098077c --- /dev/null +++ b/changelog/fragments/1692141038-upgrade-watcher-logging.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: enhancement + +# Change summary; an 80ish-character description of the change. +summary: Adds more logging to the rollback process + +# Long description; in case the summary is not enough to describe the change +# this field accommodates a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3245 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). 
+# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/agent/application/upgrade/rollback.go b/internal/pkg/agent/application/upgrade/rollback.go index f2ba0dee309..84d15e37a81 100644 --- a/internal/pkg/agent/application/upgrade/rollback.go +++ b/internal/pkg/agent/application/upgrade/rollback.go @@ -45,7 +45,7 @@ func Rollback(ctx context.Context, log *logger.Logger, prevHash string, currentH // Restart log.Info("Restarting the agent after rollback") - if err := restartAgent(ctx); err != nil { + if err := restartAgent(ctx, log); err != nil { return err } @@ -127,7 +127,7 @@ func InvokeWatcher(log *logger.Logger) error { return cmd.Start() } -func restartAgent(ctx context.Context) error { +func restartAgent(ctx context.Context, log *logger.Logger) error { restartFn := func(ctx context.Context) error { c := client.New() err := c.Connect(ctx) @@ -147,16 +147,21 @@ func restartAgent(ctx context.Context) error { signal := make(chan struct{}) backExp := backoff.NewExpBackoff(signal, restartBackoffInit, restartBackoffMax) - for i := maxRestartCount; i >= 1; i-- { + for restartAttempt := 1; restartAttempt <= maxRestartCount; restartAttempt++ { backExp.Wait() + log.Infof("Restarting Agent via control protocol; attempt %d of %d", restartAttempt, maxRestartCount) + err := restartFn(ctx) if err == nil { break } - if i == 1 { + if restartAttempt == maxRestartCount { + log.Error("Failed to restart agent via control protocol after final attempt") return err } + + log.Warnf("Failed to restart agent via control protocol: %s; will try again in %v", err.Error(), backExp.NextWait()) } close(signal) diff --git a/internal/pkg/agent/cmd/run.go b/internal/pkg/agent/cmd/run.go index 01b24a48b02..bc5d6a0be9b 100644 --- a/internal/pkg/agent/cmd/run.go +++ b/internal/pkg/agent/cmd/run.go @@ -144,6 +144,9 @@ func run(override cfgOverrider, testingMode bool, fleetInitTimeout 
time.Duration return err } + // Make sure to flush any buffered logs before we're done. + defer baseLogger.Sync() //nolint:errcheck // flushing buffered logs is best effort. + l := baseLogger.With("log", map[string]interface{}{ "source": agentName, }) diff --git a/internal/pkg/agent/cmd/watch.go b/internal/pkg/agent/cmd/watch.go index b3c29509471..83e31d38366 100644 --- a/internal/pkg/agent/cmd/watch.go +++ b/internal/pkg/agent/cmd/watch.go @@ -47,6 +47,9 @@ func newWatchCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command os.Exit(3) } + // Make sure to flush any buffered logs before we're done. + defer log.Sync() //nolint:errcheck // flushing buffered logs is best effort. + if err := watchCmd(log, cfg); err != nil { log.Errorw("Watch command failed", "error.message", err) fmt.Fprintf(streams.Err, "Watch command failed: %v\n%s\n", err, troubleshootMessage())