Skip to content

Commit

Permalink
[Upgrade Watcher] Add logging to rollback restart step (#3245)
Browse files Browse the repository at this point in the history
* Add logging to rollback restart

* Better logging

* [Testing] Crashing agent

* Adding CHANGELOG entry

* Flush buffered log entries before exiting

* Adding nolint directives

* Sync'ing base logger

* Remove FIXMEs introduced for testing

* Inverting logic

(cherry picked from commit 1c8e56b)
  • Loading branch information
ycombinator authored and mergify[bot] committed Aug 28, 2023
1 parent 8baa62e commit 5298307
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 4 deletions.
32 changes: 32 additions & 0 deletions changelog/fragments/1692141038-upgrade-watcher-logging.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: enhancement

# Change summary; an 80ish-character-long description of the change.
summary: Adds more logging to the rollback process

# Long description; in case the summary is not enough to describe the change
# this field accommodates a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present, it is automatically filled by the tooling, which finds the PR where this changelog fragment was added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/3245

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present, it is automatically filled by the tooling with the issue linked to the PR number.
#issue: https://github.com/owner/repo/1234
13 changes: 9 additions & 4 deletions internal/pkg/agent/application/upgrade/rollback.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ func Rollback(ctx context.Context, log *logger.Logger, prevHash string, currentH

// Restart
log.Info("Restarting the agent after rollback")
if err := restartAgent(ctx); err != nil {
if err := restartAgent(ctx, log); err != nil {
return err
}

Expand Down Expand Up @@ -127,7 +127,7 @@ func InvokeWatcher(log *logger.Logger) error {
return cmd.Start()
}

func restartAgent(ctx context.Context) error {
func restartAgent(ctx context.Context, log *logger.Logger) error {
restartFn := func(ctx context.Context) error {
c := client.New()
err := c.Connect(ctx)
Expand All @@ -147,16 +147,21 @@ func restartAgent(ctx context.Context) error {
signal := make(chan struct{})
backExp := backoff.NewExpBackoff(signal, restartBackoffInit, restartBackoffMax)

for i := maxRestartCount; i >= 1; i-- {
for restartAttempt := 1; restartAttempt <= maxRestartCount; restartAttempt++ {
backExp.Wait()
log.Infof("Restarting Agent via control protocol; attempt %d of %d", restartAttempt, maxRestartCount)

err := restartFn(ctx)
if err == nil {
break
}

if i == 1 {
if restartAttempt == maxRestartCount {
log.Error("Failed to restart agent via control protocol after final attempt")
return err
}

log.Warnf("Failed to restart agent via control protocol: %s; will try again in %v", err.Error(), backExp.NextWait())
}

close(signal)
Expand Down
3 changes: 3 additions & 0 deletions internal/pkg/agent/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ func run(override cfgOverrider, testingMode bool, fleetInitTimeout time.Duration
return err
}

// Make sure to flush any buffered logs before we're done.
defer baseLogger.Sync() //nolint:errcheck // flushing buffered logs is best effort.

l := baseLogger.With("log", map[string]interface{}{
"source": agentName,
})
Expand Down
3 changes: 3 additions & 0 deletions internal/pkg/agent/cmd/watch.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ func newWatchCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command
os.Exit(3)
}

// Make sure to flush any buffered logs before we're done.
defer log.Sync() //nolint:errcheck // flushing buffered logs is best effort.

if err := watchCmd(log, cfg); err != nil {
log.Errorw("Watch command failed", "error.message", err)
fmt.Fprintf(streams.Err, "Watch command failed: %v\n%s\n", err, troubleshootMessage())
Expand Down

0 comments on commit 5298307

Please sign in to comment.