Skip to content

Commit

Permalink
[Upgrade Watcher] Try restarting Agent in multiple ways during rollba…
Browse files Browse the repository at this point in the history
…ck (#3268) (#3311)

* Try restarting via the service as a backup

* [Testing] Crashing agent

* Adding CHANGELOG fragment

* [Testing] Bump up version

* Adding install.RestartService

* Removing testing code

* Use service manager's restart

* Updating comment

(cherry picked from commit 9377dca)

Co-authored-by: Shaunak Kashyap <[email protected]>
  • Loading branch information
mergify[bot] and ycombinator authored Aug 29, 2023
1 parent 0ecc897 commit 368242f
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 4 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; an 80ish-character description of the change.
summary: Ensure that Elastic Agent is restarted during rollback

# Long description; in case the summary is not enough to describe the change,
# this field accommodates a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present, it is automatically filled in by the tooling, which finds the PR where this changelog fragment was added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/3268

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present, it is automatically filled in by the tooling with the issue linked to the PR number.
#issue: https://github.com/owner/repo/1234
27 changes: 23 additions & 4 deletions internal/pkg/agent/application/upgrade/rollback.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ func InvokeWatcher(log *logger.Logger) error {
}

func restartAgent(ctx context.Context, log *logger.Logger) error {
restartFn := func(ctx context.Context) error {
restartViaDaemonFn := func(ctx context.Context) error {
c := client.New()
err := c.Connect(ctx)
if err != nil {
Expand All @@ -143,24 +143,43 @@ func restartAgent(ctx context.Context, log *logger.Logger) error {
return nil
}

restartViaServiceFn := func(ctx context.Context) error {
topPath := paths.Top()
err := install.RestartService(topPath)
if err != nil {
return fmt.Errorf("failed to restart agent via service: %w", err)
}

return nil
}

signal := make(chan struct{})
backExp := backoff.NewExpBackoff(signal, restartBackoffInit, restartBackoffMax)

for restartAttempt := 1; restartAttempt <= maxRestartCount; restartAttempt++ {
backExp.Wait()
log.Infof("Restarting Agent via control protocol; attempt %d of %d", restartAttempt, maxRestartCount)
// First, try to restart Agent by sending a restart command
// to its daemon (via GRPC).
err := restartViaDaemonFn(ctx)
if err == nil {
break
}
log.Warnf("Failed to restart agent via control protocol: %s", err.Error())

err := restartFn(ctx)
// Next, try to restart Agent via the service.
log.Infof("Restarting Agent via service; attempt %d of %d", restartAttempt, maxRestartCount)
err = restartViaServiceFn(ctx)
if err == nil {
break
}

if restartAttempt == maxRestartCount {
log.Error("Failed to restart agent via control protocol after final attempt")
log.Error("Failed to restart agent after final attempt")
return err
}

log.Warnf("Failed to restart agent via control protocol: %s; will try again in %v", err.Error(), backExp.NextWait())
log.Warnf("Failed to restart agent via service: %s; will try again in %v", err.Error(), backExp.NextWait())
}

close(signal)
Expand Down
16 changes: 16 additions & 0 deletions internal/pkg/agent/install/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,22 @@ func StopService(topPath string) error {
return nil
}

// RestartService restarts the installed service.
//
// It builds a service handle for the installation at topPath and calls
// Restart on it. An error from building the handle is returned unchanged;
// an error from Restart is wrapped with a message and metadata that carry
// paths.ServiceName.
func RestartService(topPath string) error {
	agentSvc, buildErr := newService(topPath)
	if buildErr != nil {
		return buildErr
	}

	if restartErr := agentSvc.Restart(); restartErr != nil {
		msg := fmt.Sprintf("failed to restart service (%s)", paths.ServiceName)
		return errors.New(restartErr, msg, errors.M("service", paths.ServiceName))
	}

	return nil
}

// FixPermissions fixes the permissions on the installed system.
func FixPermissions(topPath string) error {
return fixPermissions(topPath)
Expand Down

0 comments on commit 368242f

Please sign in to comment.