diff --git a/changelog/fragments/1692323065-bugfix-rollback-restart-agent-no-process.yaml b/changelog/fragments/1692323065-bugfix-rollback-restart-agent-no-process.yaml new file mode 100644 index 00000000000..226e015db0f --- /dev/null +++ b/changelog/fragments/1692323065-bugfix-rollback-restart-agent-no-process.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; an 80ish characters long description of the change. +summary: Ensure that Elastic Agent is restarted during rollback + +# Long description; in case the summary is not enough to describe the change +# this field accommodates a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present, it is automatically filled by the tooling, which finds the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. 
+pr: https://github.com/elastic/elastic-agent/pull/3268 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/agent/application/upgrade/rollback.go b/internal/pkg/agent/application/upgrade/rollback.go index 9e2f54171cc..ac835cd67e2 100644 --- a/internal/pkg/agent/application/upgrade/rollback.go +++ b/internal/pkg/agent/application/upgrade/rollback.go @@ -127,7 +127,7 @@ func InvokeWatcher(log *logger.Logger) error { } func restartAgent(ctx context.Context, log *logger.Logger) error { - restartFn := func(ctx context.Context) error { + restartViaDaemonFn := func(ctx context.Context) error { c := client.New() err := c.Connect(ctx) if err != nil { @@ -143,24 +143,43 @@ func restartAgent(ctx context.Context, log *logger.Logger) error { return nil } + restartViaServiceFn := func(ctx context.Context) error { + topPath := paths.Top() + err := install.RestartService(topPath) + if err != nil { + return fmt.Errorf("failed to restart agent via service: %w", err) + } + + return nil + } + signal := make(chan struct{}) backExp := backoff.NewExpBackoff(signal, restartBackoffInit, restartBackoffMax) for restartAttempt := 1; restartAttempt <= maxRestartCount; restartAttempt++ { backExp.Wait() log.Infof("Restarting Agent via control protocol; attempt %d of %d", restartAttempt, maxRestartCount) + // First, try to restart Agent by sending a restart command + // to its daemon (via GRPC). + err := restartViaDaemonFn(ctx) + if err == nil { + break + } + log.Warnf("Failed to restart agent via control protocol: %s", err.Error()) - err := restartFn(ctx) + // Next, try to restart Agent via the service. 
+ log.Infof("Restarting Agent via service; attempt %d of %d", restartAttempt, maxRestartCount) + err = restartViaServiceFn(ctx) if err == nil { break } if restartAttempt == maxRestartCount { - log.Error("Failed to restart agent via control protocol after final attempt") + log.Error("Failed to restart agent after final attempt") return err } - log.Warnf("Failed to restart agent via control protocol: %s; will try again in %v", err.Error(), backExp.NextWait()) + log.Warnf("Failed to restart agent via service: %s; will try again in %v", err.Error(), backExp.NextWait()) } close(signal) diff --git a/internal/pkg/agent/install/install.go b/internal/pkg/agent/install/install.go index c21e9e97ffa..b91c931afd6 100644 --- a/internal/pkg/agent/install/install.go +++ b/internal/pkg/agent/install/install.go @@ -172,6 +172,22 @@ func StopService(topPath string) error { return nil } +// RestartService restarts the installed service. +func RestartService(topPath string) error { + svc, err := newService(topPath) + if err != nil { + return err + } + err = svc.Restart() + if err != nil { + return errors.New( + err, + fmt.Sprintf("failed to restart service (%s)", paths.ServiceName), + errors.M("service", paths.ServiceName)) + } + return nil +} + // FixPermissions fixes the permissions on the installed system. func FixPermissions(topPath string) error { return fixPermissions(topPath)