Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[8.10](backport #3382) Improve Upgrade-related logging #3412

Merged
merged 2 commits into from
Sep 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions changelog/fragments/1694190260-improve-upgrade-logging.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: enhancement

# Change summary; a 80ish characters long description of the change.
summary: Improve logging during Agent upgrades

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/3382

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
#issue: https://github.com/owner/repo/1234
2 changes: 1 addition & 1 deletion internal/pkg/agent/application/upgrade/cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import (
// cleanNonMatchingVersionsFromDownloads will remove files that do not have the passed version number from the downloads directory.
func cleanNonMatchingVersionsFromDownloads(log *logger.Logger, version string) error {
downloadsPath := paths.Downloads()
log.Debugw("Cleaning up non-matching downloaded versions", "version", version, "downloads.path", downloadsPath)
log.Infow("Cleaning up non-matching downloaded versions", "version", version, "downloads.path", downloadsPath)

files, err := os.ReadDir(downloadsPath)
if os.IsNotExist(err) {
Expand Down
8 changes: 3 additions & 5 deletions internal/pkg/agent/application/upgrade/crash_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ package upgrade
import (
"context"
"fmt"
"os"
"sync"
"time"

Expand Down Expand Up @@ -53,7 +52,7 @@ func NewCrashChecker(ctx context.Context, ch chan error, log *logger.Logger, che
return nil, err
}

log.Debugf("running checks using '%s' controller", c.sc.Name())
log.Infof("running checks using '%s' controller", c.sc.Name())

return c, nil
}
Expand All @@ -62,9 +61,8 @@ func NewCrashChecker(ctx context.Context, ch chan error, log *logger.Logger, che
func (ch *CrashChecker) Run(ctx context.Context) {
defer ch.sc.Close()

ch.log.Debug("Crash checker started")
ch.log.Info("Crash checker started")
for {
ch.log.Debugf("watcher having PID: %d", os.Getpid())
t := time.NewTimer(ch.checkInterval)

select {
Expand All @@ -79,7 +77,7 @@ func (ch *CrashChecker) Run(ctx context.Context) {

ch.q.Push(pid)
restarts := ch.q.Distinct()
ch.log.Debugf("retrieved service PID [%d] changed %d times within %d", pid, restarts, evaluatedPeriods)
ch.log.Infof("retrieved service PID [%d] changed %d times within %d", pid, restarts, evaluatedPeriods)
if restarts > crashesAllowed {
ch.notifyChan <- errors.New(fmt.Sprintf("service restarted '%d' times within '%v' seconds", restarts, ch.checkInterval.Seconds()))
}
Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/agent/application/upgrade/error_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func NewErrorChecker(ch chan error, log *logger.Logger, checkInterval time.Durat

// Run runs the checking loop.
func (ch *ErrorChecker) Run(ctx context.Context) {
ch.log.Debug("Error checker started")
ch.log.Info("Error checker started")
for {
t := time.NewTimer(ch.checkInterval)
select {
Expand Down
22 changes: 15 additions & 7 deletions internal/pkg/agent/application/upgrade/rollback.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ func Rollback(ctx context.Context, log *logger.Logger, prevHash string, currentH

// Cleanup removes all artifacts and files related to a specified version.
func Cleanup(log *logger.Logger, currentHash string, removeMarker bool, keepLogs bool) error {
log.Debugw("Cleaning up upgrade", "hash", currentHash, "remove_marker", removeMarker)
log.Infow("Cleaning up upgrade", "hash", currentHash, "remove_marker", removeMarker)
<-time.After(afterRestartDelay)

// remove upgrade marker
Expand All @@ -78,7 +78,7 @@ func Cleanup(log *logger.Logger, currentHash string, removeMarker bool, keepLogs

// remove symlink to avoid upgrade failures, ignore error
prevSymlink := prevSymlinkPath()
log.Debugw("Removing previous symlink path", "file.path", prevSymlinkPath())
log.Infow("Removing previous symlink path", "file.path", prevSymlinkPath())
_ = os.Remove(prevSymlink)

dirPrefix := fmt.Sprintf("%s-", agentName)
Expand All @@ -93,7 +93,7 @@ func Cleanup(log *logger.Logger, currentHash string, removeMarker bool, keepLogs
}

hashedDir := filepath.Join(paths.Data(), dir)
log.Debugw("Removing hashed data directory", "file.path", hashedDir)
log.Infow("Removing hashed data directory", "file.path", hashedDir)
var ignoredDirs []string
if keepLogs {
ignoredDirs = append(ignoredDirs, "logs")
Expand All @@ -110,20 +110,28 @@ func Cleanup(log *logger.Logger, currentHash string, removeMarker bool, keepLogs
// agent during upgrade period.
func InvokeWatcher(log *logger.Logger) error {
if !IsUpgradeable() {
log.Debug("agent is not upgradable, not starting watcher")
log.Info("agent is not upgradable, not starting watcher")
return nil
}

cmd := invokeCmd()
defer func() {
if cmd.Process != nil {
log.Debugf("releasing watcher %v", cmd.Process.Pid)
log.Infof("releasing watcher %v", cmd.Process.Pid)
_ = cmd.Process.Release()
}
}()

log.Debugw("Starting upgrade watcher", "path", cmd.Path, "args", cmd.Args, "env", cmd.Env, "dir", cmd.Dir)
return cmd.Start()
log.Infow("Starting upgrade watcher", "path", cmd.Path, "args", cmd.Args, "env", cmd.Env, "dir", cmd.Dir)
if err := cmd.Start(); err != nil {
return fmt.Errorf("failed to start Upgrade Watcher: %w", err)
}

upgradeWatcherPID := cmd.Process.Pid
agentPID := os.Getpid()
log.Infow("Upgrade Watcher invoked", "agent.upgrade.watcher.process.pid", upgradeWatcherPID, "agent.process.pid", agentPID)

return nil
}

func restartAgent(ctx context.Context, log *logger.Logger) error {
Expand Down
4 changes: 2 additions & 2 deletions internal/pkg/agent/application/upgrade/step_download.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func (u *Upgrader) downloadArtifact(ctx context.Context, version, sourceURI stri
}
}

u.log.Debugw("Downloading upgrade artifact", "version", version,
u.log.Infow("Downloading upgrade artifact", "version", version,
"source_uri", settings.SourceURI, "drop_path", settings.DropPath,
"target_path", settings.TargetDirectory, "install_path", settings.InstallPath)

Expand Down Expand Up @@ -162,7 +162,7 @@ func (u *Upgrader) downloadWithRetries(

opFn := func() error {
attempt++
u.log.Debugf("download attempt %d", attempt)
u.log.Infof("download attempt %d", attempt)

downloader, err := downloaderCtor(version, u.log, settings)
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/agent/application/upgrade/step_mark.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ func UpdateActiveCommit(log *logger.Logger, hash string) error {
// CleanMarker removes a marker from disk.
func CleanMarker(log *logger.Logger) error {
markerFile := markerFilePath()
log.Debugw("Removing marker file", "file.path", markerFile)
log.Infow("Removing marker file", "file.path", markerFile)
if err := os.Remove(markerFile); !os.IsNotExist(err) {
return err
}
Expand Down
12 changes: 6 additions & 6 deletions internal/pkg/agent/application/upgrade/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ import (
"path/filepath"
"strings"

"github.com/elastic/elastic-agent/internal/pkg/config"

"github.com/otiai10/copy"
"go.elastic.co/apm"

"github.com/elastic/elastic-agent/internal/pkg/config"

"github.com/elastic/elastic-agent/internal/pkg/agent/application/info"
"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
"github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec"
Expand Down Expand Up @@ -172,7 +172,7 @@ func (u *Upgrader) Upgrade(ctx context.Context, version string, sourceURI string
cb := shutdownCallback(u.log, paths.Home(), release.Version(), version, release.TrimCommit(newHash))

// Clean everything from the downloads dir
u.log.Debugw("Removing downloads directory", "file.path", paths.Downloads())
u.log.Infow("Removing downloads directory", "file.path", paths.Downloads())
err = os.RemoveAll(paths.Downloads())
if err != nil {
u.log.Errorw("Unable to clean downloads after update", "error.message", err, "file.path", paths.Downloads())
Expand Down Expand Up @@ -231,11 +231,11 @@ func copyActionStore(log *logger.Logger, newHash string) error {
// copies legacy action_store.yml, state.yml and state.enc encrypted file if exists
storePaths := []string{paths.AgentActionStoreFile(), paths.AgentStateStoreYmlFile(), paths.AgentStateStoreFile()}
newHome := filepath.Join(filepath.Dir(paths.Home()), fmt.Sprintf("%s-%s", agentName, newHash))
log.Debugw("Copying action store", "new_home_path", newHome)
log.Infow("Copying action store", "new_home_path", newHome)

for _, currentActionStorePath := range storePaths {
newActionStorePath := filepath.Join(newHome, filepath.Base(currentActionStorePath))
log.Debugw("Copying action store path", "from", currentActionStorePath, "to", newActionStorePath)
log.Infow("Copying action store path", "from", currentActionStorePath, "to", newActionStorePath)
currentActionStore, err := os.ReadFile(currentActionStorePath)
if os.IsNotExist(err) {
// nothing to copy
Expand Down Expand Up @@ -266,7 +266,7 @@ func copyRunDirectory(log *logger.Logger, newHash string) error {
err := copyDir(log, oldRunPath, newRunPath, true)
if os.IsNotExist(err) {
// nothing to copy, operation ok
log.Debugw("Run directory not present", "old_run_path", oldRunPath)
log.Infow("Run directory not present", "old_run_path", oldRunPath)
return nil
}
if err != nil {
Expand Down
2 changes: 2 additions & 0 deletions internal/pkg/agent/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ func run(override cfgOverrider, testingMode bool, fleetInitTimeout time.Duration
"source": agentName,
})

l.Infow("Elastic Agent started", "process.pid", os.Getpid(), "agent.version", version.GetAgentPackageVersion())

cfg, err = tryDelayEnroll(ctx, l, cfg, override)
if err != nil {
err = errors.New(err, "failed to perform delayed enrollment")
Expand Down
9 changes: 6 additions & 3 deletions internal/pkg/agent/cmd/watch.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import (
"syscall"
"time"

"github.com/elastic/elastic-agent/version"

"github.com/elastic/elastic-agent/internal/pkg/config"

"github.com/spf13/cobra"
Expand Down Expand Up @@ -62,21 +64,22 @@ func newWatchCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command
}

func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error {
log.Infow("Upgrade Watcher started", "process.pid", os.Getpid(), "agent.version", version.GetAgentPackageVersion())
marker, err := upgrade.LoadMarker()
if err != nil {
log.Error("failed to load marker", err)
return err
}
if marker == nil {
// no marker found we're not in upgrade process
log.Debugf("update marker not present at '%s'", paths.Data())
log.Infof("update marker not present at '%s'", paths.Data())
return nil
}

locker := filelock.NewAppLocker(paths.Top(), watcherLockFile)
if err := locker.TryLock(); err != nil {
if errors.Is(err, filelock.ErrAppAlreadyRunning) {
log.Debugf("exiting, lock already exists")
log.Info("exiting, lock already exists")
return nil
}

Expand All @@ -89,7 +92,7 @@ func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error {

isWithinGrace, tilGrace := gracePeriod(marker, cfg.Settings.Upgrade.Watcher.GracePeriod)
if !isWithinGrace {
log.Debugf("not within grace [updatedOn %v] %v", marker.UpdatedOn.String(), time.Since(marker.UpdatedOn).String())
log.Infof("not within grace [updatedOn %v] %v", marker.UpdatedOn.String(), time.Since(marker.UpdatedOn).String())
// if it is started outside of upgrade loop
// if we're not within grace and marker is still there it might mean
// that cleanup was not performed ok, cleanup everything except current version
Expand Down
Loading