Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

install fails if enroll fails and surface errors #3207

Merged
merged 1 commit into from
Oct 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; a 80ish characters long description of the change.
summary: Surface errors during Agent's enroll process, failing if any happens.

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; a word indicating the component this changeset affects.
component: install/enroll

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/3207

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
#issue: https://github.com/owner/repo/1234
2 changes: 1 addition & 1 deletion internal/pkg/agent/cmd/enroll.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
// Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted
// This is because we are fixing permissions twice, once during installation and again during the enrollment step.
// When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions.
var fixPermissions bool = fromInstall
fixPermissions := fromInstall
if runtime.GOOS == "darwin" {
fixPermissions = false
}
Expand Down
62 changes: 45 additions & 17 deletions internal/pkg/agent/cmd/enroll_cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ func newEnrollCmd(
)
}

// newEnrollCmdWithStore creates an new enrollment and accept a custom store.
// newEnrollCmdWithStore creates a new enrollment and accept a custom store.
func newEnrollCmdWithStore(
log *logger.Logger,
options *enrollCmdOption,
Expand All @@ -187,10 +187,11 @@ func newEnrollCmdWithStore(
}, nil
}

// Execute tries to enroll the agent into Fleet.
// Execute enrolls the agent into Fleet.
func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
var err error
defer c.stopAgent() // ensure its stopped no matter what

span, ctx := apm.StartSpan(ctx, "enroll", "app.internal")
defer func() {
apm.CaptureError(ctx, err).Send()
Expand Down Expand Up @@ -235,7 +236,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
// Ensure that the agent does not use a proxy configuration
// when connecting to the local fleet server.
// Note that when running fleet-server the enroll request will be sent to :8220,
// however when the agent is running afterwards requests will be sent to :8221
// however when the agent is running afterward requests will be sent to :8221
c.remoteConfig.Transport.Proxy.Disable = true
}

Expand All @@ -256,7 +257,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {

err = c.enrollWithBackoff(ctx, persistentConfig)
if err != nil {
return errors.New(err, "fail to enroll")
return fmt.Errorf("fail to enroll: %w", err)
}

if c.options.FixPermissions {
Expand All @@ -267,17 +268,23 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
}

defer func() {
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
if err != nil {
fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err)
} else {
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
}
}()

if c.agentProc == nil {
if err := c.daemonReload(ctx); err != nil {
c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err)
} else {
c.log.Info("Successfully triggered restart on running Elastic Agent.")
if err = c.daemonReloadWithBackoff(ctx); err != nil {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the 1 minute retry with a 10 minute wait might be too slow for attempts to restart the agent on the same host. We can likely retry every second to start with a maximum wait of up to 2 minutes. If we are still retrying after 2 minutes changes are we aren't going to get the agent to restart.

We seem to need this logic in integration tests to get them to pass reliably and using a 1 minute minimum interval is going to make every test that hits this case take an additional minute to run.

The same is probably true for the Fleet retries but that one is tied to system scalability during mass enrollments so I am more hesitant to change it.

c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err)
return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err)
Comment on lines +280 to +281
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure you need the log and the error, at least I hope the error you return gets logged somewhere obvious.

}

c.log.Info("Successfully triggered restart on running Elastic Agent.")
return nil
}

c.log.Info("Elastic Agent has been enrolled; start Elastic Agent")
return nil
}
Expand Down Expand Up @@ -443,24 +450,32 @@ func (c *enrollCmd) prepareFleetTLS() error {

func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error {
err := c.daemonReload(ctx)
if err != nil &&
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to special case this? Why can't we just enter the backoff loop below and exit it once it is finished?

(errors.Is(err, context.DeadlineExceeded) ||
errors.Is(err, context.Canceled)) {
return fmt.Errorf("could not reload deamon: %w", err)
}
if err == nil {
return nil
}

signal := make(chan struct{})
backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute)

for i := 5; i >= 0; i-- {
var i int
for ; i < 5; i++ {
backExp.Wait()
c.log.Info("Retrying to restart...")
err = c.daemonReload(ctx)
if err == nil {
if err == nil ||
errors.Is(err, context.DeadlineExceeded) ||
errors.Is(err, context.Canceled) {
Comment on lines +470 to +472
Copy link
Member

@cmacknz cmacknz Oct 5, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could replace this with https://pkg.go.dev/github.com/cenkalti/backoff/v4 which has a WithContext option to natively understands contexts. We already use this package for upgrades. Example:

expBo := backoff.NewExponentialBackOff()
expBo.InitialInterval = settings.RetrySleepInitDuration
boCtx := backoff.WithContext(expBo, cancelCtx)
var path string
var attempt uint
opFn := func() error {
attempt++
u.log.Infof("download attempt %d", attempt)
downloader, err := downloaderCtor(version, u.log, settings)
if err != nil {
return fmt.Errorf("unable to create fetcher: %w", err)
}
// All download artifacts expect a name that includes <major>.<minor.<patch>[-SNAPSHOT] so we have to
// make sure not to include build metadata we might have in the parsed version (for snapshots we already
// used that to configure the URL we download the files from)
path, err = downloader.Download(cancelCtx, agentArtifact, version.VersionWithPrerelease())
if err != nil {
return fmt.Errorf("unable to download package: %w", err)
}
// Download successful
return nil
}
opFailureNotificationFn := func(err error, retryAfter time.Duration) {
u.log.Warnf("%s; retrying (will be retry %d) in %s.", err.Error(), attempt, retryAfter)
}

break
}
}

close(signal)
return err
return fmt.Errorf("could not reload deamon after %d retries: %w", i+1, err)
}

func (c *enrollCmd) daemonReload(ctx context.Context) error {
Expand All @@ -478,8 +493,20 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[

c.log.Infof("Starting enrollment to URL: %s", c.client.URI())
err := c.enroll(ctx, persistentConfig)
if err == nil {
return nil
}

const deadline = 10 * time.Minute
const frequency = 60 * time.Second
cmacknz marked this conversation as resolved.
Show resolved Hide resolved

c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s",
deadline,
frequency,
c.client.URI())
signal := make(chan struct{})
backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute)
defer close(signal)
backExp := backoff.NewExpBackoff(signal, frequency, deadline)

for {
retry := false
Expand All @@ -498,7 +525,6 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[
err = c.enroll(ctx, persistentConfig)
}

close(signal)
return err
}

Expand Down Expand Up @@ -547,8 +573,10 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte
c.options.FleetServer.ElasticsearchInsecure,
)
if err != nil {
return err
return fmt.Errorf(
"failed creating fleet-server bootstrap config: %w", err)
}

// no longer need bootstrap at this point
serverConfig.Server.Bootstrap = false
fleetConfig.Server = serverConfig.Server
Expand All @@ -568,11 +596,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte

reader, err := yamlToReader(configToStore)
if err != nil {
return err
return fmt.Errorf("yamlToReader failed: %w", err)
}

if err := safelyStoreAgentInfo(c.configStore, reader); err != nil {
return err
return fmt.Errorf("failed to store agent config: %w", err)
}

// clear action store
Expand Down
71 changes: 50 additions & 21 deletions internal/pkg/agent/cmd/enroll_cmd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@ import (
"os"
"runtime"
"strconv"
"strings"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
Expand Down Expand Up @@ -159,14 +162,24 @@ func TestEnroll(t *testing.T) {
require.NoError(t, err)

streams, _, _, _ := cli.NewTestingIOStreams()
err = cmd.Execute(context.Background(), streams)
require.NoError(t, err)
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()
err = cmd.Execute(ctx, streams)

if err != nil &&
// There is no agent running, therefore nothing to be restarted.
// However, this will cause the Enroll command to return an error
// which we'll ignore here.
!strings.Contains(err.Error(),
"could not reload agent daemon, unable to trigger restart") {
t.Fatalf("enrrol coms returned and unexpected error: %v", err)
}

config, err := readConfig(store.Content)

require.NoError(t, err)
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
require.Equal(t, host, config.Client.Host)

assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
assert.Equal(t, host, config.Client.Host)
},
))

Expand Down Expand Up @@ -216,16 +229,24 @@ func TestEnroll(t *testing.T) {
require.NoError(t, err)

streams, _, _, _ := cli.NewTestingIOStreams()
err = cmd.Execute(context.Background(), streams)
require.NoError(t, err)

require.True(t, store.Called)

ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()
err = cmd.Execute(ctx, streams)
if err != nil &&
// There is no agent running, therefore nothing to be restarted.
// However, this will cause the Enroll command to return an error
// which we'll ignore here.
!strings.Contains(err.Error(),
"could not reload agent daemon, unable to trigger restart") {
t.Fatalf("enrrol coms returned and unexpected error: %v", err)
}

assert.True(t, store.Called)
config, err := readConfig(store.Content)

require.NoError(t, err)
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
require.Equal(t, host, config.Client.Host)
assert.NoError(t, err)
assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
assert.Equal(t, host, config.Client.Host)
},
))

Expand Down Expand Up @@ -275,16 +296,24 @@ func TestEnroll(t *testing.T) {
require.NoError(t, err)

streams, _, _, _ := cli.NewTestingIOStreams()
err = cmd.Execute(context.Background(), streams)
require.NoError(t, err)

require.True(t, store.Called)

ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()
err = cmd.Execute(ctx, streams)

if err != nil &&
// There is no agent running, therefore nothing to be restarted.
// However, this will cause the Enroll command to return an error
// which we'll ignore here.
!strings.Contains(err.Error(),
"could not reload agent daemon, unable to trigger restart") {
t.Fatalf("enrrol coms returned and unexpected error: %v", err)
}

assert.True(t, store.Called)
config, err := readConfig(store.Content)

require.NoError(t, err)
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
require.Equal(t, host, config.Client.Host)
assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
assert.Equal(t, host, config.Client.Host)
},
))

Expand Down
4 changes: 3 additions & 1 deletion internal/pkg/agent/cmd/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
return fmt.Errorf("problem reading prompt response")
}
if url == "" {
fmt.Fprintf(streams.Out, "Enrollment cancelled because no URL was provided.\n")
fmt.Fprintln(streams.Out, "Enrollment cancelled because no URL was provided.")
return nil
}
}
Expand Down Expand Up @@ -224,6 +224,8 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
}
}()
}

fmt.Fprintln(streams.Out, "Elastic Agent successfully installed, starting enrollment.")
}

if enroll {
Expand Down
32 changes: 20 additions & 12 deletions internal/pkg/agent/install/perms_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ package install

import (
"errors"
"fmt"
"io/fs"
"os"
"path/filepath"
Expand All @@ -18,19 +19,26 @@ func fixPermissions(topPath string) error {
return recursiveRootPermissions(topPath)
}

func recursiveRootPermissions(path string) error {
return filepath.Walk(path, func(name string, info fs.FileInfo, err error) error {
if err == nil {
// all files should be owned by root:root
err = os.Chown(name, 0, 0)
if err != nil {
return err
}
// remove any world permissions from the file
err = os.Chmod(name, info.Mode().Perm()&0770)
} else if errors.Is(err, fs.ErrNotExist) {
func recursiveRootPermissions(root string) error {
return filepath.Walk(root, func(path string, info fs.FileInfo, err error) error {
if errors.Is(err, fs.ErrNotExist) {
return nil
}
return err
if err != nil {
return fmt.Errorf("walk on %q failed: %w", path, err)
}

// all files should be owned by root:root
err = os.Chown(path, 0, 0)
if err != nil {
return fmt.Errorf("could not fix ownership of %q: %w", path, err)
}
// remove any world permissions from the file
err = os.Chmod(path, info.Mode().Perm()&0770)
if err != nil {
return fmt.Errorf("could not fix permissions of %q: %w", path, err)
}

return nil
})
}
Loading