Skip to content

Commit

Permalink
Fix install/enroll command not failing when the daemon restart fails (#…
Browse files Browse the repository at this point in the history
…3815)

* fix install/enroll cmd not failing when agent restart fails

* surface errors that might occur during enroll
* fail install command if agent cannot be restarted
* do not print success message if there was an enroll error. Print an error message and the error instead
* add logs to show the different enroll attempts
* add more context t errors
* refactor internal/pkg/agent/install/perms_unix.go and add more context to errors
restore main version
* ignore agent restart error on enroll tests as there is no agent to be restarted
* daemonReloadWithBackoff does not retry on context deadline exceeded

* Do not reload the Agent daemon if enrolling from a container

The enroll command would always try to restart the daemon, however
when enrolling as part of the container command, there is no running
daemon to reload.

This commit adds a CLI flag, --skip-daemon-reload, to the enroll
command to skip the reloading step, the container command now makes
use of this flag.

* Apply suggestions from code review

Co-authored-by: Paolo Chilà <[email protected]>

* PR improvements

* Add integration test

* make lint happy

* PR improvements

* Fix after rebase

* Fix some issues

* more PR improvments

* Fix enroll command

* Fix TestContainterCMD

* Fix implementation

* Remove flaky integration test assertion

Asserting there are no errors in the logs from Elastic-Agent and all

Co-authored-by: Anderson Queriroz <[email protected]>
Co-authored-by: Paolo Chilà <[email protected]>
Co-authored-by: Pierre HILBERT <[email protected]>
(cherry picked from commit 938f0b9)

# Conflicts:
#	internal/pkg/composable/providers/kubernetessecrets/kubernetes_secrets_test.go
  • Loading branch information
belimawr authored and mergify[bot] committed Dec 28, 2023
1 parent 7156e03 commit 91f0603
Show file tree
Hide file tree
Showing 12 changed files with 513 additions and 68 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; a 80ish characters long description of the change.
summary: Surface errors during Agent's enroll process, failing if any happens.

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/3815/

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
issue: https://github.com/elastic/elastic-agent/issues/3664
2 changes: 1 addition & 1 deletion dev-tools/mage/godaemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ var (
}
)

// BuildGoDaemon builds the go-deamon binary.
// BuildGoDaemon builds the go-daemon binary.
func BuildGoDaemon() error {
if GOOS != "linux" {
return errors.New("go-daemon only builds for linux")
Expand Down
1 change: 1 addition & 0 deletions internal/pkg/agent/cmd/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,7 @@ func buildEnrollArgs(cfg setupConfig, token string, policyID string) ([]string,
"--path.home", paths.Top(), // --path.home actually maps to paths.Top()
"--path.config", paths.Config(),
"--path.logs", paths.Logs(),
"--skip-daemon-reload",
}
if paths.Downloads() != "" {
args = append(args, "--path.downloads", paths.Downloads())
Expand Down
11 changes: 10 additions & 1 deletion internal/pkg/agent/cmd/enroll.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,10 @@ func addEnrollFlags(cmd *cobra.Command) {
cmd.Flags().BoolP("delay-enroll", "", false, "Delays enrollment to occur on first start of the Elastic Agent service")
cmd.Flags().DurationP("daemon-timeout", "", 0, "Timeout waiting for Elastic Agent daemon")
cmd.Flags().DurationP("fleet-server-timeout", "", 0, "Timeout waiting for Fleet Server to be ready to start enrollment")
cmd.Flags().Bool("skip-daemon-reload", false, "Skip daemon reload after enrolling")
cmd.Flags().StringSliceP("tag", "", []string{}, "User set tags")

cmd.Flags().MarkHidden("skip-daemon-reload") //nolint:errcheck // an error is only returned if the flag does not exist.
}

func validateEnrollFlags(cmd *cobra.Command) error {
Expand Down Expand Up @@ -141,6 +144,7 @@ func buildEnrollmentFlags(cmd *cobra.Command, url string, token string) []string
delayEnroll, _ := cmd.Flags().GetBool("delay-enroll")
daemonTimeout, _ := cmd.Flags().GetDuration("daemon-timeout")
fTimeout, _ := cmd.Flags().GetDuration("fleet-server-timeout")
skipDaemonReload, _ := cmd.Flags().GetBool("skip-daemon-reload")
fTags, _ := cmd.Flags().GetStringSlice("tag")
args := []string{}
if url != "" {
Expand Down Expand Up @@ -249,6 +253,9 @@ func buildEnrollmentFlags(cmd *cobra.Command, url string, token string) []string
args = append(args, "--fleet-server-es-insecure")
}

if skipDaemonReload {
args = append(args, "--skip-daemon-reload")
}
for _, v := range fTags {
args = append(args, "--tag", v)
}
Expand Down Expand Up @@ -338,6 +345,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
delayEnroll, _ := cmd.Flags().GetBool("delay-enroll")
daemonTimeout, _ := cmd.Flags().GetDuration("daemon-timeout")
fTimeout, _ := cmd.Flags().GetDuration("fleet-server-timeout")
skipDaemonReload, _ := cmd.Flags().GetBool("skip-daemon-reload")
tags, _ := cmd.Flags().GetStringSlice("tag")

caStr, _ := cmd.Flags().GetString("certificate-authorities")
Expand All @@ -351,7 +359,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
// Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted
// This is because we are fixing permissions twice, once during installation and again during the enrollment step.
// When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions.
var fixPermissions bool = fromInstall
fixPermissions := fromInstall
if runtime.GOOS == "darwin" {
fixPermissions = false
}
Expand All @@ -370,6 +378,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
ProxyHeaders: mapFromEnvList(proxyHeaders),
DelayEnroll: delayEnroll,
DaemonTimeout: daemonTimeout,
SkipDaemonRestart: skipDaemonReload,
Tags: tags,
FleetServer: enrollCmdFleetServerOption{
ConnStr: fServer,
Expand Down
92 changes: 60 additions & 32 deletions internal/pkg/agent/cmd/enroll_cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@ import (
"strings"
"time"

"github.com/elastic/elastic-agent/pkg/utils"

"github.com/elastic/elastic-agent/pkg/control/v2/client"

"go.elastic.co/apm"
"gopkg.in/yaml.v2"

Expand All @@ -42,8 +38,10 @@ import (
fleetclient "github.com/elastic/elastic-agent/internal/pkg/fleetapi/client"
"github.com/elastic/elastic-agent/internal/pkg/release"
"github.com/elastic/elastic-agent/internal/pkg/remote"
"github.com/elastic/elastic-agent/pkg/control/v2/client"
"github.com/elastic/elastic-agent/pkg/core/logger"
"github.com/elastic/elastic-agent/pkg/core/process"
"github.com/elastic/elastic-agent/pkg/utils"
)

const (
Expand Down Expand Up @@ -115,6 +113,7 @@ type enrollCmdOption struct {
DelayEnroll bool `yaml:"-"`
FleetServer enrollCmdFleetServerOption `yaml:"-"`
SkipCreateSecret bool `yaml:"-"`
SkipDaemonRestart bool `yaml:"-"`
Tags []string `yaml:"omitempty"`
}

Expand Down Expand Up @@ -174,7 +173,7 @@ func newEnrollCmd(
)
}

// newEnrollCmdWithStore creates an new enrollment and accept a custom store.
// newEnrollCmdWithStore creates a new enrollment and accept a custom store.
func newEnrollCmdWithStore(
log *logger.Logger,
options *enrollCmdOption,
Expand All @@ -189,10 +188,11 @@ func newEnrollCmdWithStore(
}, nil
}

// Execute tries to enroll the agent into Fleet.
// Execute enrolls the agent into Fleet.
func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
var err error
defer c.stopAgent() // ensure its stopped no matter what

span, ctx := apm.StartSpan(ctx, "enroll", "app.internal")
defer func() {
apm.CaptureError(ctx, err).Send()
Expand Down Expand Up @@ -237,7 +237,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
// Ensure that the agent does not use a proxy configuration
// when connecting to the local fleet server.
// Note that when running fleet-server the enroll request will be sent to :8220,
// however when the agent is running afterwards requests will be sent to :8221
// however when the agent is running afterward requests will be sent to :8221
c.remoteConfig.Transport.Proxy.Disable = true
}

Expand All @@ -258,7 +258,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {

err = c.enrollWithBackoff(ctx, persistentConfig)
if err != nil {
return errors.New(err, "fail to enroll")
return fmt.Errorf("fail to enroll: %w", err)
}

if c.options.FixPermissions {
Expand All @@ -269,17 +269,23 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
}

defer func() {
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
if err != nil {
fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err)
} else {
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
}
}()

if c.agentProc == nil {
if err := c.daemonReload(ctx); err != nil {
c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err)
} else {
c.log.Info("Successfully triggered restart on running Elastic Agent.")
if c.agentProc == nil && !c.options.SkipDaemonRestart {
if err = c.daemonReloadWithBackoff(ctx); err != nil {
c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err)
return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err)
}

c.log.Info("Successfully triggered restart on running Elastic Agent.")
return nil
}

c.log.Info("Elastic Agent has been enrolled; start Elastic Agent")
return nil
}
Expand Down Expand Up @@ -444,25 +450,34 @@ func (c *enrollCmd) prepareFleetTLS() error {
}

func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error {
err := c.daemonReload(ctx)
if err == nil {
return nil
}

signal := make(chan struct{})
backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute)
defer close(signal)
backExp := backoff.NewExpBackoff(signal, 1*time.Second, 1*time.Minute)

for i := 5; i >= 0; i-- {
backExp.Wait()
c.log.Info("Retrying to restart...")
err = c.daemonReload(ctx)
var lastErr error
for i := 0; i < 5; i++ {
attempt := i

c.log.Infof("Restarting agent daemon, attempt %d", attempt)
err := c.daemonReload(ctx)
if err == nil {
break
return nil
}

// If the context was cancelled, return early
if errors.Is(err, context.DeadlineExceeded) ||
errors.Is(err, context.Canceled) {
return fmt.Errorf("could not reload daemon after %d retries: %w",
attempt, err)
}
lastErr = err

c.log.Errorf("Restart attempt %d failed: '%s'. Waiting for %s", attempt, err, backExp.NextWait().String())
backExp.Wait()

}

close(signal)
return err
return fmt.Errorf("could not reload agent's daemon, all retries failed. Last error: %w", lastErr)
}

func (c *enrollCmd) daemonReload(ctx context.Context) error {
Expand All @@ -480,8 +495,20 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[

c.log.Infof("Starting enrollment to URL: %s", c.client.URI())
err := c.enroll(ctx, persistentConfig)
if err == nil {
return nil
}

const deadline = 10 * time.Minute
const frequency = 60 * time.Second

c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s",
deadline,
frequency,
c.client.URI())
signal := make(chan struct{})
backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute)
defer close(signal)
backExp := backoff.NewExpBackoff(signal, frequency, deadline)

for {
retry := false
Expand All @@ -500,7 +527,6 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[
err = c.enroll(ctx, persistentConfig)
}

close(signal)
return err
}

Expand Down Expand Up @@ -549,8 +575,10 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte
c.options.FleetServer.ElasticsearchInsecure,
)
if err != nil {
return err
return fmt.Errorf(
"failed creating fleet-server bootstrap config: %w", err)
}

// no longer need bootstrap at this point
serverConfig.Server.Bootstrap = false
fleetConfig.Server = serverConfig.Server
Expand All @@ -570,11 +598,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte

reader, err := yamlToReader(configToStore)
if err != nil {
return err
return fmt.Errorf("yamlToReader failed: %w", err)
}

if err := safelyStoreAgentInfo(c.configStore, reader); err != nil {
return err
return fmt.Errorf("failed to store agent config: %w", err)
}

// clear action store
Expand Down
Loading

0 comments on commit 91f0603

Please sign in to comment.