From e883361ec44de8dab31ccd5fc9f99a41065ebb96 Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Tue, 30 Jul 2019 18:17:58 +0200 Subject: [PATCH] Fail earlier on compose errors Compose wrapper was retrying docker-compose up until the timeout in case the failure was caused by resource exhaustion, but most of the time failures are caused by unrecoverable errors like mistakes in docker-compose.yml or Dockerfiles. Fail earlier in all cases except the ones that can recover with time. The motivation to handle the case of network exhaustion is that we are planning to support multiple docker compose files, and multiple scenarios or versions at the same time; this can consume all the available network ranges when tests for several modules are run in parallel. --- libbeat/tests/compose/compose.go | 2 +- libbeat/tests/compose/wrapper.go | 45 +++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/libbeat/tests/compose/compose.go b/libbeat/tests/compose/compose.go index d39459ee88f..2e554dd8397 100644 --- a/libbeat/tests/compose/compose.go +++ b/libbeat/tests/compose/compose.go @@ -54,7 +54,7 @@ func EnsureUpWithTimeout(t *testing.T, timeout int, services ...string) { for _, service := range services { err = compose.Start(service) if err != nil { - t.Fatal("failed to start service", service, err) + t.Fatalf("failed to start service %s: %v", service, err) } } diff --git a/libbeat/tests/compose/wrapper.go b/libbeat/tests/compose/wrapper.go index 4a10ef5135b..7d048f26019 100644 --- a/libbeat/tests/compose/wrapper.go +++ b/libbeat/tests/compose/wrapper.go @@ -18,11 +18,14 @@ package compose import ( + "bytes" "context" "fmt" + "io" "os" "os/exec" "strings" + "time" "github.com/docker/docker/api/types" "github.com/docker/docker/api/types/filters" @@ -95,7 +98,47 @@ func (d *wrapperDriver) Up(ctx context.Context, opts UpOptions, service string) args = append(args, service) } - return d.cmd(ctx, "up", args...).Run() + var stdout, 
stderr bytes.Buffer + defer io.Copy(os.Stdout, &stdout) + defer io.Copy(os.Stderr, &stderr) + for { + // Up can fail if we have reached some system limit, especially + // the number of networks; retry while the context is not done + cmd := d.cmd(ctx, "up", args...) + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + if err == nil { + return nil + } + if err := fatalError(&stderr); err != nil { + return errors.Wrapf(err, "docker-compose up failed for service %s", service) + } + + select { + case <-time.After(time.Second): + case <-ctx.Done(): + return err + } + } +} + +var recoverableErrors = []string{ + `could not find an available, non-overlapping IPv4 address pool`, +} + +// fatalError parses the error message and checks whether it is caused by an unrecoverable error. +// It considers errors caused by resource exhaustion to be recoverable. +// Parsing the error message is not nice, but it is the only way to classify docker compose errors. +func fatalError(message *bytes.Buffer) error { + data := message.String() + for _, errorMsg := range recoverableErrors { + if strings.Contains(data, errorMsg) { + return nil + } + } + return errors.New(data) } func (d *wrapperDriver) Kill(ctx context.Context, signal string, service string) error {