Skip to content

Commit

Permalink
storage: prevent node from restarting after corruption
Browse files Browse the repository at this point in the history
Many deployments auto-restart crashed nodes unconditionally. As a
result, we are often pulled into inconsistency failures relatively late,
which results in a loss of information (in particular if the source of
the problem turns out to be one of the nodes that did not actually
crash); if the node that crashes *is* the one with the problem,
restarting it lets it serve data again (until the next time the
consistency checker nukes it), which is bad as well.

This commit introduces a "death rattle" - if a node is told to terminate
as the result of a consistency check, it will write a marker file that
prevents subsequent restarts of the node with an informative message
alerting the operator that there is a serious problem that needs to be
addressed. We use the same strategy when a replica finds that its
internal invariants are violated.

Fixes #40442.

Release note (general change): nodes that have been terminated as the
result of a failed consistency check now refuse to restart, making it
more likely that the operator notices that there is a persistent issue
in a timely manner.
  • Loading branch information
tbg committed Nov 18, 2019
1 parent 7dbdd94 commit ba6699b
Show file tree
Hide file tree
Showing 9 changed files with 153 additions and 6 deletions.
47 changes: 47 additions & 0 deletions pkg/base/store_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ package base
import (
"bytes"
"fmt"
"io/ioutil"
"net"
"os"
"path/filepath"
"regexp"
"sort"
Expand Down Expand Up @@ -357,6 +359,51 @@ func (ssl StoreSpecList) String() string {
return buffer.String()
}

// AuxiliaryDir is the path of the auxiliary dir relative to an engine.Engine's
// root directory. It must not be changed without a proper migration.
const AuxiliaryDir = "auxiliary"

// PreventedStartupFile is the filename (relative to 'dir') used for files that
// can block server startup.
func PreventedStartupFile(dir string) string {
return filepath.Join(dir, "_CRITICAL_ALERT.txt")
}

// GetPreventedStartupMessage attempts to read the PreventedStartupFile for each
// store directory and returns their concatenated contents. These files
// typically request operator intervention after a corruption event by
// preventing the affected node(s) from starting back up.
func (ssl StoreSpecList) GetPreventedStartupMessage() (string, error) {
var buf strings.Builder
for _, ss := range ssl.Specs {
path := ss.PreventedStartupFile()
if path == "" {
continue
}
b, err := ioutil.ReadFile(path)
if err != nil {
if !os.IsNotExist(err) {
return "", err
}
continue
}
fmt.Fprintf(&buf, "From %s:\n\n", path)
_, _ = buf.Write(b)
fmt.Fprintln(&buf)
}
return buf.String(), nil
}

// PreventedStartupFile returns the path to a file which, if it exists, should
// prevent the server from starting up. Returns an empty string for in-memory
// engines.
func (ss StoreSpec) PreventedStartupFile() string {
if ss.InMemory {
return ""
}
return PreventedStartupFile(filepath.Join(ss.Path, AuxiliaryDir))
}

// Type returns the underlying type in string form. This is part of pflag's
// value interface.
func (ssl *StoreSpecList) Type() string {
Expand Down
38 changes: 38 additions & 0 deletions pkg/base/store_spec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,17 @@ package base_test

import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"reflect"
"testing"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/stretchr/testify/require"
)

// TestNewStoreSpec verifies that the --store arguments are correctly parsed
Expand Down Expand Up @@ -217,3 +221,37 @@ func TestJoinListType(t *testing.T) {
})
}
}

func TestStoreSpecListPreventedStartupMessage(t *testing.T) {
defer leaktest.AfterTest(t)()

dir, cleanup := testutils.TempDir(t)
defer cleanup()

boomStoreDir := filepath.Join(dir, "boom")
boomAuxDir := filepath.Join(boomStoreDir, base.AuxiliaryDir)
okStoreDir := filepath.Join(dir, "ok")
okAuxDir := filepath.Join(okStoreDir, base.AuxiliaryDir)

for _, sd := range []string{boomAuxDir, okAuxDir} {
require.NoError(t, os.MkdirAll(sd, 0755))
}

ssl := base.StoreSpecList{
Specs: []base.StoreSpec{
{Path: "foo", InMemory: true},
{Path: okStoreDir},
{Path: boomStoreDir},
},
}

msg, err := ssl.GetPreventedStartupMessage()
require.NoError(t, err)
require.Empty(t, msg)

require.NoError(t, ioutil.WriteFile(ssl.Specs[2].PreventedStartupFile(), []byte("boom"), 0644))

msg, err = ssl.GetPreventedStartupMessage()
require.NoError(t, err)
require.Contains(t, msg, "boom")
}
6 changes: 6 additions & 0 deletions pkg/cli/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,12 @@ func runStart(cmd *cobra.Command, args []string, disableReplication bool) error
return err
}

if s, err := serverCfg.Stores.GetPreventedStartupMessage(); err != nil {
return err
} else if s != "" {
log.Fatal(context.Background(), s)
}

// Set up the signal handlers. This also ensures that any of these
// signals received beyond this point do not interrupt the startup
// sequence until the point signals are checked below.
Expand Down
7 changes: 7 additions & 0 deletions pkg/storage/consistency_queue_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"bytes"
"context"
"fmt"
"io/ioutil"
"math/rand"
"path/filepath"
"testing"
Expand All @@ -35,6 +36,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestConsistencyQueueRequiresLive verifies the queue will not
Expand Down Expand Up @@ -356,6 +358,11 @@ func TestCheckConsistencyInconsistent(t *testing.T) {
assert.Equal(t, roachpb.CheckConsistencyResponse_RANGE_INCONSISTENT, resp.Result[0].Status)
assert.Contains(t, resp.Result[0].Detail, `[minority]`)
assert.Contains(t, resp.Result[0].Detail, `stats`)

// A death rattle should have been written on s2 (store index 1).
b, err := ioutil.ReadFile(base.PreventedStartupFile(mtc.stores[1].Engine().GetAuxiliaryDir()))
require.NoError(t, err)
require.NotEmpty(t, b)
}

// TestConsistencyQueueRecomputeStats is an end-to-end test of the mechanism CockroachDB
Expand Down
2 changes: 1 addition & 1 deletion pkg/storage/engine/pebble.go
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (*Pebble, error) {
return nil, err
}
} else {
auxDir = cfg.Opts.FS.PathJoin(cfg.Dir, "auxiliary")
auxDir = cfg.Opts.FS.PathJoin(cfg.Dir, base.AuxiliaryDir)
if err := cfg.Opts.FS.MkdirAll(auxDir, 0755); err != nil {
return nil, err
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/storage/engine/rocksdb.go
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ func NewRocksDB(cfg RocksDBConfig, cache RocksDBCache) (*RocksDB, error) {
cache: cache.ref(),
}

if err := r.setAuxiliaryDir(filepath.Join(cfg.Dir, "auxiliary")); err != nil {
if err := r.setAuxiliaryDir(filepath.Join(cfg.Dir, base.AuxiliaryDir)); err != nil {
return nil, err
}

Expand Down
23 changes: 23 additions & 0 deletions pkg/storage/replica_corruption.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@ package storage

import (
"context"
"fmt"
"io/ioutil"
"os"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/log"
)
Expand Down Expand Up @@ -51,6 +55,25 @@ func (r *Replica) setCorruptRaftMuLocked(
log.ErrorfDepth(ctx, 1, "stalling replica due to: %s", cErr.ErrorMsg)
cErr.Processed = true
r.mu.destroyStatus.Set(cErr, destroyReasonRemoved)

auxDir := r.store.engine.GetAuxiliaryDir()
_ = os.MkdirAll(auxDir, 0755)
path := base.PreventedStartupFile(auxDir)

preventStartupMsg := fmt.Sprintf(`ATTENTION:
this node is terminating because replica %s detected an inconsistent state.
Please contact the CockroachDB support team. It is not necessarily safe
to replace this node; cluster data may still be at risk of corruption.
A file preventing this node from restarting was placed at:
%s
`, r, path)

if err := ioutil.WriteFile(path, []byte(preventStartupMsg), 0644); err != nil {
log.Warning(ctx, err)
}

log.FatalfDepth(ctx, 1, "replica is corrupted: %s", cErr)
return roachpb.NewError(cErr)
}
29 changes: 25 additions & 4 deletions pkg/storage/replica_proposal.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@ package storage
import (
"context"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"
"time"
"unsafe"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
Expand Down Expand Up @@ -256,14 +258,33 @@ func (r *Replica) computeChecksumPostApply(ctx context.Context, cc storagepb.Com
// holder and thus won't be printed to the logs. Since we're already
// in a goroutine that's about to end, simply sleep for a few seconds
// and then terminate.
auxDir := r.store.engine.GetAuxiliaryDir()
_ = os.MkdirAll(auxDir, 0755)
path := base.PreventedStartupFile(auxDir)

preventStartupMsg := fmt.Sprintf(`ATTENTION:
this node is terminating because a replica inconsistency was detected between %s
and its other replicas. Please check your cluster-wide log files for more
information and contact the CockroachDB support team. It is not necessarily safe
to replace this node; cluster data may still be at risk of corruption.
A checkpoints directory to aid (expert) debugging should be present in:
%s
A file preventing this node from restarting was placed at:
%s
`, r, auxDir, path)

if err := ioutil.WriteFile(path, []byte(preventStartupMsg), 0644); err != nil {
log.Warning(ctx, err)
}

if p := r.store.cfg.TestingKnobs.ConsistencyTestingKnobs.OnBadChecksumFatal; p != nil {
p(*r.store.Ident)
} else {
time.Sleep(10 * time.Second)
log.Fatalf(r.AnnotateCtx(context.Background()),
"this node is terminating because a replica inconsistency was detected "+
"between %s and its other replicas. Please check your cluster-wide log files for "+
"more information and contact the CockroachDB support team.", r)
log.Fatalf(r.AnnotateCtx(context.Background()), preventStartupMsg)
}
}

Expand Down
5 changes: 5 additions & 0 deletions pkg/storage/replica_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"fmt"
"math"
"math/rand"
"os"
"reflect"
"regexp"
"sort"
Expand Down Expand Up @@ -6409,6 +6410,10 @@ func TestReplicaCorruption(t *testing.T) {
t.Fatalf("unexpected error: %s", pErr)
}

// Should have laid down marker file to prevent startup.
_, err := os.Stat(base.PreventedStartupFile(tc.engine.GetAuxiliaryDir()))
require.NoError(t, err)

// Should have triggered fatal error.
if exitStatus != 255 {
t.Fatalf("unexpected exit status %d", exitStatus)
Expand Down

0 comments on commit ba6699b

Please sign in to comment.