Skip to content

Commit

Permalink
Set leave_on_terminate=true for servers and hardcode maxUnavailable=1
Browse files Browse the repository at this point in the history
When leave_on_terminate=false (default), rolling the statefulset is
disruptive because the new servers come up with the same node IDs but
different IP addresses. They can't join the server cluster until the old
server's node ID is marked as failed by serf. During this time, they continually
start leader elections because they don't know there's a leader. When
they eventually join the cluster, their election term is higher, and so
they trigger a leadership swap. The leadership swap happens at the same
time as the next node to be rolled is being stopped, and so the cluster
can end up without a leader.

With leave_on_terminate=true, the stopping server cleanly leaves the
cluster, so the new server can join smoothly, even though it has the
same node ID as the old server. This increases the speed of the rollout
and in my testing eliminates the period without a leader.

The downside of this change is that when a server leaves gracefully, it
also reduces the number of raft peers. The number of peers is used to
calculate the quorum size, so this can unexpectedly change the fault
tolerance of the cluster. When running with an odd number of servers, 1
server leaving the cluster does not affect quorum size. E.g. 5 servers
=> quorum 3, 4 servers => quorum still 3. During a rollout, Kubernetes
only stops 1 server at a time, so the quorum won't change. During a
voluntary disruption event, e.g. a node being drained, Kubernetes uses
the pod disruption budget to determine how many pods in a statefulset
can be made unavailable at a time. That's why this change hardcodes this
number to 1 now.
  • Loading branch information
lkysow committed Nov 23, 2023
1 parent e57edf5 commit e8b4e85
Show file tree
Hide file tree
Showing 9 changed files with 151 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .changelog/2962.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
```releast-note:feature
```release-note:feature
api-gateway: (Consul Enterprise) Add JWT authentication and authorization for API Gateway and HTTPRoutes.
```
3 changes: 3 additions & 0 deletions .changelog/3000.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:breaking-change
server: set `leave_on_terminate` to `true` and set the server pod disruption budget `maxUnavailable` to `1`.
```
18 changes: 18 additions & 0 deletions acceptance/tests/server/main_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package server

import (
"os"
"testing"

testsuite "github.com/hashicorp/consul-k8s/acceptance/framework/suite"
)

var suite testsuite.Suite

// TestMain initializes the shared acceptance-test suite for this package
// and exits with the suite's result code.
func TestMain(m *testing.M) {
	s := testsuite.NewSuite(m)
	suite = s
	os.Exit(s.Run())
}
85 changes: 85 additions & 0 deletions acceptance/tests/server/server_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package server

import (
"encoding/json"
"fmt"
"testing"
"time"

"github.com/hashicorp/consul-k8s/acceptance/framework/consul"
"github.com/hashicorp/consul-k8s/acceptance/framework/helpers"
"github.com/hashicorp/consul-k8s/acceptance/framework/k8s"
"github.com/hashicorp/consul-k8s/acceptance/framework/logger"
"github.com/hashicorp/go-multierror"
"github.com/stretchr/testify/require"
)

// Test that when servers are restarted, they don't lose quorum.
func TestServerRestart(t *testing.T) {
	ctx := suite.Environment().DefaultContext(t)
	replicas := 3
	releaseName := helpers.RandomName()
	helmValues := map[string]string{
		"global.enabled":        "false",
		"connectInject.enabled": "false",
		"server.enabled":        "true",
		"server.replicas":       fmt.Sprintf("%d", replicas),
	}
	consulCluster := consul.NewHelmCluster(t, helmValues, ctx, suite.Config(), releaseName)

	consulCluster.Create(t)

	// Start a separate goroutine to check if at any point quorum is lost.
	// Use the number of ready replicas as a proxy for whether there is quorum
	// because replicas are marked unready if they don't think there's a leader,
	// so if >n/2 replicas are unready then there's no quorum.
	//
	// The goroutine owns its counters and hands them back over resultCh when it
	// exits. This creates the happens-before edge the original code lacked:
	// writing noQuorumCount/unmarshalErrs in the goroutine and reading them in
	// the test goroutine after close(done) was a data race.
	type pollResult struct {
		noQuorumCount int
		unmarshalErrs error
	}
	done := make(chan struct{})
	resultCh := make(chan pollResult, 1)
	go func() {
		var res pollResult
		defer func() { resultCh <- res }()
		for {
			select {
			case <-done:
				return
			default:
			}
			out, err := k8s.RunKubectlAndGetOutputE(t, ctx.KubectlOptions(t), "get", fmt.Sprintf("statefulset/%s-consul-server", releaseName),
				"-o", "jsonpath={.status}")
			if err != nil {
				// Not failing the test on this error to reduce flakiness.
				logger.Logf(t, "kubectl err: %s: %s", err, out)
			} else {
				type statefulsetOut struct {
					ReadyReplicas *int `json:"readyReplicas,omitempty"`
				}
				var jsonOut statefulsetOut
				if err = json.Unmarshal([]byte(out), &jsonOut); err != nil {
					// Append onto the accumulated error. The original code called
					// multierror.Append(err), which used the new error as the base
					// and discarded all previously collected errors.
					res.unmarshalErrs = multierror.Append(res.unmarshalErrs, err)
				} else if jsonOut.ReadyReplicas == nil || *jsonOut.ReadyReplicas < replicas-1 {
					// note: for some k8s api reason when readyReplicas is 0 it's not
					// included in the json output so that's why we're checking if
					// it's nil.
					res.noQuorumCount++
				}
			}
			// Sleep unconditionally: in the original, "break" inside the select
			// skipped the sleep, so a persistent kubectl error busy-looped.
			time.Sleep(1 * time.Second)
		}
	}()

	// Restart servers.
	out, err := k8s.RunKubectlAndGetOutputE(t, ctx.KubectlOptions(t), "rollout", "restart", fmt.Sprintf("statefulset/%s-consul-server", releaseName))
	require.NoError(t, err, out)

	// Wait for restart to finish.
	start := time.Now()
	out, err = k8s.RunKubectlAndGetOutputE(t, ctx.KubectlOptions(t), "rollout", "status", "--timeout", "5m", "--watch", fmt.Sprintf("statefulset/%s-consul-server", releaseName))
	require.NoError(t, err, out, "rollout status command errored, this likely means the rollout didn't complete in time")

	// Stop the poller and collect its results; receiving on resultCh
	// guarantees we see the goroutine's final writes.
	close(done)
	res := <-resultCh

	require.NoError(t, res.unmarshalErrs, "there were some json unmarshal errors, this is likely a bug")

	logger.Logf(t, "restart took %s, there were %d seconds without quorum", time.Since(start), res.noQuorumCount)
	require.Equal(t, 0, res.noQuorumCount, "there were %d seconds without quorum", res.noQuorumCount)
}
21 changes: 10 additions & 11 deletions charts/consul/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -166,23 +166,22 @@ Expand the name of the chart.
{{- end -}}

{{/*
Compute the maximum number of unavailable replicas for the PodDisruptionBudget.
This defaults to (n/2)-1 where n is the number of members of the server cluster.
Special case of replica equaling 3 and allowing a minor disruption of 1 otherwise
use the integer value
Add a special case for replicas=1, where it should default to 0 as well.
Calculate max number of server pods that are allowed to be voluntarily disrupted.
When there's 1 server, this is set to 0 because that single pod must not be voluntarily
disrupted: there is no quorum to preserve, and evicting it would take the cluster down entirely.
This preserves the chart's previous behavior for the single-server edge case.
Otherwise we've changed this to always be 1 as part of the move to set leave_on_terminate
to true. With leave_on_terminate set to true, whenever a server pod is stopped, the number of peers in raft
is reduced. If the number of servers is odd and the count is reduced by 1, the quorum size doesn't change,
but if it's reduced by more than 1, the quorum size can change so that's why this is now always hardcoded to 1.
*/}}
{{- define "consul.pdb.maxUnavailable" -}}
{{- define "consul.server.pdb.maxUnavailable" -}}
{{- if eq (int .Values.server.replicas) 1 -}}
{{ 0 }}
{{- else if .Values.server.disruptionBudget.maxUnavailable -}}
{{ .Values.server.disruptionBudget.maxUnavailable -}}
{{- else -}}
{{- if eq (int .Values.server.replicas) 3 -}}
{{- 1 -}}
{{- else -}}
{{- sub (div (int .Values.server.replicas) 2) 1 -}}
{{- end -}}
{{ 1 }}
{{- end -}}
{{- end -}}
Expand Down
3 changes: 2 additions & 1 deletion charts/consul/templates/server-config-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ data:
"enabled": true
},
{{- end }}
"server": true
"server": true,
"leave_on_terminate": true
}
{{- $vaultConnectCAEnabled := and .Values.global.secretsBackend.vault.connectCA.address .Values.global.secretsBackend.vault.connectCA.rootPKIPath .Values.global.secretsBackend.vault.connectCA.intermediatePKIPath -}}
{{- if and .Values.global.secretsBackend.vault.enabled $vaultConnectCAEnabled }}
Expand Down
2 changes: 1 addition & 1 deletion charts/consul/templates/server-disruptionbudget.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ metadata:
release: {{ .Release.Name }}
component: server
spec:
maxUnavailable: {{ template "consul.pdb.maxUnavailable" . }}
maxUnavailable: {{ template "consul.server.pdb.maxUnavailable" . }}
selector:
matchLabels:
app: {{ template "consul.name" . }}
Expand Down
28 changes: 25 additions & 3 deletions charts/consul/test/unit/server-disruptionbudget.bats
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,16 @@ load _helpers
[ "${actual}" = "0" ]
}

# With leave_on_terminate=true the chart hardcodes maxUnavailable to 1 for any
# replica count greater than 1, so 2 replicas should also render 1.
@test "server/DisruptionBudget: correct maxUnavailable with replicas=2" {
  cd `chart_dir`
  local actual=$(helm template \
      -s templates/server-disruptionbudget.yaml  \
      --set 'server.replicas=2' \
      . | tee /dev/stderr |
      yq '.spec.maxUnavailable' | tee /dev/stderr)
  [ "${actual}" = "1" ]
}

@test "server/DisruptionBudget: correct maxUnavailable with replicas=3" {
cd `chart_dir`
local actual=$(helm template \
Expand Down Expand Up @@ -97,7 +107,7 @@ load _helpers
--set 'server.replicas=6' \
. | tee /dev/stderr |
yq '.spec.maxUnavailable' | tee /dev/stderr)
[ "${actual}" = "2" ]
[ "${actual}" = "1" ]
}

@test "server/DisruptionBudget: correct maxUnavailable with replicas=7" {
Expand All @@ -107,7 +117,7 @@ load _helpers
--set 'server.replicas=7' \
. | tee /dev/stderr |
yq '.spec.maxUnavailable' | tee /dev/stderr)
[ "${actual}" = "2" ]
[ "${actual}" = "1" ]
}

@test "server/DisruptionBudget: correct maxUnavailable with replicas=8" {
Expand All @@ -117,9 +127,21 @@ load _helpers
--set 'server.replicas=8' \
. | tee /dev/stderr |
yq '.spec.maxUnavailable' | tee /dev/stderr)
[ "${actual}" = "3" ]
[ "${actual}" = "1" ]
}

# Verifies that an explicitly set server.disruptionBudget.maxUnavailable value
# overrides the hardcoded default of 1.
@test "server/DisruptionBudget: correct maxUnavailable when set with value" {
  cd `chart_dir`
  local actual=$(helm template \
      -s templates/server-disruptionbudget.yaml  \
      --set 'server.replicas=5' \
      --set 'server.disruptionBudget.maxUnavailable=5' \
      . | tee /dev/stderr |
      yq '.spec.maxUnavailable' | tee /dev/stderr)
  [ "${actual}" = "5" ]
}


#--------------------------------------------------------------------
# apiVersion

Expand Down
8 changes: 6 additions & 2 deletions charts/consul/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -979,8 +979,12 @@ server:
# the server cluster is enabled. To disable, set to `false`.
enabled: true

# The maximum number of unavailable pods. By default, this will be
# automatically computed based on the `server.replicas` value to be `(n/2)-1`.
# The maximum number of unavailable pods. In most cases you should not change this. This setting has been kept
# so as not to break backwards compatibility.
# By default, this is set to 1 internally in the chart. When server pods are stopped gracefully, they leave the Raft
# consensus pool. When running an odd number of servers, one server leaving the pool does not change the quorum
# size, and so fault tolerance is not affected. However, if more than one server were to leave the pool, the quorum
# size would change. That's why this is set to 1 internally and should not be changed in most cases.
# If you need to set this to `0`, you will need to add a
# --set 'server.disruptionBudget.maxUnavailable=0'` flag to the helm chart installation
# command because of a limitation in the Helm templating language.
Expand Down

0 comments on commit e8b4e85

Please sign in to comment.