Skip to content

Commit

Permalink
VAULT-30819: verify DR secondary leader before unsealing followers
Browse files Browse the repository at this point in the history
After we've enabled DR replication on the secondary leader, the existing
cluster followers will be resealed with the primary cluster's encryption
keys. We have to unseal the followers to make them available. To ensure
that we absolutely take every precaution before attempting to unseal the
followers, we now verify that the secondary leader is the cluster leader,
has a valid merkle tree, and is streaming WALs from the primary cluster
before we attempt to unseal the secondary followers.

Signed-off-by: Ryan Cragun <[email protected]>
  • Loading branch information
ryancragun committed Sep 20, 2024
1 parent e848f16 commit c9f8bed
Show file tree
Hide file tree
Showing 11 changed files with 125 additions and 10 deletions.
2 changes: 1 addition & 1 deletion enos/enos-dev-scenario-pr-replication.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -815,7 +815,7 @@ scenario "dev_pr_replication" {
Depending on how we're configured we'll pass the unseal keys according to this guide:
https://developer.hashicorp.com/vault/docs/enterprise/replication#seals
EOF
module = module.vault_unseal_nodes
module = module.vault_unseal_replication_follower_nodes
depends_on = [
step.create_primary_cluster,
step.create_secondary_cluster,
Expand Down
4 changes: 2 additions & 2 deletions enos/enos-modules.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,8 @@ module "vault_test_ui" {
ui_run_tests = var.ui_run_tests
}

module "vault_unseal_nodes" {
source = "./modules/vault_unseal_nodes"
module "vault_unseal_replication_followers" {
source = "./modules/vault_unseal_replication_followers"

vault_install_dir = var.vault_install_dir
}
Expand Down
14 changes: 11 additions & 3 deletions enos/enos-scenario-dr-replication.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -814,7 +814,11 @@ scenario "dr_replication" {
enos = local.enos_provider[matrix.distro]
}

verifies = quality.vault_api_sys_replication_dr_secondary_enable_write
verifies = [
quality.vault_api_sys_leader_read,
quality.vault_api_sys_replication_dr_secondary_enable_write,
quality.vault_api_sys_replication_dr_status_read,
]

variables {
ip_version = matrix.ip_version
Expand All @@ -834,7 +838,7 @@ scenario "dr_replication" {
type combinations. See the guide for more information:
https://developer.hashicorp.com/vault/docs/enterprise/replication#seals
EOF
module = module.vault_unseal_nodes
module = module.vault_unseal_replication_followers
depends_on = [
step.configure_dr_replication_secondary
]
Expand Down Expand Up @@ -883,7 +887,11 @@ scenario "dr_replication" {
and ensuring that all secondary nodes are unsealed.
EOF
module = module.vault_verify_dr_replication
depends_on = [step.configure_dr_replication_secondary]
depends_on = [
step.configure_dr_replication_secondary,
step.unseal_secondary_followers,
step.verify_secondary_cluster_is_unsealed_after_enabling_replication,
]

providers = {
enos = local.enos_provider[matrix.distro]
Expand Down
2 changes: 1 addition & 1 deletion enos/enos-scenario-pr-replication.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,7 @@ scenario "pr_replication" {
type combinations. See the guide for more information:
https://developer.hashicorp.com/vault/docs/enterprise/replication#seals
EOF
module = module.vault_unseal_nodes
module = module.vault_unseal_replication_followers
depends_on = [
step.create_primary_cluster,
step.create_secondary_cluster,
Expand Down
6 changes: 4 additions & 2 deletions enos/modules/vault_setup_dr_primary/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,16 @@ variable "vault_root_token" {
type = string
description = "The vault root token"
}
resource "enos_remote_exec" "configure_dr_primary" {

// Enable DR replication on the primary. This will immediately clear all data in the secondary.
resource "enos_remote_exec" "enable_dr_replication" {
environment = {
VAULT_ADDR = var.vault_addr
VAULT_TOKEN = var.vault_root_token
VAULT_INSTALL_DIR = var.vault_install_dir
}

scripts = [abspath("${path.module}/scripts/configure-vault-dr-primary.sh")]
scripts = [abspath("${path.module}/scripts/enable.sh")]

transport = {
ssh = {
Expand Down
42 changes: 41 additions & 1 deletion enos/modules/vault_setup_replication_secondary/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ variable "wrapping_token" {
description = "The wrapping token created on primary cluster"
}

resource "enos_remote_exec" "configure_pr_secondary" {
resource "enos_remote_exec" "enable_replication" {
environment = {
VAULT_ADDR = var.vault_addr
VAULT_TOKEN = var.vault_root_token
Expand All @@ -72,3 +72,43 @@ resource "enos_remote_exec" "configure_pr_secondary" {
}
}
}

// Wait for our primary host to be the "leader", which means it's running and all "setup" tasks
// have been completed. We'll have to unseal our follower nodes after this has occurred.
module "wait_for_leader" {
source = "../vault_wait_for_leader"

// Do not start waiting until replication has been enabled on the secondary.
depends_on = [
enos_remote_exec.enable_replication
]

// Single-entry map: only the secondary leader host is passed to the wait module.
hosts = { "0" : var.secondary_leader_host }
ip_version = var.ip_version
vault_addr = var.vault_addr
vault_install_dir = var.vault_install_dir
vault_root_token = var.vault_root_token
}

// Ensure that our leader is ready for us to unseal follower nodes.
// Runs scripts/wait-for-leader-ready.sh on the secondary leader over SSH, after the
// wait_for_leader module has completed.
resource "enos_remote_exec" "wait_for_leader_ready" {
depends_on = [
module.wait_for_leader,
]

// Environment handed to the script. RETRY_INTERVAL and TIMEOUT_SECONDS are presumably the
// polling interval and overall deadline used by the script -- confirm against the script.
environment = {
REPLICATION_TYPE = var.replication_type
RETRY_INTERVAL = 3 // seconds
TIMEOUT_SECONDS = 60 // seconds
VAULT_ADDR = var.vault_addr
VAULT_TOKEN = var.vault_root_token
VAULT_INSTALL_DIR = var.vault_install_dir
}

scripts = [abspath("${path.module}/scripts/wait-for-leader-ready.sh")]

// Connect to the secondary leader via its public IP.
transport = {
ssh = {
host = var.secondary_leader_host.public_ip
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

set -e

# Write the given message to stderr and return status 1.
#   $1 - the message to print
fail() {
  local message="$1"
  echo "$message" >&2
  return 1
}

# Every one of these variables must be set and non-empty before we continue.
for required in REPLICATION_TYPE RETRY_INTERVAL TIMEOUT_SECONDS VAULT_ADDR VAULT_INSTALL_DIR VAULT_TOKEN; do
  if [[ -z "${!required}" ]]; then
    fail "${required} env variable has not been set"
  fi
done

# The vault binary must exist and be executable inside the install directory.
binpath="${VAULT_INSTALL_DIR}/vault"
if ! test -x "$binpath"; then
  fail "unable to locate vault binary at $binpath"
fi

# Ask the vault CLI for JSON output so that it can be parsed with jq.
export VAULT_FORMAT=json

# Print the `.data` object of the sys/replication/${REPLICATION_TYPE}/status response to stdout.
# Returns non-zero if the vault read fails. The read is captured before being handed to jq
# because a `vault read | jq` pipeline reports only jq's exit status, which hides a failed read
# from callers that test this function's status.
replicationStatus() {
  local response
  response=$("$binpath" read "sys/replication/${REPLICATION_TYPE}/status") || return 1
  jq .data <<< "$response"
}

# Check whether the replication status reported by replicationStatus shows a healthy secondary.
# Returns 0 only when the status has state "stream-wals", mode "secondary", and
# corrupted_merkle_tree set to false. Otherwise the reason is written to stderr and 1 is returned.
isReady() {
  # Fetch the current replication status
  local status
  if ! status=$(replicationStatus); then
    echo "unable to read ${REPLICATION_TYPE} replication status" 1>&2
    return 1
  fi

  if ! jq -eMc '.state == "stream-wals"' &> /dev/null <<< "$status"; then
    echo "${REPLICATION_TYPE} replication is not yet streaming WALs, got state: $(jq '.state' <<< "$status")" 1>&2
    return 1
  fi

  if ! jq -eMc '.mode == "secondary"' &> /dev/null <<< "$status"; then
    echo "${REPLICATION_TYPE} replication mode is not yet secondary, got: $(jq '.mode' <<< "$status")" 1>&2
    return 1
  fi

  if ! jq -eMc '.corrupted_merkle_tree == false' &> /dev/null <<< "$status"; then
    echo "${REPLICATION_TYPE} replication merkle tree is corrupted" 1>&2
    return 1
  fi

  echo "${REPLICATION_TYPE} secondary leader is ready for followers to be unsealed!" 1>&2
  return 0
}

# Poll isReady every RETRY_INTERVAL seconds until it succeeds or TIMEOUT_SECONDS have elapsed.
begin_time=$(date +%s)
end_time=$((begin_time + TIMEOUT_SECONDS))
while [ "$(date +%s)" -lt "$end_time" ]; do
  if isReady; then
    exit 0
  fi

  sleep "$RETRY_INTERVAL"
done

# Deadline passed: report the last observed replication status to aid debugging.
fail "Timed out waiting for ${REPLICATION_TYPE} secondary leader to be ready: $(replicationStatus)"
File renamed without changes.

0 comments on commit c9f8bed

Please sign in to comment.