Skip to content

Commit

Permalink
feat: etcd cluster single node failure APISIX startup failure (#5158)
Browse files Browse the repository at this point in the history
  • Loading branch information
shuaijinchao authored Oct 14, 2021
1 parent 2b23907 commit a413014
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 21 deletions.
48 changes: 30 additions & 18 deletions apisix/cli/etcd.lua
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ local tonumber = tonumber
local str_format = string.format
local str_sub = string.sub
local table_concat = table.concat
local table_insert = table.insert
local io_stderr = io.stderr

local _M = {}

Expand Down Expand Up @@ -187,6 +189,7 @@ function _M.init(env, args)
end

-- check the etcd cluster version
local etcd_healthy_hosts = {}
for index, host in ipairs(yaml_conf.etcd.host) do
local version_url = host .. "/version"
local errmsg
Expand All @@ -206,29 +209,38 @@ function _M.init(env, args)
version_url, err, retry_time))
end

if not res then
errmsg = str_format("request etcd endpoint \'%s\' error, %s\n", version_url, err)
util.die(errmsg)
end
if res then
local body, _, err = dkjson.decode(res)
if err or (body and not body["etcdcluster"]) then
errmsg = str_format("got malformed version message: \"%s\" from etcd \"%s\"\n", res,
version_url)
util.die(errmsg)
end

local body, _, err = dkjson.decode(res)
if err or (body and not body["etcdcluster"]) then
errmsg = str_format("got malformed version message: \"%s\" from etcd \"%s\"\n", res,
version_url)
util.die(errmsg)
end
local cluster_version = body["etcdcluster"]
if compare_semantic_version(cluster_version, env.min_etcd_version) then
util.die("etcd cluster version ", cluster_version,
" is less than the required version ", env.min_etcd_version,
", please upgrade your etcd cluster\n")
end

local cluster_version = body["etcdcluster"]
if compare_semantic_version(cluster_version, env.min_etcd_version) then
util.die("etcd cluster version ", cluster_version,
" is less than the required version ",
env.min_etcd_version,
", please upgrade your etcd cluster\n")
table_insert(etcd_healthy_hosts, host)
else
io_stderr:write(str_format("request etcd endpoint \'%s\' error, %s\n", version_url,
err))
end
end

if #etcd_healthy_hosts <= 0 then
util.die("all etcd nodes are unavailable\n")
end

if (#etcd_healthy_hosts / host_count * 100) <= 50 then
util.die("the etcd cluster needs at least 50% and above healthy nodes\n")
end

local etcd_ok = false
for index, host in ipairs(yaml_conf.etcd.host) do
for index, host in ipairs(etcd_healthy_hosts) do
local is_success = true

local errmsg
Expand Down Expand Up @@ -358,7 +370,7 @@ function _M.init(env, args)
end

if not etcd_ok then
util.die("none of the configured etcd works well")
util.die("none of the configured etcd works well\n")
end
end

Expand Down
42 changes: 39 additions & 3 deletions t/cli/test_etcd_healthcheck.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ etcd:

docker-compose -f ./t/cli/docker-compose-etcd-cluster.yaml up -d

# Check that apisix is not affected when one etcd node is disconnected
# case 1: Check that apisix is not affected when one etcd node is disconnected
make init && make run

docker stop ${ETCD_NAME_0}
Expand Down Expand Up @@ -69,7 +69,7 @@ make stop

echo "passed: apisix not got effected when one etcd node disconnected"

# Check that when all etcd nodes are disconnected, apisix retries the connection with backoff and recovers successfully once the nodes are reachable again
# case 2: Check that when all etcd nodes are disconnected, apisix retries the connection with backoff and recovers successfully once the nodes are reachable again
make init && make run

docker stop ${ETCD_NAME_0} && docker stop ${ETCD_NAME_1} && docker stop ${ETCD_NAME_2}
Expand All @@ -84,7 +84,7 @@ fi

docker start ${ETCD_NAME_0} && docker start ${ETCD_NAME_1} && docker start ${ETCD_NAME_2}

# sleep until the etcd health check tries again
# case 3: sleep until the etcd health check tries again
current_time=$(date +%s)
sleep_seconds=$(( $sleep_till - $current_time + 3))
if [ "$sleep_seconds" -gt 0 ]; then
Expand All @@ -102,3 +102,39 @@ fi
make stop

echo "passed: when all etcd nodes disconnected, apisix trying to reconnect with backoff, and could successfully recover when reconnected"

# Cases 4-6 exercise the `make init` health gate: init aborts when no etcd
# endpoint answers, or when the share of healthy endpoints is 50% or less.
# A "connection refused" line is printed for every unreachable endpoint even
# when init succeeds, so each case asserts the exit status of `make init` as
# well as its output; the log line alone cannot tell success from failure.
# NOTE(review): assumes the configured etcd hosts are exactly the three
# containers ETCD_NAME_0..2 (ports 23790-23792) - confirm against the config
# written earlier in this script.

# case 4: stop one etcd node (result: start successful)
docker stop ${ETCD_NAME_0}

# `if ! out=$(...)` keeps the exit status observable without tripping `set -e`.
if ! out=$(make init 2>&1); then
    echo "$out"
    echo "failed: stop only one etcd node APISIX should start normally"
    exit 1
fi

if echo "$out" | grep "23790" | grep "connection refused"; then
    echo "passed: APISIX successfully to start, stop only one etcd node"
else
    echo "failed: stop only one etcd node APISIX should start normally"
    exit 1
fi

# case 5: stop two etcd nodes (result: start failure)
docker stop ${ETCD_NAME_1}

# init must exit non-zero here; a zero status means the health gate was bypassed.
if out=$(make init 2>&1); then
    echo "$out"
    echo "failed: two etcd nodes have been stopped, APISIX should fail to start"
    exit 1
fi

# Require both the unreachable-endpoint report and the specific abort reason.
if echo "$out" | grep "23791" | grep "connection refused" \
    && echo "$out" | grep -F "the etcd cluster needs at least 50% and above healthy nodes"; then
    echo "passed: APISIX failed to start, etcd cluster must have two or more healthy nodes"
else
    echo "failed: two etcd nodes have been stopped, APISIX should fail to start"
    exit 1
fi

# case 6: stop all etcd nodes (result: start failure)
docker stop ${ETCD_NAME_2}

if out=$(make init 2>&1); then
    echo "$out"
    echo "failed: all etcd nodes have stopped, APISIX should not be able to start"
    exit 1
fi

if echo "$out" | grep "23792" | grep "connection refused" \
    && echo "$out" | grep -F "all etcd nodes are unavailable"; then
    echo "passed: APISIX failed to start, all etcd nodes have stopped"
else
    echo "failed: all etcd nodes have stopped, APISIX should not be able to start"
    exit 1
fi

# stop etcd docker container
docker-compose -f ./t/cli/docker-compose-etcd-cluster.yaml down

0 comments on commit a413014

Please sign in to comment.