feat: allow APISIX to start when a single etcd cluster node has failed (#5158)

apache · Oct 14, 2021 · a413014 · a413014
1 parent 2b23907
commit a413014
Show file tree

Hide file tree

Showing 2 changed files with 69 additions and 21 deletions.
diff --git a/apisix/cli/etcd.lua b/apisix/cli/etcd.lua
@@ -32,6 +32,8 @@ local tonumber = tonumber
 local str_format = string.format
 local str_sub = string.sub
 local table_concat = table.concat
+local table_insert = table.insert
+local io_stderr = io.stderr
 
 local _M = {}
 
@@ -187,6 +189,7 @@ function _M.init(env, args)
     end
 
     -- check the etcd cluster version
+    local etcd_healthy_hosts = {}
     for index, host in ipairs(yaml_conf.etcd.host) do
         local version_url = host .. "/version"
         local errmsg
@@ -206,29 +209,38 @@ function _M.init(env, args)
                              version_url, err, retry_time))
         end
 
-        if not res then
-            errmsg = str_format("request etcd endpoint \'%s\' error, %s\n", version_url, err)
-            util.die(errmsg)
-        end
+        if res then
+            local body, _, err = dkjson.decode(res)
+            if err or (body and not body["etcdcluster"]) then
+                errmsg = str_format("got malformed version message: \"%s\" from etcd \"%s\"\n", res,
+                        version_url)
+                util.die(errmsg)
+            end
 
-        local body, _, err = dkjson.decode(res)
-        if err or (body and not body["etcdcluster"]) then
-            errmsg = str_format("got malformed version message: \"%s\" from etcd \"%s\"\n", res,
-                                version_url)
-            util.die(errmsg)
-        end
+            local cluster_version = body["etcdcluster"]
+            if compare_semantic_version(cluster_version, env.min_etcd_version) then
+                util.die("etcd cluster version ", cluster_version,
+                         " is less than the required version ", env.min_etcd_version,
+                         ", please upgrade your etcd cluster\n")
+            end
 
-        local cluster_version = body["etcdcluster"]
-        if compare_semantic_version(cluster_version, env.min_etcd_version) then
-            util.die("etcd cluster version ", cluster_version,
-                     " is less than the required version ",
-                     env.min_etcd_version,
-                     ", please upgrade your etcd cluster\n")
+            table_insert(etcd_healthy_hosts, host)
+        else
+            io_stderr:write(str_format("request etcd endpoint \'%s\' error, %s\n", version_url,
+                    err))
         end
     end
 
+    if #etcd_healthy_hosts <= 0 then
+        util.die("all etcd nodes are unavailable\n")
+    end
+
+    if (#etcd_healthy_hosts / host_count * 100) <= 50 then
+        util.die("the etcd cluster needs at least 50% and above healthy nodes\n")
+    end
+
     local etcd_ok = false
-    for index, host in ipairs(yaml_conf.etcd.host) do
+    for index, host in ipairs(etcd_healthy_hosts) do
         local is_success = true
 
         local errmsg
@@ -358,7 +370,7 @@ function _M.init(env, args)
     end
 
     if not etcd_ok then
-        util.die("none of the configured etcd works well")
+        util.die("none of the configured etcd works well\n")
     end
 end
 

diff --git a/t/cli/test_etcd_healthcheck.sh b/t/cli/test_etcd_healthcheck.sh
@@ -41,7 +41,7 @@ etcd:
 
 docker-compose -f ./t/cli/docker-compose-etcd-cluster.yaml up -d
 
-# Check apisix not got effected when one etcd node disconnected
+# case 1: check that apisix is not affected when one etcd node is disconnected
 make init && make run
 
 docker stop ${ETCD_NAME_0}
@@ -69,7 +69,7 @@ make stop
 
 echo "passed: apisix not got effected when one etcd node disconnected"
 
-# Check when all etcd nodes disconnected, apisix trying to reconnect with backoff, and could successfully recover when reconnected
+# case 2: check that when all etcd nodes are disconnected, apisix retries the connection with backoff and recovers successfully once they are reconnected
 make init && make run
 
 docker stop ${ETCD_NAME_0} && docker stop ${ETCD_NAME_1} && docker stop ${ETCD_NAME_2}
@@ -84,7 +84,7 @@ fi
 
 docker start ${ETCD_NAME_0} && docker start ${ETCD_NAME_1} && docker start ${ETCD_NAME_2}
 
-# sleep till etcd health check try to check again
+# case 3: sleep until the etcd health check runs again
 current_time=$(date +%s)
 sleep_seconds=$(( $sleep_till - $current_time + 3))
 if [ "$sleep_seconds" -gt 0 ]; then
@@ -102,3 +102,39 @@ fi
 make stop
 
 echo "passed: when all etcd nodes disconnected, apisix trying to reconnect with backoff, and could successfully recover when reconnected"
+
+# case 4: stop one etcd node (result: start successful)
+docker stop ${ETCD_NAME_0}
+
+out=$(make init 2>&1)
+if echo "$out" | grep "23790" | grep "connection refused"; then
+    echo "passed: APISIX successfully to start, stop only one etcd node"
+else
+    echo "failed: stop only one etcd node APISIX should start normally"
+    exit 1
+fi
+
+# case 5: stop two etcd nodes (result: start failure)
+docker stop ${ETCD_NAME_1}
+
+out=$(make init 2>&1 || true)
+if echo "$out" | grep "23791" | grep "connection refused"; then
+    echo "passed: APISIX failed to start, etcd cluster must have two or more healthy nodes"
+else
+    echo "failed: two etcd nodes have been stopped, APISIX should fail to start"
+    exit 1
+fi
+
+# case 6: stop all etcd nodes (result: start failure)
+docker stop ${ETCD_NAME_2}
+
+out=$(make init 2>&1 || true)
+if echo "$out" | grep "23792" | grep "connection refused"; then
+    echo "passed: APISIX failed to start, all etcd nodes have stopped"
+else
+    echo "failed: all etcd nodes have stopped, APISIX should not be able to start"
+    exit 1
+fi
+
+# stop etcd docker container
+docker-compose -f ./t/cli/docker-compose-etcd-cluster.yaml down