From 966ca586637f95664f349cf894fd081ed4dfdb2e Mon Sep 17 00:00:00 2001 From: hty Date: Tue, 16 Mar 2021 19:02:50 +0800 Subject: [PATCH 1/2] Improve logs when there is a timeout error The agent logs will print the failure message when the connection to the controller is timeout. Also adding readiness probe to remind the users of the connectivity issue. Fixes #822 --- pkg/agent/apiserver/apiserver.go | 14 ++++++++++++-- .../networkpolicy/networkpolicy_controller.go | 9 +++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pkg/agent/apiserver/apiserver.go b/pkg/agent/apiserver/apiserver.go index 4b767d3c345..75152d68109 100644 --- a/pkg/agent/apiserver/apiserver.go +++ b/pkg/agent/apiserver/apiserver.go @@ -18,6 +18,7 @@ import ( "fmt" "io/ioutil" "net" + "net/http" "os" "path" @@ -28,6 +29,7 @@ import ( k8sversion "k8s.io/apimachinery/pkg/version" "k8s.io/apiserver/pkg/registry/rest" genericapiserver "k8s.io/apiserver/pkg/server" + "k8s.io/apiserver/pkg/server/healthz" genericoptions "k8s.io/apiserver/pkg/server/options" "github.com/vmware-tanzu/antrea/pkg/agent/apiserver/handlers/addressgroup" @@ -93,7 +95,7 @@ func installAPIGroup(s *genericapiserver.GenericAPIServer, aq agentquerier.Agent // New creates an APIServer for running in antrea agent. func New(aq agentquerier.AgentQuerier, npq querier.AgentNetworkPolicyInfoQuerier, bindPort int, enableMetrics bool, kubeconfig string, cipherSuites []uint16, tlsMinVersion uint16) (*agentAPIServer, error) { - cfg, err := newConfig(bindPort, enableMetrics, kubeconfig) + cfg, err := newConfig(npq, bindPort, enableMetrics, kubeconfig) if err != nil { return nil, err } @@ -110,7 +112,7 @@ func New(aq agentquerier.AgentQuerier, npq querier.AgentNetworkPolicyInfoQuerier return &agentAPIServer{GenericAPIServer: s}, nil } -func newConfig(bindPort int, enableMetrics bool, kubeconfig string) (*genericapiserver.CompletedConfig, error) { +func newConfig(npq querier.AgentNetworkPolicyInfoQuerier, bindPort int, enableMetrics bool, kubeconfig string) (*genericapiserver.CompletedConfig, error) { secureServing := genericoptions.NewSecureServingOptions().WithLoopback() authentication := genericoptions.NewDelegatingAuthenticationOptions() authorization := genericoptions.NewDelegatingAuthorizationOptions().WithAlwaysAllowPaths("/healthz", "/livez", "/readyz") @@ -155,6 +157,14 @@ func newConfig(bindPort int, enableMetrics bool, kubeconfig string) (*genericapi GitCommit: antreaversion.GetGitSHA(), } serverConfig.EnableMetrics = enableMetrics + // Add readiness probe to check the status of watchers. + check := healthz.NamedCheck("watcher", func(_ *http.Request) error { + if npq.GetControllerConnectionStatus() { + return nil + } + return fmt.Errorf("some watchers may not be connected") + }) + serverConfig.ReadyzChecks = append(serverConfig.ReadyzChecks, check) completedServerCfg := serverConfig.Complete(nil) return &completedServerCfg, nil diff --git a/pkg/agent/controller/networkpolicy/networkpolicy_controller.go b/pkg/agent/controller/networkpolicy/networkpolicy_controller.go index 38f18085248..877ce3f2f8f 100644 --- a/pkg/agent/controller/networkpolicy/networkpolicy_controller.go +++ b/pkg/agent/controller/networkpolicy/networkpolicy_controller.go @@ -17,6 +17,7 @@ package networkpolicy import ( "context" "fmt" + "reflect" "sync" "time" @@ -44,6 +45,8 @@ const ( defaultWorkers = 4 ) +var emptyWatch = watch.NewEmptyWatch() + // Controller is responsible for watching Antrea AddressGroups, AppliedToGroups, // and NetworkPolicies, feeding them to ruleCache, getting dirty rules from // ruleCache, invoking reconciler to reconcile them. @@ -584,6 +587,12 @@ func (w *watcher) watch() { klog.Warningf("Failed to start watch for %s: %v", w.objectType, err) return } + // Watch method doesn't return error but "emptyWatch" in case of some partial data errors, + // e.g. timeout error. Make sure that watcher is not empty and log warning otherwise. + if reflect.TypeOf(watcher) == reflect.TypeOf(emptyWatch) { + klog.Warningf("Failed to start watch for %s. Please ensure antrea service is reachable for the agent", w.objectType) + return + } klog.Infof("Started watch for %s", w.objectType) w.setConnected(true) From a3081a66a25049509c2f8240749a201cbfe6cc93 Mon Sep 17 00:00:00 2001 From: hty690 <35486302+hty690@users.noreply.github.com> Date: Thu, 18 Mar 2021 09:15:23 +0800 Subject: [PATCH 2/2] Update pkg/agent/controller/networkpolicy/networkpolicy_controller.go Co-authored-by: Antonin Bas --- pkg/agent/controller/networkpolicy/networkpolicy_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/agent/controller/networkpolicy/networkpolicy_controller.go b/pkg/agent/controller/networkpolicy/networkpolicy_controller.go index 877ce3f2f8f..5bcfe5742c1 100644 --- a/pkg/agent/controller/networkpolicy/networkpolicy_controller.go +++ b/pkg/agent/controller/networkpolicy/networkpolicy_controller.go @@ -590,7 +590,7 @@ func (w *watcher) watch() { // Watch method doesn't return error but "emptyWatch" in case of some partial data errors, // e.g. timeout error. Make sure that watcher is not empty and log warning otherwise. if reflect.TypeOf(watcher) == reflect.TypeOf(emptyWatch) { - klog.Warningf("Failed to start watch for %s. Please ensure antrea service is reachable for the agent", w.objectType) + klog.Warningf("Failed to start watch for %s, please ensure antrea service is reachable for the agent", w.objectType) return }