Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve logs when there is a timeout error #1946

Merged
merged 2 commits into from
Mar 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions pkg/agent/apiserver/apiserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"fmt"
"io/ioutil"
"net"
"net/http"
"os"
"path"

Expand All @@ -28,6 +29,7 @@ import (
k8sversion "k8s.io/apimachinery/pkg/version"
"k8s.io/apiserver/pkg/registry/rest"
genericapiserver "k8s.io/apiserver/pkg/server"
"k8s.io/apiserver/pkg/server/healthz"
genericoptions "k8s.io/apiserver/pkg/server/options"

"github.com/vmware-tanzu/antrea/pkg/agent/apiserver/handlers/addressgroup"
Expand Down Expand Up @@ -93,7 +95,7 @@ func installAPIGroup(s *genericapiserver.GenericAPIServer, aq agentquerier.Agent
// New creates an APIServer for running in antrea agent.
func New(aq agentquerier.AgentQuerier, npq querier.AgentNetworkPolicyInfoQuerier, bindPort int,
enableMetrics bool, kubeconfig string, cipherSuites []uint16, tlsMinVersion uint16) (*agentAPIServer, error) {
cfg, err := newConfig(bindPort, enableMetrics, kubeconfig)
cfg, err := newConfig(npq, bindPort, enableMetrics, kubeconfig)
if err != nil {
return nil, err
}
Expand All @@ -110,7 +112,7 @@ func New(aq agentquerier.AgentQuerier, npq querier.AgentNetworkPolicyInfoQuerier
return &agentAPIServer{GenericAPIServer: s}, nil
}

func newConfig(bindPort int, enableMetrics bool, kubeconfig string) (*genericapiserver.CompletedConfig, error) {
func newConfig(npq querier.AgentNetworkPolicyInfoQuerier, bindPort int, enableMetrics bool, kubeconfig string) (*genericapiserver.CompletedConfig, error) {
secureServing := genericoptions.NewSecureServingOptions().WithLoopback()
authentication := genericoptions.NewDelegatingAuthenticationOptions()
authorization := genericoptions.NewDelegatingAuthorizationOptions().WithAlwaysAllowPaths("/healthz", "/livez", "/readyz")
Expand Down Expand Up @@ -155,6 +157,14 @@ func newConfig(bindPort int, enableMetrics bool, kubeconfig string) (*genericapi
GitCommit: antreaversion.GetGitSHA(),
}
serverConfig.EnableMetrics = enableMetrics
// Add readiness probe to check the status of watchers.
check := healthz.NamedCheck("watcher", func(_ *http.Request) error {
if npq.GetControllerConnectionStatus() {
return nil
}
return fmt.Errorf("some watchers may not be connected")
})
Comment on lines +160 to +166
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I test the probe, I get `[-]watcher failed: reason withheld`.

`reason withheld` is not what I'd expect here; shouldn't it say `some watchers may not be connected`?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

serverConfig.ReadyzChecks = append(serverConfig.ReadyzChecks, check)

completedServerCfg := serverConfig.Complete(nil)
return &completedServerCfg, nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package networkpolicy
import (
"context"
"fmt"
"reflect"
"sync"
"time"

Expand Down Expand Up @@ -44,6 +45,8 @@ const (
defaultWorkers = 4
)

var emptyWatch = watch.NewEmptyWatch()

// Controller is responsible for watching Antrea AddressGroups, AppliedToGroups,
// and NetworkPolicies, feeding them to ruleCache, getting dirty rules from
// ruleCache, invoking reconciler to reconcile them.
Expand Down Expand Up @@ -584,6 +587,12 @@ func (w *watcher) watch() {
klog.Warningf("Failed to start watch for %s: %v", w.objectType, err)
return
}
// The Watch method doesn't return an error but an "emptyWatch" in case of some partial data errors,
// e.g. a timeout error. Make sure that the watcher is not empty, and log a warning otherwise.
if reflect.TypeOf(watcher) == reflect.TypeOf(emptyWatch) {
klog.Warningf("Failed to start watch for %s, please ensure antrea service is reachable for the agent", w.objectType)
return
}

klog.Infof("Started watch for %s", w.objectType)
w.setConnected(true)
Expand Down