diff --git a/pods/status.go b/pods/status.go
index ee0b0f4c..6f30310b 100644
--- a/pods/status.go
+++ b/pods/status.go
@@ -14,6 +14,10 @@ import (
 	log "k8s.io/klog/v2"
 )
 
+// MaxPullTime is how long a pod may keep reporting ErrImagePull, counted from
+// the first time that reason is seen, before the pull is treated as timed out.
+var MaxPullTime = time.Minute * 15
+
 // A Watcher watches pod and container updates.
 type Watcher struct {
 	ctx       context.Context
@@ -21,6 +25,7 @@ type Watcher struct {
 	wstop     func()
 	cancel    func()
 	podStates map[types.UID]string
+	podStart  map[types.UID]time.Time
 	cStates   map[string]string
 	ch        chan *PodStatus
 	stdout    io.Writer
@@ -54,6 +59,7 @@ func newWatcher(ctx context.Context, cancel func(), ch chan *PodStatus, stop fun
 		cancel:    cancel,
 		stdout:    os.Stdout,
 		podStates: map[types.UID]string{},
+		podStart:  map[types.UID]time.Time{},
 		cStates:   map[string]string{},
 		warningf:  log.Warningf,
 	}
@@ -194,6 +200,17 @@ func (w *Watcher) updatePod(s *PodStatus) bool {
 			w.cancel()
 			return false
 		case c.Reason == "ErrImagePull":
+			if t, ok := w.podStart[s.UID]; ok {
+				if t.Add(MaxPullTime).Before(time.Now()) {
+					showContainer(s, &c, "PULL TIMED OUT")
+					w.warningf("%s: pull timed out after %v", fullName, MaxPullTime)
+					w.errCh <- fmt.Errorf("%s IMAGE:%s pull timed out after %v", fullName, c.Image, MaxPullTime)
+					w.cancel()
+					return false
+				}
+			} else {
+				w.podStart[s.UID] = time.Now()
+			}
 			showContainer(s, &c, c.Reason)
 			log.Infof("%s in ErrImagePull", fullName)
 		case c.Reason == "ImagePullBackOff":
diff --git a/pods/status_test.go b/pods/status_test.go
index d6d61c9e..f127b4cd 100644
--- a/pods/status_test.go
+++ b/pods/status_test.go
@@ -200,6 +200,66 @@ func TestUpdatePod(t *testing.T) {
 	}
 }
 
+// TestImagePullTimeout checks that a pod reporting ErrImagePull only fails
+// the watcher once MaxPullTime has passed since the first ErrImagePull.
+func TestImagePullTimeout(t *testing.T) {
+	var buf strings.Builder
+
+	canceled := false
+	cancel := func() { canceled = true }
+	stop := func() {}
+
+	w := newWatcher(context.TODO(), cancel, nil, stop)
+	w.stdout = &buf
+	w.SetProgress(true)
+	const uid = types.UID("uid1")
+
+	// getErr returns the error pending on w.errCh, or nil without blocking
+	// when nothing has been sent.
+	getErr := func() error {
+		select {
+		case err := <-w.errCh:
+			return err
+		default:
+			return nil
+		}
+	}
+
+	if tm, ok := w.podStart[uid]; ok {
+		t.Fatalf("w.podStart[%v] is %v, want unset", uid, tm)
+	}
+	ps := &PodStatus{Name: "pod", UID: uid, Namespace: "ns", Phase: PodPending,
+		Containers: []ContainerStatus{{Name: "cont", Reason: "ErrImagePull"}},
+	}
+
+	// The first ErrImagePull only records when the pod entered that state.
+	w.updatePod(ps)
+	if err := getErr(); err != nil {
+		t.Fatalf("unexpected error %v", err)
+	}
+	if _, ok := w.podStart[uid]; !ok {
+		t.Fatalf("w.podStart[%v] is not set", uid)
+	}
+
+	// A repeated ErrImagePull inside MaxPullTime is still not an error.
+	w.updatePod(ps)
+	if err := getErr(); err != nil {
+		t.Fatalf("unexpected error %v", err)
+	}
+
+	// Backdate the recorded start so the next update is past the deadline.
+	w.podStart[uid] = w.podStart[uid].Add(-(MaxPullTime + 1))
+	if w.updatePod(ps) {
+		t.Errorf("updatePod returned true after pull timeout, want false")
+	}
+	if s := errdiff.Check(getErr(), "pull timed out after"); s != "" {
+		t.Error(s)
+	}
+	if !canceled {
+		t.Errorf("cancel was not called after pull timeout")
+	}
+}
+
 func TestStop(t *testing.T) {
 	canceled := false
 	cancel := func() { canceled = true }