Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Set a maximum amount of time to wait for an image to be pulled #550

Merged
merged 2 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions pods/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,18 @@ import (
log "k8s.io/klog/v2"
)

// MaxPullTime is how long we will give a pod to start after seeing
// ErrImagePull.
var MaxPullTime = time.Minute * 15
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

15 minutes seems generous, but I suppose it's better than the current situation where it waits indefinitely!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made it public so it can be turned into a flag :-)

We have tests that fail if they take more than an hour. I decided 15 minutes was sufficient because a) the pull really probably will never succeed, and b) it's short enough that it won't accidentally time out a test.


// A Watcher watches pod and container updates.
type Watcher struct {
ctx context.Context
errCh chan error
wstop func()
cancel func()
podStates map[types.UID]string
podStart map[types.UID]time.Time
cStates map[string]string
ch chan *PodStatus
stdout io.Writer
Expand Down Expand Up @@ -54,6 +59,7 @@ func newWatcher(ctx context.Context, cancel func(), ch chan *PodStatus, stop fun
cancel: cancel,
stdout: os.Stdout,
podStates: map[types.UID]string{},
podStart: map[types.UID]time.Time{},
cStates: map[string]string{},
warningf: log.Warningf,
}
Expand Down Expand Up @@ -194,6 +200,17 @@ func (w *Watcher) updatePod(s *PodStatus) bool {
w.cancel()
return false
case c.Reason == "ErrImagePull":
if t, ok := w.podStart[s.UID]; ok {
if t.Add(MaxPullTime).Before(time.Now()) {
showContainer(s, &c, "PULL TIMED OUT")
w.warningf("%s: pull timed out after %v", fullName, MaxPullTime)
w.errCh <- fmt.Errorf("%s IMAGE:%s pull timed out after %v", fullName, c.Image, MaxPullTime)
w.cancel()
return false
}
} else {
w.podStart[s.UID] = time.Now()
}
showContainer(s, &c, c.Reason)
log.Infof("%s in ErrImagePull", fullName)
case c.Reason == "ImagePullBackOff":
Expand Down
44 changes: 44 additions & 0 deletions pods/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,50 @@ func TestUpdatePod(t *testing.T) {
}
}

// TestImagePullTimeout verifies that repeated ErrImagePull updates only
// become an error once the pod's recorded start time is older than
// MaxPullTime.
func TestImagePullTimeout(t *testing.T) {
	var out strings.Builder

	w := newWatcher(context.TODO(), func() {}, nil, func() {})
	w.stdout = &out
	w.SetProgress(true)
	const uid = types.UID("uid1")

	// receivedErr drains the watcher's error channel without blocking.
	receivedErr := func() error {
		select {
		case e := <-w.errCh:
			return e
		default:
		}
		return nil
	}

	if start, ok := w.podStart[uid]; ok {
		t.Fatalf("w.podStart[%v] is %v, want unset", uid, start)
	}
	status := &PodStatus{
		Name:       "pod",
		UID:        uid,
		Namespace:  "ns",
		Phase:      PodPending,
		Containers: []ContainerStatus{{Name: "cont", Reason: "ErrImagePull"}},
	}

	// First ErrImagePull: the start time is recorded, no error yet.
	w.updatePod(status)
	if err := receivedErr(); err != nil {
		t.Fatalf("unexpected error %v", err)
	}
	if _, ok := w.podStart[uid]; !ok {
		t.Fatalf("w.podStart[%v] is not set", uid)
	}

	// A second update within the deadline is still not an error.
	w.updatePod(status)
	if err := receivedErr(); err != nil {
		t.Fatalf("unexpected error %v", err)
	}

	// Backdate the start past MaxPullTime; the next update must time out.
	w.podStart[uid] = w.podStart[uid].Add(-(MaxPullTime + 1))
	w.updatePod(status)
	if diff := errdiff.Check(receivedErr(), "pull timed out after"); diff != "" {
		t.Error(diff)
	}
}

func TestStop(t *testing.T) {
canceled := false
cancel := func() { canceled = true }
Expand Down
Loading