Skip to content

Commit

Permalink
Set Machine Phase to Terminating & make drain calculations based on d…
Browse files Browse the repository at this point in the history
…eletionTimestamp (gardener#564)

* Fixes issues with machines being force drained

The machines were being force drained during race conditions due to PR gardener#492.
This commit fixes that by
	- Make the drain calculation based on deletion timestamp
	- Reverted logic from PR gardener#492 to first set machine Termination phase
  • Loading branch information
prashanth26 committed Nov 24, 2020
1 parent 7a45ca1 commit c054e44
Show file tree
Hide file tree
Showing 8 changed files with 277 additions and 38 deletions.
3 changes: 3 additions & 0 deletions pkg/controller/controller_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,8 @@ func newMachines(
labels = make(map[string]string, 0)
}

currentTime := metav1.Now()

for i := range machines {
m := &v1alpha1.Machine{
TypeMeta: metav1.TypeMeta{
Expand All @@ -324,6 +326,7 @@ func newMachines(
Labels: labels,
Annotations: annotations,
CreationTimestamp: metav1.Now(),
DeletionTimestamp: &currentTime,
},
Spec: *newMachineSpec(&specTemplate.Spec, i),
}
Expand Down
74 changes: 40 additions & 34 deletions pkg/controller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import (
"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
"github.com/gardener/machine-controller-manager/pkg/apis/machine/validation"
"github.com/gardener/machine-controller-manager/pkg/driver"
utiltime "github.com/gardener/machine-controller-manager/pkg/util/time"
)

const (
Expand Down Expand Up @@ -624,9 +625,37 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
forceDeleteLabelPresent = machine.Labels["force-deletion"] == "True"
)

// Timeout value obtained by subtracting last operation with expected time out period
timeOut := metav1.Now().Add(-timeOutDuration).Sub(machine.Status.CurrentStatus.LastUpdateTime.Time)
timeOutOccurred = timeOut > 0
if machine.Status.CurrentStatus.Phase != v1alpha1.MachineTerminating {
lastOperation := v1alpha1.LastOperation{
Description: "Deleting machine from cloud provider",
State: v1alpha1.MachineStateProcessing,
Type: v1alpha1.MachineOperationDelete,
LastUpdateTime: metav1.Now(),
}
currentStatus := v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineTerminating,
TimeoutActive: false,
LastUpdateTime: metav1.Now(),
}

err = c.deleteBootstrapToken(machine.Name)
if err != nil {
klog.Warning(err)
}

machine, err = c.updateMachineStatus(machine, lastOperation, currentStatus)
if err != nil && apierrors.IsNotFound(err) {
// Object no longer exists and has been deleted
klog.Warning(err)
return nil
} else if err != nil {
// Any other type of errors
klog.Error(err)
return err
}
}

timeOutOccurred = utiltime.HasTimeOutOccurred(*machine.DeletionTimestamp, timeOutDuration)

if forceDeleteLabelPresent || timeOutOccurred {
// To perform forceful machine drain/delete either one of the below conditions must be satified
Expand All @@ -638,11 +667,18 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
maxEvictRetries = 1

klog.V(2).Infof(
"Force deletion has been triggerred for machine %q due to ForceDeletionLabel:%t, Timeout:%t",
"Force delete/drain has been triggerred for machine %q due to Label:%t, timeout:%t",
machine.Name,
forceDeleteLabelPresent,
timeOutOccurred,
)
} else {
klog.V(2).Infof(
"Normal delete/drain has been triggerred for machine %q with drain-timeout:%v & maxEvictRetries:%d",
machine.Name,
timeOutDuration,
maxEvictRetries,
)
}

// If machine was created on the cloud provider
Expand All @@ -660,36 +696,6 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
}
}

if machine.Status.CurrentStatus.Phase != v1alpha1.MachineTerminating {
lastOperation := v1alpha1.LastOperation{
Description: "Deleting machine from cloud provider",
State: v1alpha1.MachineStateProcessing,
Type: v1alpha1.MachineOperationDelete,
LastUpdateTime: metav1.Now(),
}
currentStatus := v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineTerminating,
TimeoutActive: false,
LastUpdateTime: metav1.Now(),
}

err = c.deleteBootstrapToken(machine.Name)
if err != nil {
klog.Warning(err)
}

machine, err = c.updateMachineStatus(machine, lastOperation, currentStatus)
if err != nil && apierrors.IsNotFound(err) {
// Object no longer exists and has been deleted
klog.Warning(err)
return nil
} else if err != nil {
// Any other type of errors
klog.Error(err)
return err
}
}

if machineID != "" && nodeName != "" {
// Begin drain logic only when the nodeName & providerID exist's for the machine
buf := bytes.NewBuffer([]byte{})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ func newMachines(
Labels: labels,
Annotations: annotations,
CreationTimestamp: creationTimestamp,
DeletionTimestamp: &creationTimestamp, //TODO: Add parametrize this
},
Spec: *newMachineSpec(&specTemplate.Spec, i),
}
Expand Down
111 changes: 110 additions & 1 deletion pkg/util/provider/machinecontroller/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1684,6 +1684,115 @@ var _ = Describe("machine", func() {
),
},
}),
Entry("Drain machine failure before drain timeout, hence deletion fails", &data{
setup: setup{
secrets: []*corev1.Secret{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
},
},
machineClasses: []*v1alpha1.MachineClass{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
SecretRef: newSecretReference(objMeta, 0),
},
},
machines: newMachines(
1,
&v1alpha1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: v1alpha1.MachineSpec{
Class: v1alpha1.ClassSpec{
Kind: "MachineClass",
Name: "machine-0",
},
ProviderID: "fakeID",
},
},
&v1alpha1.MachineStatus{
Node: "fakeNode",
CurrentStatus: v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineTerminating,
LastUpdateTime: metav1.NewTime(time.Now().Add(-3 * time.Minute)),
},
LastOperation: v1alpha1.LastOperation{
Description: machineutils.InitiateDrain,
State: v1alpha1.MachineStateProcessing,
Type: v1alpha1.MachineOperationDelete,
LastUpdateTime: metav1.NewTime(time.Now().Add(-3 * time.Minute)),
},
},
nil,
map[string]string{
machineutils.MachinePriority: "3",
},
map[string]string{
"node": "fakeID-0",
},
true,
metav1.NewTime(time.Now().Add(-3*time.Minute)),
),
nodes: []*corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "fakeID-0",
},
},
},
fakeResourceActions: &customfake.ResourceActions{
Node: customfake.Actions{
Update: "Failed to update node",
},
},
},
action: action{
machine: "machine-0",
fakeDriver: &driver.FakeDriver{
VMExists: true,
ProviderID: "fakeID-0",
NodeName: "fakeNode-0",
Err: nil,
},
},
expect: expect{
err: fmt.Errorf("failed to create update conditions for node \"fakeID-0\": Failed to update node"),
retry: machineutils.ShortRetry,
machine: newMachine(
&v1alpha1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: v1alpha1.MachineSpec{
Class: v1alpha1.ClassSpec{
Kind: "MachineClass",
Name: "machine-0",
},
ProviderID: "fakeID",
},
},
&v1alpha1.MachineStatus{
Node: "fakeNode",
CurrentStatus: v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineTerminating,
LastUpdateTime: metav1.Now(),
},
LastOperation: v1alpha1.LastOperation{
Description: fmt.Sprintf("Drain failed due to failure in update of node conditions - %s. Will retry in next sync. %s", "failed to create update conditions for node \"fakeID-0\": Failed to update node", machineutils.InitiateDrain),
State: v1alpha1.MachineStateFailed,
Type: v1alpha1.MachineOperationDelete,
LastUpdateTime: metav1.Now(),
},
},
nil,
map[string]string{
machineutils.MachinePriority: "3",
},
map[string]string{
"node": "fakeID-0",
},
true,
metav1.Now(),
),
},
}),
Entry("Drain machine failure after drain timeout, hence deletion continues", &data{
setup: setup{
secrets: []*corev1.Secret{
Expand Down Expand Up @@ -1730,7 +1839,7 @@ var _ = Describe("machine", func() {
"node": "fakeID-0",
},
true,
metav1.Now(),
metav1.NewTime(time.Now().Add(-3*time.Hour)),
),
nodes: []*corev1.Node{
{
Expand Down
5 changes: 2 additions & 3 deletions pkg/util/provider/machinecontroller/machine_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status"
"github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils"
utilstrings "github.com/gardener/machine-controller-manager/pkg/util/strings"
utiltime "github.com/gardener/machine-controller-manager/pkg/util/time"

corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -912,9 +913,7 @@ func (c *controller) drainNode(deleteMachineRequest *driver.DeleteMachineRequest
if skipDrain {
state = v1alpha1.MachineStateProcessing
} else {
// Timeout value obtained by subtracting last operation with expected time out period
timeOut := metav1.Now().Add(-timeOutDuration).Sub(machine.Status.CurrentStatus.LastUpdateTime.Time)
timeOutOccurred = timeOut > 0
timeOutOccurred = utiltime.HasTimeOutOccurred(*machine.DeletionTimestamp, timeOutDuration)

if forceDeleteLabelPresent || timeOutOccurred {
// To perform forceful machine drain/delete either one of the below conditions must be satified
Expand Down
31 changes: 31 additions & 0 deletions pkg/util/time/time.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
Copyright (c) 2020 SAP SE or an SAP affiliate company. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package time is used to provide the core functionalities of machine-controller-manager
package time

import (
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// HasTimeOutOccurred returns true, when time.Now() is more than time + period
func HasTimeOutOccurred(timeStamp metav1.Time, period time.Duration) bool {
// Timeout value obtained by subtracting last operation with expected time out period
timeOut := metav1.Now().Add(-period).Sub(timeStamp.Time)
return timeOut > 0
}
13 changes: 13 additions & 0 deletions pkg/util/time/time_suite_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package time_test

import (
"testing"

. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)

func TestTime(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Time Suite")
}
77 changes: 77 additions & 0 deletions pkg/util/time/time_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
Copyright (c) 2017 SAP SE or an SAP affiliate company. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package time

import (
"time"

. "github.com/onsi/ginkgo"
. "github.com/onsi/ginkgo/extensions/table"
. "github.com/onsi/gomega"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

var _ = Describe("time", func() {
Describe("#hasTimeOutOccurred", func() {
type setup struct {
timeStamp metav1.Time
period time.Duration
}
type action struct {
}
type expect struct {
timeOutOccurred bool
}
type data struct {
setup setup
action action
expect expect
}
DescribeTable("##TimeOut scenarios",
func(data *data) {
timeOutOccurred := HasTimeOutOccurred(data.setup.timeStamp, data.setup.period)
Expect(timeOutOccurred).To(Equal(data.expect.timeOutOccurred))
},
Entry("Time stamp is one hour ago and period is 30mins", &data{
setup: setup{
timeStamp: metav1.Time{Time: time.Now().Add(-time.Hour)},
period: 30 * time.Minute,
},
expect: expect{
timeOutOccurred: true,
},
}),
Entry("Time stamp is one hour ago and period is 90mins", &data{
setup: setup{
timeStamp: metav1.Time{Time: time.Now().Add(-time.Hour)},
period: 90 * time.Minute,
},
expect: expect{
timeOutOccurred: false,
},
}),
Entry("Time stamp is now and period is 5mins", &data{
setup: setup{
timeStamp: metav1.Time{Time: time.Now()},
period: 5 * time.Minute,
},
expect: expect{
timeOutOccurred: false,
},
}),
)
})
})

0 comments on commit c054e44

Please sign in to comment.