Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework running Fence Agent command #106

Merged
merged 28 commits into from
Dec 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
fedd5b9
Move UpdateConditions to utils package
clobrano Nov 24, 2023
1283aa2
Test refactoring
clobrano Nov 24, 2023
68c3f54
Run Fence Agent asynchronously
clobrano Nov 24, 2023
c8e924f
Add configurable Fence Agent command retries and timeout
clobrano Nov 30, 2023
01fe6c6
Drop log checks in E2E tests
clobrano Nov 30, 2023
064ce1d
Cancel fence agent goroutine and cleanup Executer map
clobrano Dec 1, 2023
356712a
Use ExponentialBackoff to handle Executor status updates retries
clobrano Dec 2, 2023
a551225
Fix typos
clobrano Nov 24, 2023
6f54df9
Fixes and improvements of cliexecuter
clobrano Dec 14, 2023
4b293a9
Refactor AsyncRun
clobrano Dec 14, 2023
6f86042
Reduce scope of test variable
clobrano Dec 14, 2023
2ab0015
Fix typo
clobrano Dec 14, 2023
ca34acb
Delay cleanup executor
clobrano Dec 14, 2023
b63a143
Reduce number of retryCount
clobrano Dec 14, 2023
f666a11
Merge branch 'main' into rework-fa-exec-async/1
clobrano Dec 15, 2023
75a7c39
Remove unused E2E functions
clobrano Dec 18, 2023
0f94a55
Do not use acronym for fance agent in condition message
clobrano Dec 18, 2023
15d4e8f
Reorder UpdateConditions switch case for clarity
clobrano Dec 18, 2023
13223d6
Fix typos and documentation in controller tests
clobrano Dec 18, 2023
12985e9
Refactor common test code in single functions
clobrano Dec 18, 2023
0a8e9c0
Fix typo in test message
clobrano Dec 18, 2023
281dd59
Minor refactor of updateStatusWithRetry error handling
clobrano Dec 19, 2023
8bb393c
Use const fence agent messages to help unittests
clobrano Dec 19, 2023
9ab9ee0
Do not expose full fence agent command line
clobrano Dec 19, 2023
682b894
Move NewFakeExecutor into a separate file
clobrano Dec 20, 2023
46dd63c
Fix: Retry if Executer status update fails for Conflict
clobrano Dec 20, 2023
a1ef1c1
Remove incorrect comment
clobrano Dec 20, 2023
fee00a4
Rephrased error messages in updateStatusWithRetry
clobrano Dec 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 19 additions & 8 deletions api/v1alpha1/fenceagentsremediation_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,6 @@ const (
FARFinalizer string = "fence-agents-remediation.medik8s.io/far-finalizer"
// Taints
FARNoExecuteTaintKey = "medik8s.io/fence-agents-remediation"
// FenceAgentActionSucceededType is the condition type used to signal whether the Fence Agent action was succeeded successfully or not
FenceAgentActionSucceededType = "FenceAgentActionSucceeded"
// condition messages
RemediationFinishedNodeNotFoundConditionMessage = "FAR CR name doesn't match a node name"
RemediationInterruptedByNHCConditionMessage = "Node Healthcheck timeout annotation has been set"
RemediationStartedConditionMessage = "FAR CR was found, its name matches one of the cluster nodes, and a finalizer was set to the CR"
FenceAgentSucceededConditionMessage = "FAR taint was added and the fence agent command has been created and executed successfully"
RemediationFinishedSuccessfullyConditionMessage = "The unhealthy node was fully remediated (it was tainted, fenced using FA and all the node resources have been deleted)"
)

// ConditionsChangeReason represents the reason of updating the some or all the conditions
Expand Down Expand Up @@ -63,6 +55,25 @@ type FenceAgentsRemediationSpec struct {
//+operator-sdk:csv:customresourcedefinitions:type=spec
Agent string `json:"agent"`

// RetryCount is the number of times the fencing agent will be executed
//+kubebuilder:default:=5
//+operator-sdk:csv:customresourcedefinitions:type=spec
RetryCount int `json:"retrycount,omitempty"`

// RetryInterval is the interval between each fencing agent execution
//+kubebuilder:default:="5s"
//+kubebuilder:validation:Pattern="^([0-9]+(\\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$"
//+kubebuilder:validation:Type=string
//+operator-sdk:csv:customresourcedefinitions:type=spec
RetryInterval metav1.Duration `json:"retryinterval,omitempty"`

// Timeout is the timeout for each fencing agent execution
//+kubebuilder:default:="60s"
//+kubebuilder:validation:Pattern="^([0-9]+(\\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$"
//+kubebuilder:validation:Type=string
//+operator-sdk:csv:customresourcedefinitions:type=spec
Timeout metav1.Duration `json:"timeout,omitempty"`

// SharedParameters are passed to the fencing agent regardless of which node is about to be fenced (i.e., they are common for all the nodes)
//+operator-sdk:csv:customresourcedefinitions:type=spec
SharedParameters map[ParameterName]string `json:"sharedparameters,omitempty"`
Expand Down
2 changes: 2 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,19 @@ spec:
node that is fenced, since they are node specific
displayName: Node Parameters
path: nodeparameters
- description: RetryCount is the number of times the fencing agent will be executed
displayName: Retry Count
path: retrycount
- description: RetryInterval is the interval between each fencing agent execution
displayName: Retry Interval
path: retryinterval
- description: SharedParameters are passed to the fencing agent regardless of
which node is about to be fenced (i.e., they are common for all the nodes)
displayName: Shared Parameters
path: sharedparameters
- description: Timeout is the timeout for each fencing agent execution
displayName: Timeout
path: timeout
statusDescriptors:
- description: 'Represents the observations of a FenceAgentsRemediation''s current
state. Known .status.conditions.type are: "Processing", "FenceAgentActionSucceeded",
Expand Down Expand Up @@ -112,10 +121,19 @@ spec:
node that is fenced, since they are node specific
displayName: Node Parameters
path: template.spec.nodeparameters
- description: RetryCount is the number of times the fencing agent will be executed
displayName: Retry Count
path: template.spec.retrycount
- description: RetryInterval is the interval between each fencing agent execution
displayName: Retry Interval
path: template.spec.retryinterval
- description: SharedParameters are passed to the fencing agent regardless of
which node is about to be fenced (i.e., they are common for all the nodes)
displayName: Shared Parameters
path: template.spec.sharedparameters
- description: Timeout is the timeout for each fencing agent execution
displayName: Timeout
path: template.spec.timeout
version: v1alpha1
description: "Fence Agents Remediation Operator (FAR)\n\nThe operator will remediate/fence
a node when a FenceAgentsRemediation (far) custom resource exists with the node's
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,29 @@ spec:
description: NodeParameters are passed to the fencing agent according
to the node that is fenced, since they are node specific
type: object
retrycount:
default: 5
description: RetryCount is the number of times the fencing agent will
be executed
type: integer
retryinterval:
default: 5s
description: RetryInterval is the interval between each fencing agent
execution
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
sharedparameters:
additionalProperties:
type: string
description: SharedParameters are passed to the fencing agent regardless
of which node is about to be fenced (i.e., they are common for all
the nodes)
type: object
timeout:
default: 60s
description: Timeout is the timeout for each fencing agent execution
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
required:
- agent
type: object
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,30 @@ spec:
according to the node that is fenced, since they are node
specific
type: object
retrycount:
default: 5
description: RetryCount is the number of times the fencing
agent will be executed
type: integer
retryinterval:
default: 5s
description: RetryInterval is the interval between each fencing
agent execution
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
sharedparameters:
additionalProperties:
type: string
description: SharedParameters are passed to the fencing agent
regardless of which node is about to be fenced (i.e., they
are common for all the nodes)
type: object
timeout:
default: 60s
description: Timeout is the timeout for each fencing agent
execution
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
required:
- agent
type: object
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,29 @@ spec:
description: NodeParameters are passed to the fencing agent according
to the node that is fenced, since they are node specific
type: object
retrycount:
default: 5
description: RetryCount is the number of times the fencing agent will
be executed
type: integer
retryinterval:
default: 5s
description: RetryInterval is the interval between each fencing agent
execution
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
sharedparameters:
additionalProperties:
type: string
description: SharedParameters are passed to the fencing agent regardless
of which node is about to be fenced (i.e., they are common for all
the nodes)
type: object
timeout:
default: 60s
description: Timeout is the timeout for each fencing agent execution
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
required:
- agent
type: object
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,30 @@ spec:
according to the node that is fenced, since they are node
specific
type: object
retrycount:
default: 5
description: RetryCount is the number of times the fencing
agent will be executed
type: integer
retryinterval:
default: 5s
description: RetryInterval is the interval between each fencing
agent execution
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
sharedparameters:
additionalProperties:
type: string
description: SharedParameters are passed to the fencing agent
regardless of which node is about to be fenced (i.e., they
are common for all the nodes)
type: object
timeout:
default: 60s
description: Timeout is the timeout for each fencing agent
execution
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
required:
- agent
type: object
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,19 @@ spec:
node that is fenced, since they are node specific
displayName: Node Parameters
path: nodeparameters
- description: RetryCount is the number of times the fencing agent will be executed
displayName: Retry Count
path: retrycount
- description: RetryInterval is the interval between each fencing agent execution
displayName: Retry Interval
path: retryinterval
- description: SharedParameters are passed to the fencing agent regardless of
which node is about to be fenced (i.e., they are common for all the nodes)
displayName: Shared Parameters
path: sharedparameters
- description: Timeout is the timeout for each fencing agent execution
displayName: Timeout
path: timeout
statusDescriptors:
- description: 'Represents the observations of a FenceAgentsRemediation''s current
state. Known .status.conditions.type are: "Processing", "FenceAgentActionSucceeded",
Expand Down Expand Up @@ -71,10 +80,19 @@ spec:
node that is fenced, since they are node specific
displayName: Node Parameters
path: template.spec.nodeparameters
- description: RetryCount is the number of times the fencing agent will be executed
displayName: Retry Count
path: template.spec.retrycount
- description: RetryInterval is the interval between each fencing agent execution
displayName: Retry Interval
path: template.spec.retryinterval
- description: SharedParameters are passed to the fencing agent regardless of
which node is about to be fenced (i.e., they are common for all the nodes)
displayName: Shared Parameters
path: template.spec.sharedparameters
- description: Timeout is the timeout for each fencing agent execution
displayName: Timeout
path: template.spec.timeout
version: v1alpha1
description: "Fence Agents Remediation Operator (FAR)\n\nThe operator will remediate/fence
a node when a FenceAgentsRemediation (far) custom resource exists with the node's
Expand Down
84 changes: 75 additions & 9 deletions controllers/controllers_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@ package controllers

import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"testing"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
Expand All @@ -35,31 +38,79 @@ import (

//+kubebuilder:scaffold:imports ## https://github.com/kubernetes-sigs/kubebuilder/issues/1487 ?
"github.com/medik8s/fence-agents-remediation/api/v1alpha1"
"github.com/medik8s/fence-agents-remediation/pkg/cli"
)

// These tests use Ginkgo (BDD-style Go testing framework). Refer to
// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.

const defaultNamespace = "default"

var k8sClient client.Client
var k8sManager manager.Manager
var testEnv *envtest.Environment
var ctx context.Context
var cancel context.CancelFunc
var mocksExecuter *mockExecuter
var (
k8sClient client.Client
k8sManager manager.Manager
testEnv *envtest.Environment
ctx context.Context
cancel context.CancelFunc

plogs *peekLogger

storedCommand []string
mockError error
forcedDelay time.Duration
)

// peekLogger allows to inspect operator's log for testing purpose.
type peekLogger struct {
logs []string
}

func (p *peekLogger) Write(b []byte) (n int, err error) {
n, err = GinkgoWriter.Write(b)
if err != nil {
return n, err
}
p.logs = append(p.logs, string(b))
return n, err
}

func (p *peekLogger) Contains(s string) bool {
for _, log := range p.logs {
if strings.Contains(log, s) {
return true
}
}
return false
}

// CountOccurences returns number of occurences of string s in logs.
func (p *peekLogger) CountOccurences(s string) int {
count := 0
for _, log := range p.logs {
if strings.Contains(log, s) {
count++
}
}
return count
}

func (p *peekLogger) Clear() {
p.logs = make([]string, 0)
}

func TestControllers(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Controllers Suite")
}

var _ = BeforeSuite(func() {
plogs = &peekLogger{logs: make([]string, 0)}
opts := zap.Options{
DestWriter: plogs,
Development: true,
TimeEncoder: zapcore.RFC3339NanoTimeEncoder,
}
logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseFlagOptions(&opts)))
logf.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
clobrano marked this conversation as resolved.
Show resolved Hide resolved

By("bootstrapping test environment")
testEnv = &envtest.Environment{
Expand All @@ -82,15 +133,16 @@ var _ = BeforeSuite(func() {
k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
Expect(err).NotTo(HaveOccurred())
Expect(k8sClient).NotTo(BeNil())
mocksExecuter = newMockExecuter()

executor, _ := cli.NewFakeExecuter(k8sClient, controlledRun)

os.Setenv("DEPLOYMENT_NAMESPACE", defaultNamespace)

err = (&FenceAgentsRemediationReconciler{
Client: k8sClient,
Log: k8sManager.GetLogger().WithName("test far reconciler"),
Scheme: k8sManager.GetScheme(),
Executor: mocksExecuter,
Executor: executor,
}).SetupWithManager(k8sManager)
Expect(err).NotTo(HaveOccurred())

Expand All @@ -108,3 +160,17 @@ var _ = AfterSuite(func() {
err := testEnv.Stop()
Expect(err).NotTo(HaveOccurred())
})

func controlledRun(ctx context.Context, command []string) (stdout, stderr string, err error) {
storedCommand = command
if forcedDelay > 0 {
select {
case <-ctx.Done():
return "", "", fmt.Errorf("context expired")
case <-time.After(forcedDelay):
log.Info("forced delay expired", "delay", forcedDelay)
}
}

return "", "", mockError
}
Loading