Skip to content

Commit

Permalink
INSIGHTS-157 - PDB <> HPA check (#1057)
Browse files Browse the repository at this point in the history
* fix typo

* fix failure message

* fix changelog

* fix missingPodDisruptionBudget validation

* add tests for pdbMinAvailableLessThenHPAMaxReplicas

* add simple success test

* fix typo

* lowercasing warnings

* WIP implement pdbMinAvailableLessThanHPAMaxReplicas

* change check name

* rename testes

* fix check message

* change check name

* minor fixes

* improving tests

* improve tests

* fix check name

* Update docs/checks/reliability.md

Co-authored-by: Andy Suderman <[email protected]>

* fix/add tests

* fixes from PR

* fix error message

---------

Co-authored-by: Andy Suderman <[email protected]>
  • Loading branch information
vitorvezani and sudermanjr authored Jul 8, 2024
1 parent 875a8ff commit 952b6ae
Show file tree
Hide file tree
Showing 26 changed files with 708 additions and 23 deletions.
1 change: 1 addition & 0 deletions docs/checks/reliability.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ key | default | description
`topologySpreadConstraint` | `warning` | Fails when there is no topology spread constraint on the pod
`hpaMaxAvailability` | `warning` | Fails when `maxAvailable` lesser or equal than `minAvailable` (if defined) for a HorizontalPodAutoscaler
`hpaMinAvailability` | `warning` | Fails when `minAvailable` (if defined) lesser or equal to one for a HorizontalPodAutoscaler
`pdbMinAvailableGreaterThanHPAMinReplicas` | `warning` | Fails when PDB `minAvailable` is greater than HPA `minReplicas`

## Background

Expand Down
1 change: 1 addition & 0 deletions pkg/config/checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ var (
"rolebindingClusterAdminRole",
"hpaMaxAvailability",
"hpaMinAvailability",
"pdbMinAvailableGreaterThanHPAMinReplicas",
}

// BuiltInChecks contains the checks that come pre-installed w/ Polaris
Expand Down
12 changes: 4 additions & 8 deletions pkg/config/checks/missingPodDisruptionBudget.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,23 @@ controllers:
schema:
"$schema": http://json-schema.org/draft-07/schema#
type: object
required: [spec]
properties:
spec:
type: object
required: [template]
properties:
template:
type: object
required: [metadata]
properties:
metadata:
type: object
required: [labels]
properties:
labels:
type: object
minProperties: 1
required:
- labels
required:
- metadata
required:
- template
required:
- spec
additionalSchemaStrings:
policy/PodDisruptionBudget: |
type: object
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
successMessage: PDB and HPA are correctly configured
failureMessage: PDB minAvailable is greater than HPA minReplicas
category: Reliability
target: Controller
controllers:
include:
- Deployment
1 change: 1 addition & 0 deletions pkg/config/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ checks:
topologySpreadConstraint: warning
hpaMaxAvailability: warning
hpaMinAvailability: warning
pdbMinAvailableGreaterThanHPAMinReplicas: warning

# efficiency
cpuRequestsMissing: warning
Expand Down
1 change: 1 addition & 0 deletions pkg/config/examples/config-full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ checks:
metadataAndInstanceMismatched: warning
hpaMaxAvailability: warning
hpaMinAvailability: warning
pdbMinAvailableGreaterThanHPAMinReplicas: warning

# efficiency
cpuRequestsMissing: warning
Expand Down
10 changes: 5 additions & 5 deletions pkg/kube/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ func resolveControllerFromPod(ctx context.Context, podResource kubeAPICoreV1.Pod
err = cacheAllObjectsOfKind(ctx, firstOwner.APIVersion, firstOwner.Kind, dynamicClient, restMapper, objectCache)
}
if err != nil {
logrus.Warnf("Error caching objects of Kind %s %v", firstOwner.Kind, err)
logrus.Warnf("error caching objects of Kind %s %v", firstOwner.Kind, err)
break
}
abstractObject, ok = objectCache[key]
Expand All @@ -193,7 +193,7 @@ func resolveControllerFromPod(ctx context.Context, podResource kubeAPICoreV1.Pod

objMeta, err := meta.Accessor(&abstractObject)
if err != nil {
logrus.Warnf("Error retrieving parent metadata %s of API %s and Kind %s because of error: %v ", firstOwner.Name, firstOwner.APIVersion, firstOwner.Kind, err)
logrus.Warnf("error retrieving parent metadata %s of API %s and Kind %s because of error: %v ", firstOwner.Name, firstOwner.APIVersion, firstOwner.Kind, err)
return GenericResource{}, err
}
podSpec := GetPodSpec(abstractObject.Object)
Expand Down Expand Up @@ -221,7 +221,7 @@ func cacheSingleObject(ctx context.Context, apiVersion, kind, namespace, name st
logrus.Debugf("Caching a single %s", kind)
object, err := getObject(ctx, namespace, kind, apiVersion, name, dynamicClient, restMapper)
if err != nil {
logrus.Warnf("Error retrieving object %s/%s/%s/%s because of error: %v", kind, apiVersion, namespace, name, err)
logrus.Warnf("error retrieving object %s/%s/%s/%s because of error: %v", kind, apiVersion, namespace, name, err)
return err
}
key := fmt.Sprintf("%s/%s/%s", object.GetKind(), object.GetNamespace(), object.GetName())
Expand All @@ -235,13 +235,13 @@ func cacheAllObjectsOfKind(ctx context.Context, apiVersion, kind string, dynamic
fqKind := schema.FromAPIVersionAndKind(apiVersion, kind)
mapping, err := restMapper.RESTMapping(fqKind.GroupKind(), fqKind.Version)
if err != nil {
logrus.Warnf("Error retrieving mapping of API %s and Kind %s because of error: %v", apiVersion, kind, err)
logrus.Warnf("error retrieving mapping of API %s and Kind %s because of error: %v", apiVersion, kind, err)
return err
}

objects, err := dynamicClient.Resource(mapping.Resource).Namespace("").List(ctx, kubeAPIMetaV1.ListOptions{})
if err != nil {
logrus.Warnf("Error retrieving parent object API %s and Kind %s because of error: %v", mapping.Resource.Version, mapping.Resource.Resource, err)
logrus.Warnf("error retrieving parent object API %s and Kind %s because of error: %v", mapping.Resource.Version, mapping.Resource.Resource, err)
return err
}
for idx, object := range objects.Items {
Expand Down
6 changes: 3 additions & 3 deletions pkg/kube/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ func CreateResourceProviderFromPath(directory string) (*ResourceProvider, error)
}
err = resources.addResourcesFromYaml(string(contents))
if err != nil {
logrus.Warnf("Skipping %s: cannot add resource from YAML: %v", path, err)
logrus.Warnf("skipping %s: cannot add resource from YAML: %v", path, err)
}
return nil
}
Expand Down Expand Up @@ -340,7 +340,7 @@ func CreateResourceProviderFromAPI(ctx context.Context, kube kubernetes.Interfac
groupKind := parseGroupKind(maybeTransformKindIntoGroupKind(string(kind)))
mapping, err := restMapper.RESTMapping(groupKind)
if err != nil {
logrus.Warnf("Error retrieving mapping of Kind %s because of error: %v", kind, err)
logrus.Warnf("error retrieving mapping of Kind %s because of error: %v", kind, err)
return nil, err
}
if c.Namespace != "" && mapping.Scope.Name() != meta.RESTScopeNameNamespace {
Expand All @@ -351,7 +351,7 @@ func CreateResourceProviderFromAPI(ctx context.Context, kube kubernetes.Interfac
logrus.Info("Loading " + kind)
objects, err := dynamic.Resource(mapping.Resource).Namespace(c.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
logrus.Warnf("Error retrieving parent object API %s and Kind %s because of error: %v", mapping.Resource.Version, mapping.Resource.Resource, err)
logrus.Warnf("error retrieving parent object API %s and Kind %s because of error: %v", mapping.Resource.Version, mapping.Resource.Resource, err)
return nil, err
}
for _, obj := range objects.Items {
Expand Down
19 changes: 19 additions & 0 deletions pkg/validator/custom.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package validator

import (
"sync"

"github.com/qri-io/jsonschema"
)

type validatorFunction func(test schemaTestCase) (bool, []jsonschema.ValError, error)

var validatorMapper = map[string]validatorFunction{}
var lock = &sync.Mutex{}

func registerCustomChecks(name string, check validatorFunction) {
lock.Lock()
defer lock.Unlock()

validatorMapper[name] = check
}
150 changes: 150 additions & 0 deletions pkg/validator/pdb_hpa_validator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
package validator

import (
"fmt"
"strconv"
"strings"

"github.com/fairwindsops/polaris/pkg/kube"
"github.com/qri-io/jsonschema"
"github.com/sirupsen/logrus"
appsv1 "k8s.io/api/apps/v1"
autoscalingv1 "k8s.io/api/autoscaling/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/intstr"
)

func init() {
registerCustomChecks("pdbMinAvailableGreaterThanHPAMinReplicas", pdbMinAvailableGreaterThanHPAMinReplicas)
}

func pdbMinAvailableGreaterThanHPAMinReplicas(test schemaTestCase) (bool, []jsonschema.ValError, error) {
if test.ResourceProvider == nil {
logrus.Debug("ResourceProvider is nil")
return true, nil, nil
}

deployment := &appsv1.Deployment{}
err := runtime.DefaultUnstructuredConverter.FromUnstructured(test.Resource.Resource.Object, deployment)
if err != nil {
logrus.Warnf("error converting unstructured to Deployment: %v", err)
return true, nil, nil
}

attachedPDB, err := hasPDBAttached(*deployment, test.ResourceProvider.Resources["policy/PodDisruptionBudget"])
if err != nil {
logrus.Warnf("error getting PodDisruptionBudget: %v", err)
return true, nil, nil
}

attachedHPA, err := hasHPAAttached(*deployment, test.ResourceProvider.Resources["autoscaling/HorizontalPodAutoscaler"])
if err != nil {
logrus.Warnf("error getting HorizontalPodAutoscaler: %v", err)
return true, nil, nil
}

if attachedPDB != nil && attachedHPA != nil {
logrus.Debugf("both PDB and HPA are attached to deployment %s", deployment.Name)

pdbMinAvailable, isPercent, err := getIntOrPercentValueSafely(attachedPDB.Spec.MinAvailable)
if err != nil {
logrus.Warnf("error getting getIntOrPercentValueSafely: %v", err)
return true, nil, nil
}

if isPercent {
// if the value is a percentage, we need to calculate the actual value
if attachedHPA.Spec.MinReplicas == nil {
logrus.Debug("attachedHPA.Spec.MinReplicas is nil")
return true, nil, nil
}

pdbMinAvailable, err = intstr.GetScaledValueFromIntOrPercent(attachedPDB.Spec.MinAvailable, int(*attachedHPA.Spec.MinReplicas), true)
if err != nil {
logrus.Warnf("error getting minAvailable value from PodDisruptionBudget: %v", err)
return true, nil, nil
}
}

if attachedHPA.Spec.MinReplicas != nil && pdbMinAvailable >= int(*attachedHPA.Spec.MinReplicas) {
return false, []jsonschema.ValError{
{
PropertyPath: "spec.minAvailable",
InvalidValue: pdbMinAvailable,
Message: fmt.Sprintf("The minAvailable value in the PodDisruptionBudget(%s) is %d, which is greater or equal than the minReplicas value in the HorizontalPodAutoscaler(%s) (%d)", attachedPDB.Name, pdbMinAvailable, attachedHPA.Name, *attachedHPA.Spec.MinReplicas),
},
}, nil
}
}

return true, nil, nil
}

func hasPDBAttached(deployment appsv1.Deployment, pdbs []kube.GenericResource) (*policyv1.PodDisruptionBudget, error) {
for _, generic := range pdbs {
pdb := &policyv1.PodDisruptionBudget{}
err := runtime.DefaultUnstructuredConverter.FromUnstructured(generic.Resource.Object, pdb)
if err != nil {
return nil, fmt.Errorf("error converting unstructured to PodDisruptionBudget: %v", err)
}

if pdb.Spec.Selector == nil {
logrus.Debug("pdb.Spec.Selector is nil")
continue
}

if matchesPDBForDeployment(deployment.Spec.Template.Labels, pdb.Spec.Selector.MatchLabels) {
return pdb, nil
}
}
return nil, nil
}

// matchesPDBForDeployment checks if the labels of the deployment match the labels of the PDB
func matchesPDBForDeployment(deploymentLabels, pdbLabels map[string]string) bool {
for key, value := range pdbLabels {
if deploymentLabels[key] == value {
return true
}
}
return false
}

func hasHPAAttached(deployment appsv1.Deployment, hpas []kube.GenericResource) (*autoscalingv1.HorizontalPodAutoscaler, error) {
for _, generic := range hpas {
hpa := &autoscalingv1.HorizontalPodAutoscaler{}
err := runtime.DefaultUnstructuredConverter.FromUnstructured(generic.Resource.Object, hpa)
if err != nil {
return nil, fmt.Errorf("error converting unstructured to HorizontalPodAutoscaler: %v", err)
}

if hpa.Spec.ScaleTargetRef.Kind == "Deployment" && hpa.Spec.ScaleTargetRef.Name == deployment.Name {
return hpa, nil
}
}
return nil, nil
}

// getIntOrPercentValueSafely is a safer version of getIntOrPercentValue based on private function intstr.getIntOrPercentValueSafely
func getIntOrPercentValueSafely(intOrStr *intstr.IntOrString) (int, bool, error) {
switch intOrStr.Type {
case intstr.Int:
return intOrStr.IntValue(), false, nil
case intstr.String:
isPercent := false
s := intOrStr.StrVal
if strings.HasSuffix(s, "%") {
isPercent = true
s = strings.TrimSuffix(intOrStr.StrVal, "%")
} else {
return 0, false, fmt.Errorf("invalid type: string is not a percentage")
}
v, err := strconv.Atoi(s)
if err != nil {
return 0, false, fmt.Errorf("invalid value %q: %v", intOrStr.StrVal, err)
}
return int(v), isPercent, nil
}
return 0, false, fmt.Errorf("invalid type: neither int nor percentage")
}
4 changes: 3 additions & 1 deletion pkg/validator/schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,8 @@ func applySchemaCheck(conf *config.Configuration, checkID string, test schemaTes
passes, issues, err = check.CheckContainer(test.Container)
} else if check.Validator.SchemaURI != "" {
passes, issues, err = check.CheckObject(test.Resource.Resource.Object)
} else if validatorMapper[checkID] != nil {
passes, issues, err = validatorMapper[checkID](test)
} else {
passes, issues, err = true, []jsonschema.ValError{}, nil
}
Expand All @@ -380,7 +382,7 @@ func applySchemaCheck(conf *config.Configuration, checkID string, test schemaTes
break
}
if test.ResourceProvider == nil {
logrus.Warnf("No ResourceProvider available, check %s will not work in this context (e.g. admission control)", checkID)
logrus.Warnf("no ResourceProvider available, check %s will not work in this context (e.g. admission control)", checkID)
break
}
resources := test.ResourceProvider.Resources[groupkind]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: zookeeper
spec:
replicas: 10
template:
metadata:
labels:
app.kubernetes.io/name: zookeeper
foo: bar
spec:
containers:
- name: zookeeper
image: zookeeper
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: zookeeper-pdb
spec:
minAvailable: 150% # 1.5 * 10 = 15
selector:
matchLabels:
app.kubernetes.io/name: zookeeper
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: zookeeper-hpa
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: zookeeper
minReplicas: 10
maxReplicas: 15
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 50
Loading

0 comments on commit 952b6ae

Please sign in to comment.