Add validating webhook

Validate that worker group names are unique.

Much of the code was generated by running the command below as [documented in
kubebuilder][1].

```
kubebuilder create webhook \
  --group ray \
  --version v1 \
  --kind RayCluster \
  --defaulting \
  --programmatic-validation
```
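
After scaffolding, the generated markers still need to be rolled into the CRD and webhook manifests; a minimal sketch, assuming the standard kubebuilder `manifests` target and that commands run from the `ray-operator` directory:

```shell
# Regenerate CRD and webhook manifests so the scaffolded markers take effect
make manifests
```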

## Example RayCluster that has duplicate worker group names

```shell
cat dupe-worker-group-name.yaml

apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: dupe-worker-group-name
spec:
  headGroupSpec:
    rayStartParams:
      dashboard-host: '0.0.0.0'
    template:
      spec:
        containers:
        - name: ray-head
          image: rayproject/ray:2.7.0
  workerGroupSpecs:
  - replicas: 1
    minReplicas: 1
    maxReplicas: 10
    groupName: group1
    rayStartParams: {}
    template:
      spec:
        containers:
        - name: ray-worker
          image: rayproject/ray:2.7.0
  - replicas: 1
    minReplicas: 1
    maxReplicas: 10
    groupName: group1
    rayStartParams: {}
    template:
      spec:
        containers:
        - name: ray-worker
          image: rayproject/ray:2.7.0
```

## Before

```
kubectl apply -f dupe-worker-group-name.yaml
raycluster.ray.io/raycluster-dupe-worker-name created
```

## After

```
kubectl --context kind-kind apply -f config/samples/ray-cluster-dupe-worker-name.yaml
```

```
The RayCluster "raycluster-dupe-worker-name" is invalid:
spec.workerGroupSpecs[1]: Invalid value: v1.WorkerGroupSpec{GroupName:"group1",
Replicas:(*int32)(0x40006e63cc), MinReplicas:(*int32)(0x40006e63c8),
MaxReplicas:(*int32)(0x40006e63c0), RayStartParams:map[string]string{},
Template:v1.PodTemplateSpec{ObjectMeta:v1.ObjectMeta{Name:"", GenerateName:"",
Namespace:"", SelfLink:"", UID:"", ResourceVersion:"", Generation:0,
CreationTimestamp:time.Date(1, time.January, 1, 0, 0, 0, 0, time.UTC),
DeletionTimestamp:<nil>, DeletionGracePeriodSeconds:(*int64)(nil),
Labels:map[string]string(nil), Annotations:map[string]string(nil),
OwnerReferences:[]v1.OwnerReference(nil), Finalizers:[]string(nil),
ClusterName:"", ManagedFields:[]v1.ManagedFieldsEntry(nil)},
Spec:v1.PodSpec{Volumes:[]v1.Volume(nil), InitContainers:[]v1.Container(nil),
Containers:[]v1.Container{v1.Container{Name:"ray-worker",
Image:"rayproject/ray:2.7.0", Command:[]string(nil), Args:[]string(nil),
WorkingDir:"", Ports:[]v1.ContainerPort(nil), EnvFrom:[]v1.EnvFromSource(nil),
Env:[]v1.EnvVar(nil),
Resources:v1.ResourceRequirements{Limits:v1.ResourceList(nil),
Requests:v1.ResourceList(nil)}, VolumeMounts:[]v1.VolumeMount(nil),
VolumeDevices:[]v1.VolumeDevice(nil), LivenessProbe:(*v1.Probe)(nil),
ReadinessProbe:(*v1.Probe)(nil), StartupProbe:(*v1.Probe)(nil),
Lifecycle:(*v1.Lifecycle)(nil), TerminationMessagePath:"",
TerminationMessagePolicy:"", ImagePullPolicy:"",
SecurityContext:(*v1.SecurityContext)(nil), Stdin:false, StdinOnce:false,
TTY:false}}, EphemeralContainers:[]v1.EphemeralContainer(nil),
RestartPolicy:"", TerminationGracePeriodSeconds:(*int64)(nil),
ActiveDeadlineSeconds:(*int64)(nil), DNSPolicy:"",
NodeSelector:map[string]string(nil), ServiceAccountName:"",
DeprecatedServiceAccount:"", AutomountServiceAccountToken:(*bool)(nil),
NodeName:"", HostNetwork:false, HostPID:false, HostIPC:false,
ShareProcessNamespace:(*bool)(nil),
SecurityContext:(*v1.PodSecurityContext)(nil),
ImagePullSecrets:[]v1.LocalObjectReference(nil), Hostname:"", Subdomain:"",
Affinity:(*v1.Affinity)(nil), SchedulerName:"",
Tolerations:[]v1.Toleration(nil), HostAliases:[]v1.HostAlias(nil),
PriorityClassName:"", Priority:(*int32)(nil),
DNSConfig:(*v1.PodDNSConfig)(nil), ReadinessGates:[]v1.PodReadinessGate(nil),
RuntimeClassName:(*string)(nil), EnableServiceLinks:(*bool)(nil),
PreemptionPolicy:(*v1.PreemptionPolicy)(nil), Overhead:v1.ResourceList(nil),
TopologySpreadConstraints:[]v1.TopologySpreadConstraint(nil),
SetHostnameAsFQDN:(*bool)(nil), OS:(*v1.PodOS)(nil)}},
ScaleStrategy:v1.ScaleStrategy{WorkersToDelete:[]string(nil)}}: worker group
names must be unique
```
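
The same check can be exercised without persisting anything by using a server-side dry run, which still sends the object through admission webhooks; a sketch using the example file above:

```shell
# --dry-run=server submits the object for admission (including webhooks) but does not create it
kubectl apply --dry-run=server -f dupe-worker-group-name.yaml
```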

closes ray-project#718
closes ray-project#736

[1]: https://book.kubebuilder.io/cronjob-tutorial/webhook-implementation
davidxia committed Oct 31, 2023
1 parent aab9871 commit 450e681
Showing 26 changed files with 47,846 additions and 23 deletions.
17 changes: 16 additions & 1 deletion ray-operator/Makefile
@@ -106,13 +106,29 @@ install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~
uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config.
	$(KUSTOMIZE) build config/crd | kubectl delete -f -

install-with-webhooks: manifests kustomize ## Install CRDs with webhooks into the K8s cluster specified in ~/.kube/config.
	($(KUSTOMIZE) build config/crd-with-webhooks | kubectl create -f -) || ($(KUSTOMIZE) build config/crd-with-webhooks | kubectl replace -f -)

uninstall-with-webhooks: manifests kustomize ## Uninstall CRDs with webhooks from the K8s cluster specified in ~/.kube/config.
	$(KUSTOMIZE) build config/crd-with-webhooks | kubectl delete -f -

deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config.
	cd config/manager && $(KUSTOMIZE) edit set image kuberay/operator=${IMG}
	($(KUSTOMIZE) build config/default | kubectl create -f -) || ($(KUSTOMIZE) build config/default | kubectl replace -f -)

undeploy: ## Undeploy controller from the K8s cluster specified in ~/.kube/config.
	$(KUSTOMIZE) build config/default | kubectl delete -f -

certmanager:
	kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.2/cert-manager.yaml

deploy-with-webhooks: manifests kustomize certmanager ## Deploy controller with webhooks to the K8s cluster specified in ~/.kube/config.
	cd config/default-with-webhooks && $(KUSTOMIZE) edit set image kuberay/operator=${IMG}
	($(KUSTOMIZE) build config/default-with-webhooks | kubectl create -f -) || ($(KUSTOMIZE) build config/default-with-webhooks | kubectl replace -f -)

undeploy-with-webhooks: ## Undeploy controller with webhooks from the K8s cluster specified in ~/.kube/config.
	$(KUSTOMIZE) build config/default-with-webhooks | kubectl delete -f -


CONTROLLER_GEN = $(shell pwd)/bin/controller-gen
controller-gen: ## Download controller-gen locally if necessary.
@@ -125,4 +141,3 @@ kustomize: ## Download kustomize locally if necessary.
GOFUMPT = $(shell pwd)/bin/gofumpt
gofumpt: ## Download gofumpt locally if necessary.
	test -s $(GOFUMPT) || GOBIN=$(GOFUMPT)/.. go install mvdan.cc/gofumpt@latest
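
A typical flow with the new targets might look like the sketch below; the image tag is illustrative, and `deploy-with-webhooks` pulls in cert-manager through the `certmanager` target:

```shell
# Install CRDs plus webhook configuration, then deploy the operator with webhooks enabled
make install-with-webhooks
make deploy-with-webhooks IMG=kuberay/operator:latest
```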

8 changes: 8 additions & 0 deletions ray-operator/PROJECT
@@ -1,3 +1,7 @@
# Code generated by tool. DO NOT EDIT.
# This file is used to track the info used to scaffold your project
# and allow the plugins properly work.
# More info: https://book.kubebuilder.io/reference/project-config.html
domain: io
layout:
- go.kubebuilder.io/v3
@@ -14,6 +18,10 @@ resources:
  kind: RayCluster
  path: github.com/ray-project/kuberay/ray-operator/apis/ray/v1
  version: v1
  webhooks:
    defaulting: true
    validation: true
    webhookVersion: v1
- api:
    crdVersion: v1
    namespaced: true
81 changes: 81 additions & 0 deletions ray-operator/apis/ray/v1/raycluster_webhook.go
@@ -0,0 +1,81 @@
package v1

import (
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/util/validation/field"
	ctrl "sigs.k8s.io/controller-runtime"
	logf "sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/webhook"
)

// log is for logging in this package.
var rayclusterlog = logf.Log.WithName("raycluster-resource")

func (r *RayCluster) SetupWebhookWithManager(mgr ctrl.Manager) error {
	return ctrl.NewWebhookManagedBy(mgr).
		For(r).
		Complete()
}

//+kubebuilder:webhook:path=/mutate-ray-io-v1-raycluster,mutating=true,failurePolicy=fail,sideEffects=None,groups=ray.io,resources=rayclusters,verbs=create;update,versions=v1,name=mraycluster.kb.io,admissionReviewVersions=v1

var _ = webhook.Defaulter(&RayCluster{})

// Default implements webhook.Defaulter so a webhook will be registered for the type
func (r *RayCluster) Default() {
	rayclusterlog.Info("default", "name", r.Name)

	// TODO(user): fill in your defaulting logic.
}

// TODO(user): change verbs to "verbs=create;update;delete" if you want to enable deletion validation.
//+kubebuilder:webhook:path=/validate-ray-io-v1-raycluster,mutating=false,failurePolicy=fail,sideEffects=None,groups=ray.io,resources=rayclusters,verbs=create;update,versions=v1,name=vraycluster.kb.io,admissionReviewVersions=v1

var _ webhook.Validator = &RayCluster{}

// ValidateCreate implements webhook.Validator so a webhook will be registered for the type
func (r *RayCluster) ValidateCreate() error {
	rayclusterlog.Info("validate create", "name", r.Name)
	return r.validateRayCluster()
}

// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type
func (r *RayCluster) ValidateUpdate(old runtime.Object) error {
	rayclusterlog.Info("validate update", "name", r.Name)
	return r.validateRayCluster()
}

// ValidateDelete implements webhook.Validator so a webhook will be registered for the type
func (r *RayCluster) ValidateDelete() error {
	rayclusterlog.Info("validate delete", "name", r.Name)
	return nil
}

func (r *RayCluster) validateRayCluster() error {
	var allErrs field.ErrorList
	if err := r.validateWorkerGroups(); err != nil {
		allErrs = append(allErrs, err)
	}
	if len(allErrs) == 0 {
		return nil
	}

	return apierrors.NewInvalid(
		schema.GroupKind{Group: "ray.io", Kind: "RayCluster"},
		r.Name, allErrs)
}

func (r *RayCluster) validateWorkerGroups() *field.Error {
	workerGroupNames := make(map[string]bool)

	for i, workerGroup := range r.Spec.WorkerGroupSpecs {
		if _, ok := workerGroupNames[workerGroup.GroupName]; ok {
			return field.Invalid(field.NewPath("spec").Child("workerGroupSpecs").Index(i), workerGroup, "worker group names must be unique")
		}
		workerGroupNames[workerGroup.GroupName] = true
	}

	return nil
}
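
Once the operator is deployed, the admission configurations registered by these markers can be inspected from the cluster; a sketch, with the caveat that the exact object names depend on the kustomize name prefix:

```shell
# List the mutating and validating webhook configurations created for the operator
kubectl get mutatingwebhookconfigurations,validatingwebhookconfigurations | grep -i ray
```
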
176 changes: 176 additions & 0 deletions ray-operator/apis/ray/v1/webhook_suite_test.go
@@ -0,0 +1,176 @@
package v1

import (
	"context"
	"crypto/tls"
	"fmt"
	"net"
	"path/filepath"
	"testing"
	"time"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"

	admissionv1beta1 "k8s.io/api/admission/v1beta1"
	v1 "k8s.io/api/core/v1"

	//+kubebuilder:scaffold:imports
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/util/rand"
	"k8s.io/client-go/rest"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/envtest"
	logf "sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"
)

// These tests use Ginkgo (BDD-style Go testing framework). Refer to
// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.

var cfg *rest.Config
var k8sClient client.Client
var testEnv *envtest.Environment
var ctx context.Context
var cancel context.CancelFunc

func TestAPIs(t *testing.T) {
	RegisterFailHandler(Fail)

	RunSpecs(t, "Webhook Suite")
}

var _ = BeforeSuite(func() {
	logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true)))

	ctx, cancel = context.WithCancel(context.TODO())

	By("bootstrapping test environment")
	testEnv = &envtest.Environment{
		CRDDirectoryPaths:     []string{filepath.Join("..", "..", "..", "config", "crd", "bases")},
		ErrorIfCRDPathMissing: false,
		WebhookInstallOptions: envtest.WebhookInstallOptions{
			Paths: []string{filepath.Join("..", "..", "..", "config", "webhook")},
		},
	}

	var err error
	// cfg is defined in this file globally.
	cfg, err = testEnv.Start()
	Expect(err).NotTo(HaveOccurred())
	Expect(cfg).NotTo(BeNil())

	scheme := runtime.NewScheme()
	err = AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())

	err = admissionv1beta1.AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())

	//+kubebuilder:scaffold:scheme

	k8sClient, err = client.New(cfg, client.Options{Scheme: scheme})
	Expect(err).NotTo(HaveOccurred())
	Expect(k8sClient).NotTo(BeNil())

	// start webhook server using Manager
	webhookInstallOptions := &testEnv.WebhookInstallOptions
	mgr, err := ctrl.NewManager(cfg, ctrl.Options{
		Scheme:             scheme,
		Host:               webhookInstallOptions.LocalServingHost,
		Port:               webhookInstallOptions.LocalServingPort,
		CertDir:            webhookInstallOptions.LocalServingCertDir,
		LeaderElection:     false,
		MetricsBindAddress: "0",
	})
	Expect(err).NotTo(HaveOccurred())

	err = (&RayCluster{}).SetupWebhookWithManager(mgr)
	Expect(err).NotTo(HaveOccurred())

	//+kubebuilder:scaffold:webhook

	go func() {
		defer GinkgoRecover()
		err = mgr.Start(ctx)
		Expect(err).NotTo(HaveOccurred())
	}()

	// wait for the webhook server to get ready
	dialer := &net.Dialer{Timeout: time.Second}
	addrPort := fmt.Sprintf("%s:%d", webhookInstallOptions.LocalServingHost, webhookInstallOptions.LocalServingPort)
	Eventually(func() error {
		conn, err := tls.DialWithDialer(dialer, "tcp", addrPort, &tls.Config{InsecureSkipVerify: true})
		if err != nil {
			return err
		}
		conn.Close()
		return nil
	}).Should(Succeed())
})

var _ = Describe("RayCluster validating webhook", func() {
	Context("when groupNames are not unique", func() {
		var name, namespace string
		var rayCluster RayCluster

		BeforeEach(func() {
			namespace = "default"
			name = fmt.Sprintf("test-raycluster-%d", rand.IntnRange(1000, 9000))
		})

		It("should return error", func() {
			rayCluster = RayCluster{
				ObjectMeta: metav1.ObjectMeta{
					Name:      name,
					Namespace: namespace,
				},
				Spec: RayClusterSpec{
					HeadGroupSpec: HeadGroupSpec{
						RayStartParams: map[string]string{"DEADBEEF": "DEADBEEF"},
						Template: v1.PodTemplateSpec{
							Spec: v1.PodSpec{
								Containers: []v1.Container{},
							},
						},
					},
					WorkerGroupSpecs: []WorkerGroupSpec{
						{
							GroupName:      "group1",
							RayStartParams: map[string]string{"DEADBEEF": "DEADBEEF"},
							Template: v1.PodTemplateSpec{
								Spec: v1.PodSpec{
									Containers: []v1.Container{},
								},
							},
						},
						{
							GroupName:      "group1",
							RayStartParams: map[string]string{"DEADBEEF": "DEADBEEF"},
							Template: v1.PodTemplateSpec{
								Spec: v1.PodSpec{
									Containers: []v1.Container{},
								},
							},
						},
					},
				},
			}

			err := k8sClient.Create(context.TODO(), &rayCluster)
			Expect(err).To(HaveOccurred())

			Expect(err.Error()).To(ContainSubstring("worker group names must be unique"))
		})
	})
})

var _ = AfterSuite(func() {
	cancel()
	By("tearing down the test environment")
	err := testEnv.Stop()
	Expect(err).NotTo(HaveOccurred())
})
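
The suite runs against envtest, so local control-plane binaries must be available; a minimal sketch, assuming controller-runtime's `setup-envtest` is installed and commands run from the `ray-operator` directory:

```shell
# Point envtest at local kube-apiserver/etcd binaries and run the webhook suite
KUBEBUILDER_ASSETS="$(setup-envtest use -p path)" go test ./apis/ray/v1/... -run TestAPIs
```
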
2 changes: 1 addition & 1 deletion ray-operator/apis/ray/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

39 changes: 39 additions & 0 deletions ray-operator/config/certmanager/certificate.yaml
@@ -0,0 +1,39 @@
# The following manifests contain a self-signed issuer CR and a certificate CR.
# More document can be found at https://docs.cert-manager.io
# WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes.
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
  labels:
    app.kubernetes.io/name: issuer
    app.kubernetes.io/instance: selfsigned-issuer
    app.kubernetes.io/component: certificate
    app.kubernetes.io/created-by: ray-operator
    app.kubernetes.io/part-of: ray-operator
    app.kubernetes.io/managed-by: kustomize
  name: selfsigned-issuer
  namespace: system
spec:
  selfSigned: {}
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  labels:
    app.kubernetes.io/name: certificate
    app.kubernetes.io/instance: serving-cert
    app.kubernetes.io/component: certificate
    app.kubernetes.io/created-by: ray-operator
    app.kubernetes.io/part-of: ray-operator
    app.kubernetes.io/managed-by: kustomize
  name: serving-cert # this name should match the one appeared in kustomizeconfig.yaml
  namespace: system
spec:
  # $(SERVICE_NAME) and $(SERVICE_NAMESPACE) will be substituted by kustomize
  dnsNames:
  - $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc
  - $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc.cluster.local
  issuerRef:
    kind: Issuer
    name: selfsigned-issuer
  secretName: webhook-server-cert # this secret will not be prefixed, since it's not managed by kustomize
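
With the operator deployed, cert-manager should report the issuer and the webhook serving certificate as ready; a sketch, where the namespace placeholder is illustrative because kustomize substitutes the real one:

```shell
# Verify the self-signed issuer and the webhook serving certificate are present and Ready
kubectl get issuers,certificates -n <operator-namespace>
```
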
5 changes: 5 additions & 0 deletions ray-operator/config/certmanager/kustomization.yaml
@@ -0,0 +1,5 @@
resources:
- certificate.yaml

configurations:
- kustomizeconfig.yaml
16 changes: 16 additions & 0 deletions ray-operator/config/certmanager/kustomizeconfig.yaml
@@ -0,0 +1,16 @@
# This configuration is for teaching kustomize how to update name ref and var substitution
nameReference:
- kind: Issuer
  group: cert-manager.io
  fieldSpecs:
  - kind: Certificate
    group: cert-manager.io
    path: spec/issuerRef/name

varReference:
- kind: Certificate
  group: cert-manager.io
  path: spec/commonName
- kind: Certificate
  group: cert-manager.io
  path: spec/dnsNames
