Skip to content

Commit

Permalink
Add validating webhook
Browse files Browse the repository at this point in the history
and validate worker group names are unique.

closes ray-project#718
closes ray-project#736
  • Loading branch information
davidxia committed Oct 30, 2023
1 parent 856a33e commit 36d2999
Show file tree
Hide file tree
Showing 21 changed files with 676 additions and 23 deletions.
5 changes: 4 additions & 1 deletion ray-operator/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,10 @@ install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~
uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config.
$(KUSTOMIZE) build config/crd | kubectl delete -f -

deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config.
certmanager:
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.2/cert-manager.yaml

deploy: manifests kustomize certmanager ## Deploy controller to the K8s cluster specified in ~/.kube/config.
cd config/manager && $(KUSTOMIZE) edit set image kuberay/operator=${IMG}
($(KUSTOMIZE) build config/default | kubectl create -f -) || ($(KUSTOMIZE) build config/default | kubectl replace -f -)

Expand Down
8 changes: 8 additions & 0 deletions ray-operator/PROJECT
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Code generated by tool. DO NOT EDIT.
# This file is used to track the info used to scaffold your project
# and allow the plugins properly work.
# More info: https://book.kubebuilder.io/reference/project-config.html
domain: io
layout:
- go.kubebuilder.io/v3
Expand All @@ -14,6 +18,10 @@ resources:
kind: RayCluster
path: github.com/ray-project/kuberay/ray-operator/apis/ray/v1
version: v1
webhooks:
defaulting: true
validation: true
webhookVersion: v1
- api:
crdVersion: v1
namespaced: true
Expand Down
81 changes: 81 additions & 0 deletions ray-operator/apis/ray/v1/raycluster_webhook.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package v1

import (
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/validation/field"
ctrl "sigs.k8s.io/controller-runtime"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/webhook"
)

// log is for logging in this package.
var rayclusterlog = logf.Log.WithName("raycluster-resource")

func (r *RayCluster) SetupWebhookWithManager(mgr ctrl.Manager) error {
return ctrl.NewWebhookManagedBy(mgr).
For(r).
Complete()
}

//+kubebuilder:webhook:path=/mutate-ray-io-v1-raycluster,mutating=true,failurePolicy=fail,sideEffects=None,groups=ray.io,resources=rayclusters,verbs=create;update,versions=v1,name=mraycluster.kb.io,admissionReviewVersions=v1

var _ webhook.Defaulter = &RayCluster{}

// Default implements webhook.Defaulter so a webhook will be registered for the type
func (r *RayCluster) Default() {
rayclusterlog.Info("default", "name", r.Name)

// TODO(user): fill in your defaulting logic.
}

// TODO(user): change verbs to "verbs=create;update;delete" if you want to enable deletion validation.
//+kubebuilder:webhook:path=/validate-ray-io-v1-raycluster,mutating=false,failurePolicy=fail,sideEffects=None,groups=ray.io,resources=rayclusters,verbs=create;update,versions=v1,name=vraycluster.kb.io,admissionReviewVersions=v1

var _ webhook.Validator = &RayCluster{}

// ValidateCreate implements webhook.Validator so a webhook will be registered for the type
func (r *RayCluster) ValidateCreate() error {
rayclusterlog.Info("validate create", "name", r.Name)
return r.validateRayCluster()
}

// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type
func (r *RayCluster) ValidateUpdate(old runtime.Object) error {
rayclusterlog.Info("validate update", "name", r.Name)
return r.validateRayCluster()
}

// ValidateDelete implements webhook.Validator so a webhook will be registered for the type
func (r *RayCluster) ValidateDelete() error {
rayclusterlog.Info("validate delete", "name", r.Name)
return nil
}

func (r *RayCluster) validateRayCluster() error {
var allErrs field.ErrorList
if err := r.validateWorkerGroups(); err != nil {
allErrs = append(allErrs, err)
}
if len(allErrs) == 0 {
return nil
}

return apierrors.NewInvalid(
schema.GroupKind{Group: "ray.io", Kind: "RayCluster"},
r.Name, allErrs)
}

func (r *RayCluster) validateWorkerGroups() *field.Error {
workerGroupNames := make(map[string]bool)

for i, workerGroup := range r.Spec.WorkerGroupSpecs {
if _, ok := workerGroupNames[workerGroup.GroupName]; ok {
return field.Invalid(field.NewPath("spec").Child("workerGroupSpecs").Index(i), workerGroup, "worker group names must be unique")
}
workerGroupNames[workerGroup.GroupName] = true
}

return nil
}
176 changes: 176 additions & 0 deletions ray-operator/apis/ray/v1/webhook_suite_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
package v1

import (
"context"
"crypto/tls"
"fmt"
"net"
"path/filepath"
"testing"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

admissionv1beta1 "k8s.io/api/admission/v1beta1"
v1 "k8s.io/api/core/v1"

//+kubebuilder:scaffold:imports
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/rand"
"k8s.io/client-go/rest"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/envtest"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
)

// These tests use Ginkgo (BDD-style Go testing framework). Refer to
// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.

var cfg *rest.Config
var k8sClient client.Client
var testEnv *envtest.Environment
var ctx context.Context
var cancel context.CancelFunc

func TestAPIs(t *testing.T) {
RegisterFailHandler(Fail)

RunSpecs(t, "Webhook Suite")
}

var _ = BeforeSuite(func() {
logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true)))

ctx, cancel = context.WithCancel(context.TODO())

By("bootstrapping test environment")
testEnv = &envtest.Environment{
CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "crd", "bases")},
ErrorIfCRDPathMissing: false,
WebhookInstallOptions: envtest.WebhookInstallOptions{
Paths: []string{filepath.Join("..", "..", "..", "config", "webhook")},
},
}

var err error
// cfg is defined in this file globally.
cfg, err = testEnv.Start()
Expect(err).NotTo(HaveOccurred())
Expect(cfg).NotTo(BeNil())

scheme := runtime.NewScheme()
err = AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred())

err = admissionv1beta1.AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred())

//+kubebuilder:scaffold:scheme

k8sClient, err = client.New(cfg, client.Options{Scheme: scheme})
Expect(err).NotTo(HaveOccurred())
Expect(k8sClient).NotTo(BeNil())

// start webhook server using Manager
webhookInstallOptions := &testEnv.WebhookInstallOptions
mgr, err := ctrl.NewManager(cfg, ctrl.Options{
Scheme: scheme,
Host: webhookInstallOptions.LocalServingHost,
Port: webhookInstallOptions.LocalServingPort,
CertDir: webhookInstallOptions.LocalServingCertDir,
LeaderElection: false,
MetricsBindAddress: "0",
})
Expect(err).NotTo(HaveOccurred())

err = (&RayCluster{}).SetupWebhookWithManager(mgr)
Expect(err).NotTo(HaveOccurred())

//+kubebuilder:scaffold:webhook

go func() {
defer GinkgoRecover()
err = mgr.Start(ctx)
Expect(err).NotTo(HaveOccurred())
}()

// wait for the webhook server to get ready
dialer := &net.Dialer{Timeout: time.Second}
addrPort := fmt.Sprintf("%s:%d", webhookInstallOptions.LocalServingHost, webhookInstallOptions.LocalServingPort)
Eventually(func() error {
conn, err := tls.DialWithDialer(dialer, "tcp", addrPort, &tls.Config{InsecureSkipVerify: true})
if err != nil {
return err
}
conn.Close()
return nil
}).Should(Succeed())

})

var _ = Describe("RayCluster validating webhook", func() {
Context("when groupNames are not unique", func() {
var name, namespace string
var rayCluster RayCluster

BeforeEach(func() {
namespace = "default"
name = fmt.Sprintf("test-raycluster-%d", rand.IntnRange(1000, 9000))
})

It("should return error", func() {
rayCluster = RayCluster{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: namespace,
},
Spec: RayClusterSpec{
HeadGroupSpec: HeadGroupSpec{
RayStartParams: map[string]string{"DEADBEEF": "DEADBEEF"},
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{},
},
},
},
WorkerGroupSpecs: []WorkerGroupSpec{
{
GroupName: "group1",
RayStartParams: map[string]string{"DEADBEEF": "DEADBEEF"},
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{},
},
},
},
{
GroupName: "group1",
RayStartParams: map[string]string{"DEADBEEF": "DEADBEEF"},
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{},
},
},
},
},
},
}

err := k8sClient.Create(context.TODO(), &rayCluster)
Expect(err).To(HaveOccurred())

Expect(err.Error()).To(ContainSubstring("worker group names must be unique"))
})
})
})

var _ = AfterSuite(func() {
cancel()
By("tearing down the test environment")
err := testEnv.Stop()
Expect(err).NotTo(HaveOccurred())
})
2 changes: 1 addition & 1 deletion ray-operator/apis/ray/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 39 additions & 0 deletions ray-operator/config/certmanager/certificate.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# The following manifests contain a self-signed issuer CR and a certificate CR.
# More document can be found at https://docs.cert-manager.io
# WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes.
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
labels:
app.kubernetes.io/name: issuer
app.kubernetes.io/instance: selfsigned-issuer
app.kubernetes.io/component: certificate
app.kubernetes.io/created-by: ray-operator
app.kubernetes.io/part-of: ray-operator
app.kubernetes.io/managed-by: kustomize
name: selfsigned-issuer
namespace: system
spec:
selfSigned: {}
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
labels:
app.kubernetes.io/name: certificate
app.kubernetes.io/instance: serving-cert
app.kubernetes.io/component: certificate
app.kubernetes.io/created-by: ray-operator
app.kubernetes.io/part-of: ray-operator
app.kubernetes.io/managed-by: kustomize
name: serving-cert # this name should match the one appeared in kustomizeconfig.yaml
namespace: system
spec:
# $(SERVICE_NAME) and $(SERVICE_NAMESPACE) will be substituted by kustomize
dnsNames:
- $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc
- $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc.cluster.local
issuerRef:
kind: Issuer
name: selfsigned-issuer
secretName: webhook-server-cert # this secret will not be prefixed, since it's not managed by kustomize
5 changes: 5 additions & 0 deletions ray-operator/config/certmanager/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
resources:
- certificate.yaml

configurations:
- kustomizeconfig.yaml
16 changes: 16 additions & 0 deletions ray-operator/config/certmanager/kustomizeconfig.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# This configuration is for teaching kustomize how to update name ref and var substitution
nameReference:
- kind: Issuer
group: cert-manager.io
fieldSpecs:
- kind: Certificate
group: cert-manager.io
path: spec/issuerRef/name

varReference:
- kind: Certificate
group: cert-manager.io
path: spec/commonName
- kind: Certificate
group: cert-manager.io
path: spec/dnsNames
13 changes: 13 additions & 0 deletions ray-operator/config/crd/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,16 @@ resources:
- bases/ray.io_rayservices.yaml
- bases/ray.io_rayjobs.yaml
# +kubebuilder:scaffold:crdkustomizeresource

patches:
# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix.
# patches here are for enabling the conversion webhook for each CRD
#- path: patches/webhook_in_workbenches.yaml
- path: patches/webhook_in_rayclusters.yaml
#+kubebuilder:scaffold:crdkustomizewebhookpatch

# [CERTMANAGER] To enable cert-manager, uncomment all the sections with [CERTMANAGER] prefix.
# patches here are for enabling the CA injection for each CRD
#- path: patches/cainjection_in_workbenches.yaml
- path: patches/cainjection_in_rayclusters.yaml
#+kubebuilder:scaffold:crdkustomizecainjectionpatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# The following patch adds a directive for certmanager to inject CA into the CRD
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME)
name: rayclusters.ray.io
Loading

0 comments on commit 36d2999

Please sign in to comment.