Skip to content

Commit

Permalink
[RFC][autoscaler] Add autoscaler container overrides and config optio…
Browse files Browse the repository at this point in the history
…ns for scale behavior. (ray-project#278)

This PR adds an autoscalerOptions field to the RayCluster CRD.
  • Loading branch information
DmitriGekhtman authored Jun 1, 2022
1 parent 7a40388 commit b3ca6c6
Show file tree
Hide file tree
Showing 5 changed files with 161 additions and 2 deletions.
21 changes: 21 additions & 0 deletions ray-operator/apis/ray/v1alpha1/raycluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ type RayClusterSpec struct {
RayVersion string `json:"rayVersion,omitempty"`
// EnableInTreeAutoscaling indicates whether operator should create in tree autoscaling configs
EnableInTreeAutoscaling *bool `json:"enableInTreeAutoscaling,omitempty"`
// AutoscalerOptions specifies optional configuration for the Ray autoscaler.
AutoscalerOptions *AutoscalerOptions `json:"autoscalerOptions,omitempty"`
}

// HeadGroupSpec are the spec for the head pod
Expand Down Expand Up @@ -62,6 +64,25 @@ type ScaleStrategy struct {
WorkersToDelete []string `json:"workersToDelete,omitempty"`
}

// AutoscalerOptions specifies optional configuration for the Ray autoscaler.
type AutoscalerOptions struct {
// Resources specifies resource requests and limits for the autoscaler container.
// Default values: 256m CPU request, 512m CPU limit, 256Mi memory request, 512Mi memory limit.
Resources *v1.ResourceRequirements `json:"resources,omitempty"`
// Image optionally overrides the autoscaler's container image. This override is provided for autoscaler testing and development.
Image *string `json:"image,omitempty"`
// IdleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
// Defaults to 300 (five minutes).
IdleTimeoutSeconds *int32 `json:"idleTimeoutSeconds,omitempty"`
// UpscalingMode is "Default" or "Aggressive."
// Default: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
// Aggressive: Upscaling is not rate-limited.
UpscalingMode *UpscalingMode `json:"upscalingMode,omitempty"`
}

// UpscalingMode names the upscaling behavior of the Ray autoscaler.
// The validation marker below restricts it to "Default" or "Aggressive".
// +kubebuilder:validation:Enum=Default;Aggressive
type UpscalingMode string

// ClusterState is the overall state of the Ray cluster.
type ClusterState string

Expand Down
41 changes: 41 additions & 0 deletions ray-operator/apis/ray/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

45 changes: 45 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,51 @@ spec:
spec:
description: Specification of the desired behavior of the RayCluster.
properties:
autoscalerOptions:
description: AutoscalerOptions specifies optional configuration for
the Ray autoscaler.
properties:
idleTimeoutSeconds:
description: IdleTimeoutSeconds is the number of seconds to wait
before scaling down a worker pod which is not us
format: int32
type: integer
image:
description: Image optionally overrides the autoscaler's container
image.
type: string
resources:
description: Resources specifies resource requests and limits
for the autoscaler container.
properties:
limits:
additionalProperties:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
description: 'Limits describes the maximum amount of compute
resources allowed. More info: https://kubernetes.'
type: object
requests:
additionalProperties:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
description: Requests describes the minimum amount of compute
resources required.
type: object
type: object
upscalingMode:
description: UpscalineMode is "Default" or "Aggressive.
enum:
- Default
- Aggressive
type: string
type: object
enableInTreeAutoscaling:
description: EnableInTreeAutoscaling indicates whether operator should
create in tree autoscaling configs
Expand Down
18 changes: 16 additions & 2 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,10 @@ func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1a
podTemplate.Spec.ServiceAccountName = utils.GetHeadGroupServiceAccountName(&instance)

// inject autoscaler container into head pod
container := BuildAutoscalerContainer()
podTemplate.Spec.Containers = append(podTemplate.Spec.Containers, container)
autoscalerContainer := BuildAutoscalerContainer()
// Merge the user overrides from autoscalerOptions into the autoscaler container config.
mergeAutoscalerOverrides(&autoscalerContainer, instance.Spec.AutoscalerOptions)
podTemplate.Spec.Containers = append(podTemplate.Spec.Containers, autoscalerContainer)
// set custom service account which can be authorized to talk with apiserver
podTemplate.Spec.ServiceAccountName = instance.Name
}
Expand Down Expand Up @@ -214,6 +216,18 @@ func BuildAutoscalerContainer() v1.Container {
return container
}

// mergeAutoscalerOverrides applies the user overrides from autoscalerOptions to
// autoscalerContainer in place. Only the Image and Resources overrides are
// applied here, and each one only when it is set; a nil autoscalerOptions
// leaves the container unchanged.
func mergeAutoscalerOverrides(autoscalerContainer *v1.Container, autoscalerOptions *rayiov1alpha1.AutoscalerOptions) {
	// Nothing to merge when the user supplied no options at all.
	if autoscalerOptions == nil {
		return
	}
	if image := autoscalerOptions.Image; image != nil {
		autoscalerContainer.Image = *image
	}
	if resources := autoscalerOptions.Resources; resources != nil {
		autoscalerContainer.Resources = *resources
	}
}

func isRayStartWithBlock(rayStartParams map[string]string) bool {
if blockValue, exist := rayStartParams["block"]; exist {
return strings.ToLower(blockValue) == "true"
Expand Down
38 changes: 38 additions & 0 deletions ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,45 @@ func TestBuildPod_WithAutoscalerEnabled(t *testing.T) {
if !reflect.DeepEqual(expectedContainer, actualContainer) {
t.Fatalf("Expected `%v` but got `%v`", expectedContainer, actualContainer)
}
}

// TestBuildPodWithAutoscalerOptions checks that autoscaler container overrides
// work as expected: the autoscaler container of the built head pod must equal
// the reference autoscaler container with its Image and Resources replaced by
// the values given in AutoscalerOptions.
func TestBuildPodWithAutoscalerOptions(t *testing.T) {
	cluster := instance.DeepCopy()
	cluster.Spec.EnableInTreeAutoscaling = &trueFlag

	// User-provided overrides placed on the RayCluster spec.
	overrideImage := "custom-autoscaler-xxx"
	idleTimeout := int32(100)
	upscalingMode := rayiov1alpha1.UpscalingMode("Aggressive")
	overrideResources := v1.ResourceRequirements{
		Requests: v1.ResourceList{
			v1.ResourceCPU:    resource.MustParse("1"),
			v1.ResourceMemory: testMemoryLimit,
		},
		Limits: v1.ResourceList{
			v1.ResourceCPU:    resource.MustParse("1"),
			v1.ResourceMemory: testMemoryLimit,
		},
	}
	cluster.Spec.AutoscalerOptions = &rayiov1alpha1.AutoscalerOptions{
		Resources:          &overrideResources,
		Image:              &overrideImage,
		IdleTimeoutSeconds: &idleTimeout,
		UpscalingMode:      &upscalingMode,
	}

	// Build the head pod from the cluster spec.
	headPodName := strings.ToLower(cluster.Name + DashSymbol + string(rayiov1alpha1.HeadNode) + DashSymbol + utils.FormatInt32(0))
	serviceName := utils.GenerateServiceName(cluster.Name)
	headTemplate := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, headPodName, serviceName)
	pod := BuildPod(headTemplate, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, serviceName, &trueFlag)

	// The expected container is the reference container with only the
	// overridden fields swapped in.
	want := *autoscalerContainer.DeepCopy()
	want.Image = overrideImage
	want.Resources = overrideResources

	got := pod.Spec.Containers[getAutoscalerContainerIndex(pod)]
	if !reflect.DeepEqual(want, got) {
		t.Fatalf("Expected `%v` but got `%v`", want, got)
	}
}

func TestDefaultHeadPodTemplate_WithAutoscalingEnabled(t *testing.T) {
Expand Down

0 comments on commit b3ca6c6

Please sign in to comment.