Skip to content

Commit

Permalink
fix: bug in object store memory validation
Browse files Browse the repository at this point in the history
We are comparing memory to K8s Pod storage request
instead of memory request.

Add tests.

Expand error message with more information.
This error message is used in K8s events.
More information here might be helpful.

Update comments and log messages for clarity and smoother English usage.

Make repeated strings constants.
  • Loading branch information
davidxia committed Jul 3, 2022
1 parent d1e0d1a commit 34ad61f
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 18 deletions.
42 changes: 24 additions & 18 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@ package common

import (
"bytes"
"errors"
"fmt"
"strconv"
"strings"

"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"

rayiov1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"

logf "sigs.k8s.io/controller-runtime/pkg/log"
Expand All @@ -25,6 +25,9 @@ const (
RayLogVolumeMountPath = "/tmp/ray"
AutoscalerContainerName = "autoscaler"
RayHeadContainer = "ray-head"
objectStoreMemoryKey = "object-store-memory"
// TODO (dxia): should be a const in upstream ray-project/ray
allowSlowStorageEnvVar = "RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE"
)

var log = logf.Log.WithName("RayCluster-Controller")
Expand Down Expand Up @@ -552,39 +555,42 @@ func findMemoryReqOrLimit(container v1.Container) (res *resource.Quantity) {
return nil
}

// ValidateHeadRayStartParams will do some validations for the head node RayStartParams,
// return include the bool to judge if the RayStartParams is valid and err will include some information for the warning or error output.
// if isValid is true, even if there maybe some error message, it is still acceptable for a ray and only affect the performance
// if isValid is false, it means the RayStartParams will definitely casue a unhealthy or failed status in ray cluster.
// ValidateHeadRayStartParams will validate the head node's RayStartParams.
// Return a bool indicating the validity of RayStartParams and an err with additional information.
// If isValid is true, RayStartParams are valid. Any errors will only affect performance.
// If isValid is false, RayStartParams are invalid will result in an unhealthy or failed Ray cluster.
func ValidateHeadRayStartParams(rayHeadGroupSpec rayiov1alpha1.HeadGroupSpec) (isValid bool, err error) {
// TODO (dxia): if you add more validation, please split checks into separate subroutines.
var objectStoreMemory int64
rayStartParams := rayHeadGroupSpec.RayStartParams
// validation for the object store memory
if objectStoreMemoryStr, ok := rayStartParams["object-store-memory"]; ok {
if objectStoreMemoryStr, ok := rayStartParams[objectStoreMemoryKey]; ok {
objectStoreMemory, err = strconv.ParseInt(objectStoreMemoryStr, 10, 64)
if err != nil {
isValid = false
err = errors.New("convert error of the \"object-store-memory\"")
err = errors.NewBadRequest(fmt.Sprintf("Cannot parse %s %s as an integer: %s", objectStoreMemoryKey, objectStoreMemoryStr, err.Error()))
return
}
for _, container := range rayHeadGroupSpec.Template.Spec.Containers {
// choose the ray container.
// find the ray container.
if container.Name == RayHeadContainer {
if shmSize, ok := container.Resources.Requests.Storage().AsInt64(); ok && objectStoreMemory > shmSize {
if envVarExists("RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE", container.Env) {
// in ray if RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE is set, it will only affect the performance.
if shmSize, ok := container.Resources.Requests.Memory().AsInt64(); ok && objectStoreMemory > shmSize {
if envVarExists(allowSlowStorageEnvVar, container.Env) {
// in ray if this env var is set, it will only affect the performance.
isValid = true
log.Info(fmt.Sprintf("object store memory exceed the size of the share memory in head node, object-store-memory:%d, share memory size:%d\n", objectStoreMemory, shmSize) +
"This will harm performance. Consider deleting files in /dev/shm or increasing request memory of head node.")
err = errors.New("RayStartParams unhealthy")
msg := fmt.Sprintf("RayStartParams: object store memory exceeds head node container's memory request, %s:%d, memory request:%d\n"+
"This will harm performance. Consider deleting files in %s or increasing head node's memory request.", objectStoreMemoryKey, objectStoreMemory, shmSize, SharedMemoryVolumeMountPath)
log.Info(msg)
err = errors.NewBadRequest(msg)
return
} else {
// if not set, the head node may crash and result in an unhealthy status.
isValid = false
log.Info(fmt.Sprintf("object store memory exceed the size of the share memory in head node, object-store-memory:%d, share memory size:%d\n", objectStoreMemory, shmSize) +
"This will lead to a ValueError in ray! Consider deleting files in /dev/shm or increasing request memory of head node" +
"To ignore this warning, set an environment variable in headGroupSpec: RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1")
err = errors.New("RayStartParams unhealthy")
msg := fmt.Sprintf("RayStartParams: object store memory exceeds head node container's memory request, %s:%d, memory request:%d\n"+
"This will lead to a ValueError in Ray! Consider deleting files in %s or increasing head node's memory request.\n"+
"To ignore this warning, set the following environment variable in headGroupSpec: %s=1",
objectStoreMemoryKey, objectStoreMemory, shmSize, SharedMemoryVolumeMountPath, allowSlowStorageEnvVar)
err = errors.NewBadRequest(msg)
return
}
}
Expand Down
29 changes: 29 additions & 0 deletions ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ import (
"testing"

"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
"github.com/stretchr/testify/assert"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"

rayiov1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1"
Expand Down Expand Up @@ -455,6 +457,33 @@ func TestHeadPodTemplate_WithServiceAccount(t *testing.T) {
}
}

func TestValidateHeadRayStartParams_OK(t *testing.T) {
input := instance.Spec.HeadGroupSpec.DeepCopy()
isValid, err := ValidateHeadRayStartParams(*input)
assert.Equal(t, true, isValid)
assert.Nil(t, err)
}

func TestValidateHeadRayStartParams_ValidWithObjectStoreMemoryError(t *testing.T) {
input := instance.Spec.HeadGroupSpec.DeepCopy()
input.RayStartParams["object-store-memory"] = "2000000000"
input.Template.Spec.Containers[0].Env = append(input.Template.Spec.Containers[0].Env, v1.EnvVar{
Name: allowSlowStorageEnvVar,
Value: "1",
})
isValid, err := ValidateHeadRayStartParams(*input)
assert.Equal(t, true, isValid)
assert.True(t, errors.IsBadRequest(err))
}

func TestValidateHeadRayStartParams_InvalidObjectStoreMemory(t *testing.T) {
input := instance.Spec.HeadGroupSpec.DeepCopy()
input.RayStartParams["object-store-memory"] = "2000000000"
isValid, err := ValidateHeadRayStartParams(*input)
assert.Equal(t, false, isValid)
assert.True(t, errors.IsBadRequest(err))
}

func splitAndSort(s string) []string {
strs := strings.Split(s, " ")
result := make([]string, 0, len(strs))
Expand Down

0 comments on commit 34ad61f

Please sign in to comment.