Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: bug in object store memory validation #332

Merged
merged 2 commits into from
Jul 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 24 additions & 18 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@ package common

import (
"bytes"
"errors"
"fmt"
"strconv"
"strings"

"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"

rayiov1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"

logf "sigs.k8s.io/controller-runtime/pkg/log"
Expand All @@ -25,6 +25,9 @@ const (
RayLogVolumeMountPath = "/tmp/ray"
AutoscalerContainerName = "autoscaler"
RayHeadContainer = "ray-head"
ObjectStoreMemoryKey = "object-store-memory"
// TODO (davidxia): should be a const in upstream ray-project/ray
AllowSlowStorageEnvVar = "RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE"
)

var log = logf.Log.WithName("RayCluster-Controller")
Expand Down Expand Up @@ -552,39 +555,42 @@ func findMemoryReqOrLimit(container v1.Container) (res *resource.Quantity) {
return nil
}

// ValidateHeadRayStartParams will do some validations for the head node RayStartParams,
// return include the bool to judge if the RayStartParams is valid and err will include some information for the warning or error output.
// if isValid is true, even if there maybe some error message, it is still acceptable for a ray and only affect the performance
// if isValid is false, it means the RayStartParams will definitely casue a unhealthy or failed status in ray cluster.
// ValidateHeadRayStartParams will validate the head node's RayStartParams.
// Return a bool indicating the validity of RayStartParams and an err with additional information.
// If isValid is true, RayStartParams are valid. Any errors will only affect performance.
// If isValid is false, RayStartParams are invalid will result in an unhealthy or failed Ray cluster.
func ValidateHeadRayStartParams(rayHeadGroupSpec rayiov1alpha1.HeadGroupSpec) (isValid bool, err error) {
// TODO (dxia): if you add more validation, please split checks into separate subroutines.
var objectStoreMemory int64
rayStartParams := rayHeadGroupSpec.RayStartParams
// validation for the object store memory
if objectStoreMemoryStr, ok := rayStartParams["object-store-memory"]; ok {
if objectStoreMemoryStr, ok := rayStartParams[ObjectStoreMemoryKey]; ok {
objectStoreMemory, err = strconv.ParseInt(objectStoreMemoryStr, 10, 64)
if err != nil {
isValid = false
err = errors.New("convert error of the \"object-store-memory\"")
err = errors.NewBadRequest(fmt.Sprintf("Cannot parse %s %s as an integer: %s", ObjectStoreMemoryKey, objectStoreMemoryStr, err.Error()))
return
}
for _, container := range rayHeadGroupSpec.Template.Spec.Containers {
// choose the ray container.
// find the ray container.
if container.Name == RayHeadContainer {
if shmSize, ok := container.Resources.Requests.Storage().AsInt64(); ok && objectStoreMemory > shmSize {
if envVarExists("RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE", container.Env) {
// in ray if RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE is set, it will only affect the performance.
if shmSize, ok := container.Resources.Requests.Memory().AsInt64(); ok && objectStoreMemory > shmSize {
Jeffwan marked this conversation as resolved.
Show resolved Hide resolved
if envVarExists(AllowSlowStorageEnvVar, container.Env) {
// in ray if this env var is set, it will only affect the performance.
isValid = true
log.Info(fmt.Sprintf("object store memory exceed the size of the share memory in head node, object-store-memory:%d, share memory size:%d\n", objectStoreMemory, shmSize) +
"This will harm performance. Consider deleting files in /dev/shm or increasing request memory of head node.")
err = errors.New("RayStartParams unhealthy")
msg := fmt.Sprintf("RayStartParams: object store memory exceeds head node container's memory request, %s:%d, memory request:%d\n"+
"This will harm performance. Consider deleting files in %s or increasing head node's memory request.", ObjectStoreMemoryKey, objectStoreMemory, shmSize, SharedMemoryVolumeMountPath)
log.Info(msg)
err = errors.NewBadRequest(msg)
return
} else {
// if not set, the head node may crash and result in an unhealthy status.
isValid = false
log.Info(fmt.Sprintf("object store memory exceed the size of the share memory in head node, object-store-memory:%d, share memory size:%d\n", objectStoreMemory, shmSize) +
"This will lead to a ValueError in ray! Consider deleting files in /dev/shm or increasing request memory of head node" +
"To ignore this warning, set an environment variable in headGroupSpec: RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1")
err = errors.New("RayStartParams unhealthy")
msg := fmt.Sprintf("RayStartParams: object store memory exceeds head node container's memory request, %s:%d, memory request:%d\n"+
"This will lead to a ValueError in Ray! Consider deleting files in %s or increasing head node's memory request.\n"+
"To ignore this warning, set the following environment variable in headGroupSpec: %s=1",
ObjectStoreMemoryKey, objectStoreMemory, shmSize, SharedMemoryVolumeMountPath, AllowSlowStorageEnvVar)
err = errors.NewBadRequest(msg)
return
}
}
Expand Down
29 changes: 29 additions & 0 deletions ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ import (
"testing"

"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
"github.com/stretchr/testify/assert"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"

rayiov1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1"
Expand Down Expand Up @@ -455,6 +457,33 @@ func TestHeadPodTemplate_WithServiceAccount(t *testing.T) {
}
}

func TestValidateHeadRayStartParams_OK(t *testing.T) {
input := instance.Spec.HeadGroupSpec.DeepCopy()
isValid, err := ValidateHeadRayStartParams(*input)
assert.Equal(t, true, isValid)
assert.Nil(t, err)
}

func TestValidateHeadRayStartParams_ValidWithObjectStoreMemoryError(t *testing.T) {
input := instance.Spec.HeadGroupSpec.DeepCopy()
input.RayStartParams[ObjectStoreMemoryKey] = "2000000000"
input.Template.Spec.Containers[0].Env = append(input.Template.Spec.Containers[0].Env, v1.EnvVar{
Name: AllowSlowStorageEnvVar,
Value: "1",
})
isValid, err := ValidateHeadRayStartParams(*input)
assert.Equal(t, true, isValid)
assert.True(t, errors.IsBadRequest(err))
}

func TestValidateHeadRayStartParams_InvalidObjectStoreMemory(t *testing.T) {
input := instance.Spec.HeadGroupSpec.DeepCopy()
input.RayStartParams[ObjectStoreMemoryKey] = "2000000000"
isValid, err := ValidateHeadRayStartParams(*input)
assert.Equal(t, false, isValid)
assert.True(t, errors.IsBadRequest(err))
}

func splitAndSort(s string) []string {
strs := strings.Split(s, " ")
result := make([]string, 0, len(strs))
Expand Down