Support gang scheduling with Apache YuniKorn #2396

Merged · 15 commits · Sep 27, 2024
@@ -12,14 +12,17 @@ import (

rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
)

const (
- SchedulerName string = "yunikorn"
- YuniKornPodApplicationIDLabelName string = "applicationId"
- YuniKornPodQueueLabelName string = "queue"
- RayClusterApplicationIDLabelName string = "yunikorn.apache.org/application-id"
- RayClusterQueueLabelName string = "yunikorn.apache.org/queue-name"
+ SchedulerName string = "yunikorn"
+ YuniKornPodApplicationIDLabelName string = "applicationId"
+ YuniKornPodQueueLabelName string = "queue"
+ RayClusterApplicationIDLabelName string = "yunikorn.apache.org/app-id"
+ RayClusterQueueLabelName string = "yunikorn.apache.org/queue"
+ YuniKornTaskGroupNameAnnotationName string = "yunikorn.apache.org/task-group-name"
+ YuniKornTaskGroupsAnnotationName string = "yunikorn.apache.org/task-groups"
)

type YuniKornScheduler struct {
@@ -42,19 +45,63 @@ func (y *YuniKornScheduler) DoBatchSchedulingOnSubmission(_ context.Context, _ *
return nil
}

// populatePodLabels is a helper function that copies a label from the RayCluster to the given pod based on the label key
// TODO: remove the legacy labels, i.e. "applicationId" and "queue", and directly populate
// RayClusterApplicationIDLabelName and RayClusterQueueLabelName to the pod labels
func (y *YuniKornScheduler) populatePodLabels(app *rayv1.RayCluster, pod *corev1.Pod, sourceKey string, targetKey string) {
// check labels
if value, exist := app.Labels[sourceKey]; exist {
y.log.Info("Updating pod label based on RayCluster annotations",
y.log.Info("Updating pod label based on RayCluster labels",
"sourceKey", sourceKey, "targetKey", targetKey, "value", value)
pod.Labels[targetKey] = value
}
}

- func (y *YuniKornScheduler) AddMetadataToPod(app *rayv1.RayCluster, _ string, pod *corev1.Pod) {
+ // AddMetadataToPod adds essential labels and annotations to the Ray pods;
+ // the yunikorn scheduler needs these labels and annotations in order to do the scheduling properly
+ func (y *YuniKornScheduler) AddMetadataToPod(app *rayv1.RayCluster, groupName string, pod *corev1.Pod) {
// the applicationID and queue name must be provided in the labels
y.populatePodLabels(app, pod, RayClusterApplicationIDLabelName, YuniKornPodApplicationIDLabelName)
y.populatePodLabels(app, pod, RayClusterQueueLabelName, YuniKornPodQueueLabelName)
Comment on lines 67 to 68

Member:

I don't know why the label names need to be translated from RayClusterApplicationIDLabelName to YuniKornPodApplicationIDLabelName and from RayClusterQueueLabelName to YuniKornPodQueueLabelName.

Does Yunikorn read the labels on resources other than Pods? If not, why not set YuniKornPodApplicationIDLabelName to yunikorn.apache.org/app-id and YuniKornPodQueueLabelName to yunikorn.apache.org/queue, set them directly on the RayCluster, and then apply the same labels to all the Pods controlled by the RayCluster?

Contributor Author:

the caveat is, yunikorn depends on labels to identify apps, but it currently uses a very simple format:

  • YuniKornPodApplicationIDLabelName: applicationId
  • YuniKornPodQueueLabelName: queue

but in Ray, I think it makes sense to add the domain name to the label, to indicate these labels are set for the scheduler:

  • yunikorn.apache.org/application-id
  • yunikorn.apache.org/queue-name

the code logic is very simple: it just translates these labels and sets them on pods, without introducing any inconsistency in the Ray cluster spec. Hope that makes sense.

Member (@MortalHappiness), Sep 26, 2024:

But I viewed the doc here:

https://yunikorn.apache.org/docs/next/user_guide/labels_and_annotations_in_yunikorn

There are two labels, yunikorn.apache.org/app-id and yunikorn.apache.org/queue, and the applicationId and queue labels are legacy now.

Contributor Author:

Oh, good catch. Seems like the community is moving to a domain-scoped label approach recently. Let me take a look and do some tests; I will update the PR once I confirm these labels work. Thanks!

Contributor Author:

hi @MortalHappiness I just found these labels are only supported in v1.6, which was released just a few weeks ago. For better compatibility with older versions of yunikorn deployments, I think it's better to continue supporting the legacy label names. However, I have modified RayClusterApplicationIDLabelName and RayClusterQueueLabelName in the package to be the same as yunikorn upstream. We can remove this translation layer and directly populate the labels to pods after a few releases. Does this sound good?
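To make the translation concrete, here is a minimal, self-contained sketch of what populatePodLabels effectively does, assuming the constant values in this PR (the label values "ray-job-0001" and "root.default" are invented for illustration):

package main

import "fmt"

func main() {
	// RayCluster labels set by the user (upstream YuniKorn domain-scoped names).
	clusterLabels := map[string]string{
		"yunikorn.apache.org/app-id": "ray-job-0001", // RayClusterApplicationIDLabelName
		"yunikorn.apache.org/queue":  "root.default", // RayClusterQueueLabelName
	}
	// Legacy pod label keys expected by pre-v1.6 YuniKorn releases.
	translate := map[string]string{
		"yunikorn.apache.org/app-id": "applicationId", // YuniKornPodApplicationIDLabelName
		"yunikorn.apache.org/queue":  "queue",         // YuniKornPodQueueLabelName
	}
	// Copy each RayCluster label value onto the pod under the legacy key.
	podLabels := map[string]string{}
	for src, dst := range translate {
		if v, ok := clusterLabels[src]; ok {
			podLabels[dst] = v
		}
	}
	fmt.Println(podLabels) // map[applicationId:ray-job-0001 queue:root.default]
}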

Member:

SGTM. Thanks!

Member:

How about adding a TODO comment here so we know to remove this in a future release?

Contributor Author:

Sure, updated.

Member:

@yangwwei Would you mind briefly adding comments for the context you mentioned in https://github.com/ray-project/kuberay/pull/2396/files#r1777710831?

Contributor Author:

Sure

pod.Spec.SchedulerName = y.Name()

// when gang scheduling is enabled, extra annotations need to be added to all pods
if y.isGangSchedulingEnabled(app) {
// populate the taskGroups info to each pod
y.populateTaskGroupsAnnotationToPod(app, pod)

// set the task group name based on the head or worker group name
// the group name for the head and each of the worker group should be different
pod.Annotations[YuniKornTaskGroupNameAnnotationName] = groupName
}
}
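For orientation, this is roughly the metadata a worker pod ends up with when gang scheduling is enabled — a sketch only, with invented values and a truncated task-groups JSON; the head group name ("headgroup") and the exact JSON come from elsewhere in this PR:

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "ray-worker-0"}}
	pod.Spec.SchedulerName = "yunikorn" // y.Name()
	// Copied from the RayCluster labels by populatePodLabels.
	pod.Labels = map[string]string{
		"applicationId": "ray-job-0001",
		"queue":         "root.default",
	}
	// Added only when gang scheduling is enabled.
	pod.Annotations = map[string]string{
		"yunikorn.apache.org/task-group-name": "worker-group-1",
		"yunikorn.apache.org/task-groups":     `[{"name":"headgroup","minMember":1,"minResource":{"cpu":"1"}}]`, // truncated
	}
	fmt.Println(pod.Name, pod.Labels, pod.Annotations)
}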

func (y *YuniKornScheduler) isGangSchedulingEnabled(app *rayv1.RayCluster) bool {
_, exist := app.Labels[utils.RayClusterGangSchedulingEnabled]
return exist
}

func (y *YuniKornScheduler) populateTaskGroupsAnnotationToPod(app *rayv1.RayCluster, pod *corev1.Pod) {
taskGroups := newTaskGroupsFromApp(app)
taskGroupsAnnotationValue, err := taskGroups.marshal()
if err != nil {
y.log.Error(err, "failed to add gang scheduling related annotations to pod, "+
"gang scheduling will not be enabled for this workload",
"rayCluster", app.Name, "name", pod.Name, "namespace", pod.Namespace)
return
}

y.log.Info("add task groups info to pod's annotation",
"key", YuniKornTaskGroupsAnnotationName,
"value", taskGroupsAnnotationValue,
"numOfTaskGroups", taskGroups.size())
if pod.Annotations == nil {
pod.Annotations = make(map[string]string)
}
pod.Annotations[YuniKornTaskGroupsAnnotationName] = taskGroupsAnnotationValue

y.log.Info("Gang Scheduling enabled for RayCluster",
Member:

I am not familiar with Yunikorn. Can we infer that gang scheduling is enabled if the annotation yunikorn.apache.org/task-groups is set? If not, should we move this log to:

if y.isGangSchedulingEnabled(app) {
 ...
}

instead?

Contributor Author:

I may not fully understand this. Basically "yunikorn.apache.org/task-groups" only needs to be set to Ray pods by the operator, it is internal metadata. We only use one flag "ray.io/gang-scheduling-enabled" in Ray to tell if we want to enable Gang scheduling or not for this Ray cluster. Only when enabled, the operator populates the task groups info to the pods, then yunikorn will honor and apply gang scheduling.
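A tiny self-contained sketch of that flag check, assuming the label key quoted above (presence is what matters, not the value — see TestIsGangSchedulingEnabled below):

package main

import "fmt"

func main() {
	// Gang scheduling is opted into with a single RayCluster label
	// (utils.RayClusterGangSchedulingEnabled); isGangSchedulingEnabled
	// only checks that the key exists.
	clusterLabels := map[string]string{
		"ray.io/gang-scheduling-enabled": "", // value is ignored
	}
	_, enabled := clusterLabels["ray.io/gang-scheduling-enabled"]
	fmt.Println(enabled) // true, even with an empty value
}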

Member:

I find it odd that we print a log saying "Gang Scheduling enabled for RayCluster" in populateTaskGroupsAnnotationToPod, a function that may also be used by other scheduling algorithms.

That's why I’m asking if we should move the log to:

if y.isGangSchedulingEnabled(app) {
 ...
}

instead?

Contributor Author:

I see what you mean. Basically populateTaskGroupsAnnotationToPod will ONLY be used for the gang scheduling case, so it is safe to print the log inside the function. Also, this function doesn't return any error; it only logs whether or not gang scheduling is enabled (by checking whether the required task groups were added to the pod).

Member:

Got it. Thanks!

"RayCluster", app.Name, "Namespace", app.Namespace)
}
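For the cluster shape used in the tests below (one head with 5 CPU/5Gi, one worker group with minMember 1 and 2 CPU/10Gi/1 GPU), the annotation value would look roughly like the following — a reconstruction from the test assertions, with field names assumed to follow YuniKorn's task-group format, not copied from the merged code:

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Hypothetical yunikorn.apache.org/task-groups payload; the real value is
	// produced by taskGroups.marshal() in this PR.
	raw := `[
	  {"name":"headgroup","minMember":1,"minResource":{"cpu":"5","memory":"5Gi"}},
	  {"name":"worker-group-1","minMember":1,"minResource":{"cpu":"2","memory":"10Gi","nvidia.com/gpu":"1"}}
	]`
	var groups []map[string]any
	if err := json.Unmarshal([]byte(raw), &groups); err != nil {
		panic(err)
	}
	fmt.Println(len(groups)) // 2 task groups: head + one worker group
}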

func (yf *YuniKornSchedulerFactory) New(_ *rest.Config) (schedulerinterface.BatchScheduler, error) {
@@ -1,10 +1,15 @@
package yunikorn

import (
"encoding/json"
"fmt"
"testing"

"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"

"github.com/stretchr/testify/assert"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
@@ -41,7 +46,7 @@ func TestPopulatePodLabels(t *testing.T) {

rayCluster2 := createRayClusterWithLabels(
"ray-cluster-without-labels",
"test",
"test1",
nil, // empty labels
)
rayPod3 := createPod("my-pod-2", "test")
@@ -51,6 +56,114 @@
assert.Equal(t, podLabelsContains(rayPod3, YuniKornPodQueueLabelName, queue2), false)
}

func TestIsGangSchedulingEnabled(t *testing.T) {
yk := &YuniKornScheduler{}

job1 := "job-1-01234"
queue1 := "root.default"
rayCluster1 := createRayClusterWithLabels(
"ray-cluster-with-gang-scheduling",
"test1",
map[string]string{
RayClusterApplicationIDLabelName: job1,
RayClusterQueueLabelName: queue1,
utils.RayClusterGangSchedulingEnabled: "true",
},
)

assert.Equal(t, yk.isGangSchedulingEnabled(rayCluster1), true)

rayCluster2 := createRayClusterWithLabels(
"ray-cluster-with-gang-scheduling",
"test2",
map[string]string{
RayClusterApplicationIDLabelName: job1,
RayClusterQueueLabelName: queue1,
utils.RayClusterGangSchedulingEnabled: "",
},
)

assert.Equal(t, yk.isGangSchedulingEnabled(rayCluster2), true)

rayCluster3 := createRayClusterWithLabels(
"ray-cluster-with-gang-scheduling",
"test3",
map[string]string{
RayClusterApplicationIDLabelName: job1,
RayClusterQueueLabelName: queue1,
},
)

assert.Equal(t, yk.isGangSchedulingEnabled(rayCluster3), false)
}

func TestPopulateGangSchedulingAnnotations(t *testing.T) {
yk := &YuniKornScheduler{}

job1 := "job-1-01234"
queue1 := "root.default"

// test the case when gang-scheduling is enabled
rayClusterWithGangScheduling := createRayClusterWithLabels(
"ray-cluster-with-gang-scheduling",
"test3",
map[string]string{
RayClusterApplicationIDLabelName: job1,
RayClusterQueueLabelName: queue1,
utils.RayClusterGangSchedulingEnabled: "true",
},
)

// head pod:
// cpu: 5
// memory: 5Gi
addHeadPodSpec(rayClusterWithGangScheduling, v1.ResourceList{
v1.ResourceCPU: resource.MustParse("5"),
v1.ResourceMemory: resource.MustParse("5Gi"),
})

// worker pod:
// cpu: 2
// memory: 10Gi
// nvidia.com/gpu: 1
addWorkerPodSpec(rayClusterWithGangScheduling,
"worker-group-1", 1, 1, 2, v1.ResourceList{
v1.ResourceCPU: resource.MustParse("2"),
v1.ResourceMemory: resource.MustParse("10Gi"),
"nvidia.com/gpu": resource.MustParse("1"),
})

// gang-scheduling enabled case, the plugin should populate the taskGroup annotation to the app
rayPod := createPod("ray-pod", "default")
yk.populateTaskGroupsAnnotationToPod(rayClusterWithGangScheduling, rayPod)

groups, err := GetTaskGroupsFromAnnotation(rayPod)
assert.NoError(t, err)
assert.Equal(t, len(groups), 2)
// verify the annotation value
taskGroupsSpec := rayPod.Annotations[YuniKornTaskGroupsAnnotationName]
assert.Equal(t, true, len(taskGroupsSpec) > 0)
taskGroups := newTaskGroups()
err = taskGroups.unmarshalFrom(taskGroupsSpec)
assert.NoError(t, err)
assert.Equal(t, len(taskGroups.Groups), 2)

// verify the correctness of head group
headGroup := taskGroups.getTaskGroup(utils.RayNodeHeadGroupLabelValue)
assert.NotNil(t, headGroup)
assert.Equal(t, int32(1), headGroup.MinMember)
assert.Equal(t, resource.MustParse("5"), headGroup.MinResource[v1.ResourceCPU.String()])
assert.Equal(t, resource.MustParse("5Gi"), headGroup.MinResource[v1.ResourceMemory.String()])

// verify the correctness of worker group
workerGroup := taskGroups.getTaskGroup("worker-group-1")
assert.NotNil(t, workerGroup)
assert.Equal(t, int32(1), workerGroup.MinMember)
assert.Equal(t, resource.MustParse("2"), workerGroup.MinResource[v1.ResourceCPU.String()])
assert.Equal(t, resource.MustParse("10Gi"), workerGroup.MinResource[v1.ResourceMemory.String()])
assert.Equal(t, resource.MustParse("1"), workerGroup.MinResource["nvidia.com/gpu"])
}

func createRayClusterWithLabels(name string, namespace string, labels map[string]string) *rayv1.RayCluster {
rayCluster := &rayv1.RayCluster{
ObjectMeta: metav1.ObjectMeta{
@@ -63,6 +176,49 @@
return rayCluster
}

func addHeadPodSpec(app *rayv1.RayCluster, resource v1.ResourceList) {
// populate app.Spec.HeadGroupSpec.Template.Spec.Containers with the given resource requests
headContainers := []v1.Container{
{
Name: "head-pod",
Image: "ray.io/ray-head:latest",
Resources: v1.ResourceRequirements{
Limits: nil,
Requests: resource,
},
},
}

app.Spec.HeadGroupSpec.Template.Spec.Containers = headContainers
}

func addWorkerPodSpec(app *rayv1.RayCluster, workerGroupName string,
replicas int32, minReplicas int32, maxReplicas int32, resources v1.ResourceList,
) {
workerContainers := []v1.Container{
{
Name: "worker-pod",
Image: "ray.io/ray-head:latest",
Resources: v1.ResourceRequirements{
Limits: nil,
Requests: resources,
},
},
}

app.Spec.WorkerGroupSpecs = append(app.Spec.WorkerGroupSpecs, rayv1.WorkerGroupSpec{
GroupName: workerGroupName,
Replicas: &replicas,
MinReplicas: &minReplicas,
MaxReplicas: &maxReplicas,
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: workerContainers,
},
},
})
}

func createPod(name string, namespace string) *v1.Pod {
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
@@ -90,3 +246,36 @@ func podLabelsContains(pod *v1.Pod, key string, value string) bool {

return false
}

func GetTaskGroupsFromAnnotation(pod *v1.Pod) ([]TaskGroup, error) {
taskGroupInfo, exist := pod.Annotations[YuniKornTaskGroupsAnnotationName]
if !exist {
return nil, fmt.Errorf("not found")
}

taskGroups := []TaskGroup{}
err := json.Unmarshal([]byte(taskGroupInfo), &taskGroups)
if err != nil {
return nil, err
}
// json.Unmarshal won't return an error if Name or MinMember is missing (it only fails on malformed JSON), so validate Name, MinResource, and MinMember explicitly below.
for _, taskGroup := range taskGroups {
if taskGroup.Name == "" {
return nil, fmt.Errorf("can't get taskGroup Name from pod annotation, %s",
taskGroupInfo)
}
if taskGroup.MinResource == nil {
return nil, fmt.Errorf("can't get taskGroup MinResource from pod annotation, %s",
taskGroupInfo)
}
if taskGroup.MinMember == int32(0) {
return nil, fmt.Errorf("can't get taskGroup MinMember from pod annotation, %s",
taskGroupInfo)
}
if taskGroup.MinMember < int32(0) {
return nil, fmt.Errorf("minMember cannot be negative, %s",
taskGroupInfo)
}
}
return taskGroups, nil
}
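The TaskGroup type and the helpers used above (newTaskGroups, marshal, unmarshalFrom, getTaskGroup, size) are defined elsewhere in this PR; below is a minimal sketch consistent with how this test exercises them — the field names and JSON tags are assumptions, not the merged definitions:

package yunikorn

import (
	"encoding/json"

	"k8s.io/apimachinery/pkg/api/resource"
)

// TaskGroup mirrors one entry of the task-groups annotation (sketch only).
type TaskGroup struct {
	Name        string                       `json:"name"`
	MinMember   int32                        `json:"minMember"`
	MinResource map[string]resource.Quantity `json:"minResource"`
}

// TaskGroups is the collection marshalled into the pod annotation (sketch only).
type TaskGroups struct {
	Groups []TaskGroup `json:"groups"`
}

func newTaskGroups() *TaskGroups { return &TaskGroups{} }

func (t *TaskGroups) size() int { return len(t.Groups) }

// marshal renders the groups as the JSON array stored in the annotation.
func (t *TaskGroups) marshal() (string, error) {
	out, err := json.Marshal(t.Groups)
	return string(out), err
}

// unmarshalFrom parses an annotation value produced by marshal.
func (t *TaskGroups) unmarshalFrom(spec string) error {
	return json.Unmarshal([]byte(spec), &t.Groups)
}

// getTaskGroup returns the group with the given name, or nil if absent.
func (t *TaskGroups) getTaskGroup(name string) *TaskGroup {
	for i := range t.Groups {
		if t.Groups[i].Name == name {
			return &t.Groups[i]
		}
	}
	return nil
}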