Skip to content

Commit

Permalink
Add MXJob v1 api and controller
Browse files Browse the repository at this point in the history
- Generate MXJob api and manifests
- Rewrite MXNet controller logic in reconciler pattern

Signed-off-by: Jiaxin Shan <[email protected]>
  • Loading branch information
Jeffwan committed Jul 10, 2021
1 parent 20d0e99 commit ae32a4e
Show file tree
Hide file tree
Showing 26 changed files with 8,542 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
bin/
/tf-operator
vendor/
testbin/*

# IDEs
.vscode/

# Compiled python files.
Expand Down
8 changes: 8 additions & 0 deletions PROJECT
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,12 @@ resources:
kind: PyTorchJob
path: github.com/kubeflow/tf-operator/pkg/apis/pytorch/v1
version: v1
- api:
crdVersion: v1
namespaced: true
controller: true
group: kubeflow.org
kind: MXJob
path: github.com/kubeflow/tf-operator/pkg/apis/mxnet/v1
version: v1
version: "3"
6,899 changes: 6,899 additions & 0 deletions config/crd/bases/kubeflow.org_mxjobs.yaml

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions config/crd/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,22 @@
resources:
- bases/kubeflow.org_xgboostjobs.yaml
- bases/kubeflow.org_pytorchjobs.yaml
- bases/kubeflow.org_mxjobs.yaml
#+kubebuilder:scaffold:crdkustomizeresource

patchesStrategicMerge:
# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix.
# patches here are for enabling the conversion webhook for each CRD
#- patches/webhook_in_xgboostjobs.yaml
#- patches/webhook_in_pytorchjobs.yaml
#- patches/webhook_in_mxjobs.yaml
#+kubebuilder:scaffold:crdkustomizewebhookpatch

# [CERTMANAGER] To enable webhook, uncomment all the sections with [CERTMANAGER] prefix.
# patches here are for enabling the CA injection for each CRD
#- patches/cainjection_in_xgboostjobs.yaml
#- patches/cainjection_in_pytorchjobs.yaml
#- patches/cainjection_in_mxjobs.yaml
#+kubebuilder:scaffold:crdkustomizecainjectionpatch

# the following config is for teaching kustomize how to do kustomization for CRDs.
Expand Down
7 changes: 7 additions & 0 deletions config/crd/patches/cainjection_in_mxjobs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# The following patch adds a directive for certmanager to inject CA into the CRD
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME)
name: mxjobs.kubeflow.org
14 changes: 14 additions & 0 deletions config/crd/patches/webhook_in_mxjobs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# The following patch enables a conversion webhook for the CRD
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: mxjobs.kubeflow.org
spec:
conversion:
strategy: Webhook
webhook:
clientConfig:
service:
namespace: system
name: webhook-service
path: /convert
24 changes: 24 additions & 0 deletions config/rbac/mxjob_editor_role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# permissions for end users to edit mxjobs.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: mxjob-editor-role
rules:
- apiGroups:
- kubeflow.org
resources:
- mxjobs
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- kubeflow.org
resources:
- mxjobs/status
verbs:
- get
20 changes: 20 additions & 0 deletions config/rbac/mxjob_viewer_role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# permissions for end users to view mxjobs.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: mxjob-viewer-role
rules:
- apiGroups:
- kubeflow.org
resources:
- mxjobs
verbs:
- get
- list
- watch
- apiGroups:
- kubeflow.org
resources:
- mxjobs/status
verbs:
- get
45 changes: 45 additions & 0 deletions examples/mxnet/mxjob_dist_v1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
apiVersion: "kubeflow.org/v1"
kind: "MXJob"
metadata:
name: "mxnet-job"
spec:
jobMode: MXTrain
mxReplicaSpecs:
Scheduler:
replicas: 1
restartPolicy: Never
template:
spec:
containers:
- name: mxnet
image: mxjob/mxnet:gpu
ports:
- containerPort: 9991
name: mxjob-port
Server:
replicas: 1
restartPolicy: Never
template:
spec:
containers:
- name: mxnet
image: mxjob/mxnet:gpu
ports:
- containerPort: 9991
name: mxjob-port
Worker:
replicas: 1
restartPolicy: Never
template:
spec:
containers:
- name: mxnet
image: mxjob/mxnet:gpu
command: ["python"]
args: ["/incubator-mxnet/example/image-classification/train_mnist.py","--num-epochs","10","--num-layers","2","--kv-store","dist_device_sync","--gpus","0"]
resources:
limits:
nvidia.com/gpu: 1
ports:
- containerPort: 9991
name: mxjob-port
7 changes: 7 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ import (

pytorchv1 "github.com/kubeflow/tf-operator/pkg/apis/pytorch/v1"
xgboostv1 "github.com/kubeflow/tf-operator/pkg/apis/xgboost/v1"
mxnetv1 "github.com/kubeflow/tf-operator/pkg/apis/mxnet/v1"
mxnetcontroller "github.com/kubeflow/tf-operator/pkg/controller.v1/mxnet"
pytorchcontroller "github.com/kubeflow/tf-operator/pkg/controller.v1/pytorch"
xgboostcontroller "github.com/kubeflow/tf-operator/pkg/controller.v1/xgboost"
//+kubebuilder:scaffold:imports
Expand All @@ -48,6 +50,7 @@ func init() {

utilruntime.Must(xgboostv1.AddToScheme(scheme))
utilruntime.Must(pytorchv1.AddToScheme(scheme))
utilruntime.Must(mxnetv1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}

Expand Down Expand Up @@ -91,6 +94,10 @@ func main() {
setupLog.Error(err, "unable to create controller", "controller", "XGBoostJob")
os.Exit(1)
}
if err = mxnetcontroller.NewReconciler(mgr).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "MXJob")
os.Exit(1)
}
//+kubebuilder:scaffold:builder

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
Expand Down
18 changes: 18 additions & 0 deletions pkg/apis/mxnet/v1/constants.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package v1

import commonv1 "github.com/kubeflow/common/pkg/apis/common/v1"

const (
// EnvKubeflowNamespace is ENV for kubeflow namespace specified by user.
EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE"

// DefaultPortName is name of the port used to communicate between scheduler and
// servers & workers.
DefaultPortName = "mxjob-port"
// DefaultContainerName is the name of the MXJob container.
DefaultContainerName = "mxnet"
// DefaultPort is default value of the port.
DefaultPort = 9091
// DefaultRestartPolicy is default RestartPolicy for MXReplicaSpec.
DefaultRestartPolicy = commonv1.RestartPolicyNever
)
34 changes: 34 additions & 0 deletions pkg/apis/mxnet/v1/groupversion_info.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright YEAR The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package v1 contains API Schema definitions for the kubeflow.org v1 API group
//+kubebuilder:object:generate=true
//+groupName=kubeflow.org
package v1

import (
"k8s.io/apimachinery/pkg/runtime/schema"
"sigs.k8s.io/controller-runtime/pkg/scheme"
)

var (
// GroupVersion is group version used to register these objects
GroupVersion = schema.GroupVersion{Group: "kubeflow.org", Version: "v1"}

// SchemeBuilder is used to add go types to the GroupVersionKind scheme
SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}

// AddToScheme adds the types in this group-version to the given scheme.
AddToScheme = SchemeBuilder.AddToScheme
)
113 changes: 113 additions & 0 deletions pkg/apis/mxnet/v1/mxjob_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// Copyright YEAR The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package v1

import (
common "github.com/kubeflow/common/pkg/apis/common/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// MXJobSpec defines the desired state of MXJob
type MXJobSpec struct {
// RunPolicy encapsulates various runtime policies of the distributed training
// job, for example how to clean up resources and how long the job can stay
// active.
RunPolicy common.RunPolicy `json:",inline"`

// JobMode specify the kind of MXjob to do. Different mode may have
// different MXReplicaSpecs request
JobMode JobModeType `json:"jobMode"`

// MXReplicaSpecs is map of common.ReplicaType and common.ReplicaSpec
// specifies the MX replicas to run.
// For example,
// {
// "Scheduler": common.ReplicaSpec,
// "Server": common.ReplicaSpec,
// "Worker": common.ReplicaSpec,
// }
MXReplicaSpecs map[common.ReplicaType]*common.ReplicaSpec `json:"mxReplicaSpecs"`
}

// JobModeType id the type for JobMode
type JobModeType string

const (
// Train Mode, in this mode requested MXReplicaSpecs need
// has Server, Scheduler, Worker
MXTrain JobModeType = "MXTrain"

// Tune Mode, in this mode requested MXReplicaSpecs need
// has Tuner
MXTune JobModeType = "MXTune"
)

const (
// MXReplicaTypeScheduler is the type for scheduler replica in MXNet.
MXReplicaTypeScheduler common.ReplicaType = "Scheduler"

// MXReplicaTypeServer is the type for parameter servers of distributed MXNet.
MXReplicaTypeServer common.ReplicaType = "Server"

// MXReplicaTypeWorker is the type for workers of distributed MXNet.
// This is also used for non-distributed MXNet.
MXReplicaTypeWorker common.ReplicaType = "Worker"

// MXReplicaTypeTunerTracker
// This the auto-tuning tracker e.g. autotvm tracker, it will dispatch tuning task to TunerServer
MXReplicaTypeTunerTracker common.ReplicaType = "TunerTracker"

// MXReplicaTypeTunerServer
MXReplicaTypeTunerServer common.ReplicaType = "TunerServer"

// MXReplicaTuner is the type for auto-tuning of distributed MXNet.
// This is also used for non-distributed MXNet.
MXReplicaTypeTuner common.ReplicaType = "Tuner"
)

// MXJobStatus defines the observed state of MXJob
type MXJobStatus struct {
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
// Important: Run "make" to regenerate code after modifying this file
}

// +genclient
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// +resource:path=mxjob

//+kubebuilder:object:root=true
//+kubebuilder:subresource:status

// MXJob is the Schema for the mxjobs API
type MXJob struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

Spec MXJobSpec `json:"spec,omitempty"`
Status common.JobStatus `json:"status,omitempty"`
}

//+kubebuilder:object:root=true

// MXJobList contains a list of MXJob
type MXJobList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []MXJob `json:"items"`
}

func init() {
SchemeBuilder.Register(&MXJob{}, &MXJobList{})
}
Loading

0 comments on commit ae32a4e

Please sign in to comment.