Skip to content

Commit

Permalink
feat: update helm chart for k8 RP [DET-3542] (#882)
Browse files Browse the repository at this point in the history
* feat: update helm chart to support k8 RP
* feat: set CPU and MEM reqs for k8 master deployment
  • Loading branch information
aaron276h authored Jul 20, 2020
1 parent fad06e9 commit 56df7d2
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 1 deletion.
32 changes: 31 additions & 1 deletion helm/charts/determined/templates/master-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,34 @@ data:
http_port: {{ .Values.httpPort }}
scheduler:
type: fair_share
# Kubernetes resource provider settings for the master.
# Templated strings are quoted so that a namespace or release name that looks
# like a number or boolean is still parsed as a string.
resource_provider:
  type: "kubernetes"
  namespace: {{ .Release.Namespace | quote }}
  # Rendering fails with the message below when slotsPerNode is not set in values.
  slots_per_node: {{ required "A valid Values.slotsPerNode entry is required!" .Values.slotsPerNode }}
  master_service_name: {{ printf "determined-master-service-%s" .Release.Name | quote }}
# Optional task container defaults; the stanza below is rendered only when
# Values.taskContainerDefaults is set, and each key only when it has a value.
{{ if .Values.taskContainerDefaults -}}
task_container_defaults:
  {{- if .Values.taskContainerDefaults.shmSizeBytes }}
  # int64 (rather than the platform-width int) keeps sizes of 2 GiB and above
  # intact; the cast also prevents the number being rendered in float notation.
  shm_size_bytes: {{ int64 .Values.taskContainerDefaults.shmSizeBytes }}
  {{- end }}
  {{- if .Values.taskContainerDefaults.networkMode }}
  network_mode: {{ .Values.taskContainerDefaults.networkMode | quote }}
  {{- end }}
  {{- if .Values.taskContainerDefaults.dtrainNetworkInterface }}
  dtrain_network_interface: {{ .Values.taskContainerDefaults.dtrainNetworkInterface | quote }}
  {{- end }}
  {{- if .Values.taskContainerDefaults.ncclPortRange }}
  # Port ranges are MIN:MAX strings; quoting stops digit-and-colon values
  # from being reinterpreted by the YAML parser.
  nccl_port_range: {{ .Values.taskContainerDefaults.ncclPortRange | quote }}
  {{- end }}
  {{- if .Values.taskContainerDefaults.glooPortRange }}
  gloo_port_range: {{ .Values.taskContainerDefaults.glooPortRange | quote }}
  {{- end }}
{{ end }}
{{- if .Values.telemetry }}
{{- if hasKey .Values.telemetry "enabled" }}
# Pass the operator's choice through whenever the key is present. Gating on the
# value's truthiness would drop an explicit `enabled: false`, so this stanza
# could only ever be rendered as `enabled: true`.
telemetry:
  enabled: {{ .Values.telemetry.enabled }}
{{- end }}
{{- end }}
13 changes: 13 additions & 0 deletions helm/charts/determined/templates/master-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,25 @@ spec:
serviceAccount: determined-master-{{ .Release.Name }}
containers:
- name: determined-master-{{ .Release.Name }}
{{- if .Values.detVersion }}
# detVersion is used for CI to override the appVersion as the master image tag.
image: determinedai/determined-master:{{ .Values.detVersion }}
{{- else }}
# Otherwise the image tag is the chart's appVersion; rendering fails with the
# message below if the chart does not define one.
image: determinedai/determined-master:{{ required "A valid Chart.AppVersion entry required!" .Chart.AppVersion }}
{{- end }}
imagePullPolicy: "Always"
volumeMounts:
- name: master-config
mountPath: /etc/determined/
readOnly: true
# Resource requests for the master container; each entry is emitted only when
# the corresponding value is set.
resources:
  requests:
    {{- with .Values.masterCpuRequest }}
    cpu: {{ quote . }}
    {{- end }}
    {{- with .Values.masterMemRequest }}
    memory: {{ quote . }}
    {{- end }}
volumes:
- name: master-config
configMap:
Expand Down
9 changes: 9 additions & 0 deletions helm/charts/determined/templates/master-permissions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ rules:
# Core API group: create, read, list, and delete pods (including their status
# and logs) and configmaps.
- apiGroups:
    - ""
  resources:
    - "pods"
    - "pods/status"
    - "pods/log"
    - "configmaps"
  verbs:
    - "create"
    - "get"
    - "list"
    - "delete"
# Core API group: read individual services.
- apiGroups:
    - ""
  resources:
    - "services"
  verbs:
    - "get"
# Core API group: watch pods and events.
- apiGroups:
    - ""
  resources:
    - "pods"
    - "events"
  verbs:
    - "watch"
# Core API group: list nodes.
- apiGroups:
    - ""
  resources:
    - "nodes"
  verbs:
    - "list"


---
Expand Down
28 changes: 28 additions & 0 deletions helm/charts/determined/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,31 @@ checkpointStorage:
# accessKey: <access_key>
# secretKey: <secret_key>
# endpointUrl: <endpoint_url>

# The number of GPUs on each machine. Determined uses this information when
# scheduling multi-GPU tasks. Each multi-GPU (distributed training) task will be scheduled as
# a set of `slotsPerTask / slotsPerNode` separate pods, with each pod assigned `slotsPerNode` GPUs.
# Task sizes that are not divisible by `slotsPerNode` are never scheduled. If you have a cluster of
# nodes with different GPU counts (e.g., 4 and 8 GPUs per node), set `slotsPerNode` to their
# greatest common divisor (4 in this example).
# This value has no default and must be set: the master config template marks it as required.
slotsPerNode:

# CPU and memory requests for the master container (rendered as its Kubernetes
# `resources.requests`). Adjust these to match the scale of your cluster.
masterCpuRequest: "4"
masterMemRequest: "8Gi"

## Configure the task container defaults. Tasks include trials, commands, tensorboards, notebooks,
## and shells. For all task containers, shm_size_bytes and network_mode are configurable. For
## trials, the network interface used by distributed (multi-machine) training and the ports used by
## the NCCL and GLOO libraries during distributed training are also configurable. These default to
## auto-discovery and random non-privileged ports, respectively.
taskContainerDefaults:
# Shared memory size in bytes (4294967296 = 4 GiB).
shmSizeBytes: 4294967296
# networkMode: bridge
# dtrainNetworkInterface: <network interface name>
# ncclPortRange: <MIN:MAX>
# glooPortRange: <MIN:MAX>

## Configure whether anonymous information about the usage of Determined is collected.
telemetry:
enabled: true

0 comments on commit 56df7d2

Please sign in to comment.