From 88657769ac1882f134530e2c01c0b4d09924614b Mon Sep 17 00:00:00 2001
From: David Leifker
Date: Sun, 6 Oct 2024 09:21:02 -0500
Subject: [PATCH] feat(template-mcps): Add configuration for datahub-gc

* Runs garbage collection functions on a schedule
---
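Each entry under datahubSystemUpdate.bootstrapMCPs is rendered into a single
environment variable on the system-update job: the entry's values block is
deep-merged with the output of every named template listed in
values_generated_configs, then serialized with toJson into the variable named
by values_env. For the datahubGC entry this produces DATAHUB_GC_BOOTSTRAP_VALUES,
which carries the datahub-gc ingestion recipe. When dailyCronWindow.enabled is
true, the generated config overrides schedule.interval with a random minute and
a random hour drawn from the startHour..endHour window (the window may span
midnight, e.g. 18 -> 5), so garbage-collection runs are spread across a
maintenance window instead of all firing at the static "0 1 * * *" default.

As a rough sketch only (the minute and hour below are invented, JSON key order
may differ, and the job emits the value as one compact quoted JSON string), the
rendered DATAHUB_GC_BOOTSTRAP_VALUES with the chart defaults looks roughly like:

    {
      "cleanup_expired_tokens": false,
      "dataprocess_cleanup": {
        "delete_empty_data_flows": true,
        "delete_empty_data_jobs": true,
        "hard_delete_entities": false,
        "keep_last_n": 10,
        "retention_days": 30
      },
      "ingestion": { "name": "datahub-gc" },
      "schedule": { "interval": "37 22 * * *", "timezone": "UTC" },
      "soft_deleted_entities_cleanup": { "retention_days": 30 },
      "truncate_indices": true
    }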
 charts/datahub/Chart.yaml                      |  2 +-
 .../templates/datahub-upgrade/_upgrade.tpl     | 60 +++++++++++++++++++
 .../datahub-system-update-job.yml              | 10 ++++
 charts/datahub/values.yaml                     | 39 +++++++++++-
 4 files changed, 109 insertions(+), 2 deletions(-)

diff --git a/charts/datahub/Chart.yaml b/charts/datahub/Chart.yaml
index 748e2e9f8..775a85c69 100644
--- a/charts/datahub/Chart.yaml
+++ b/charts/datahub/Chart.yaml
@@ -4,7 +4,7 @@ description: A Helm chart for DataHub
 type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
-version: 0.4.27
+version: 0.4.28
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application.
 appVersion: 0.14.1
diff --git a/charts/datahub/templates/datahub-upgrade/_upgrade.tpl b/charts/datahub/templates/datahub-upgrade/_upgrade.tpl
index a4921c78f..39f1aec79 100644
--- a/charts/datahub/templates/datahub-upgrade/_upgrade.tpl
+++ b/charts/datahub/templates/datahub-upgrade/_upgrade.tpl
@@ -144,3 +144,63 @@ Return the env variables for upgrade jobs
   value: {{ .datahub_upgrade_history_topic_name }}
 {{- end }}
 {{- end -}}
+
+{{- define "deepMerge" -}}
+{{- $dst := deepCopy .dst -}}
+{{- range $key, $srcValue := .src -}}
+  {{- if hasKey $dst $key -}}
+    {{- $dstValue := index $dst $key -}}
+    {{- if and (kindIs "map" $dstValue) (kindIs "map" $srcValue) -}}
+      {{- $newDst := dict "dst" $dstValue "src" $srcValue -}}
+      {{- $mergedValue := include "deepMerge" $newDst | fromYaml -}}
+      {{- $_ := set $dst $key $mergedValue -}}
+    {{- else -}}
+      {{- $_ := set $dst $key $srcValue -}}
+    {{- end -}}
+  {{- else -}}
+    {{- $_ := set $dst $key $srcValue -}}
+  {{- end -}}
+{{- end -}}
+{{- $dst | toYaml -}}
+{{- end -}}
+
+{{- define "randomHourInRange" -}}
+{{- $start := index . 0 -}}
+{{- $end := index . 1 -}}
+
+{{- if eq $start $end -}}
+  {{- $start -}}
+{{- else -}}
+  {{- $range := int64 0 -}}
+  {{- if lt $end $start -}}
+    {{- /* Range spans midnight */ -}}
+    {{- $range = add (sub (int64 24) $start) $end -}}
+  {{- else -}}
+    {{- $range = sub $end $start -}}
+  {{- end -}}
+
+  {{- $randomOffset := randInt 0 (add $range 1 | int) -}}
+  {{- mod (add $start $randomOffset) 24 -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+datahubGC cron daily custom scheduling
+*/}}
+{{- define "datahub.systemUpdate.datahubGC.dailyCronWindow" -}}
+{{- if .Values.datahubSystemUpdate.bootstrapMCPs.datahubGC.dailyCronWindow.enabled -}}
+schedule:
+  interval: {{ printf "%d %s * * * " (mod (randNumeric 2) 60) (include "randomHourInRange" (list .Values.datahubSystemUpdate.bootstrapMCPs.datahubGC.dailyCronWindow.startHour .Values.datahubSystemUpdate.bootstrapMCPs.datahubGC.dailyCronWindow.endHour)) }}
+{{- else }}
+schedule:
+  interval: {{ .Values.datahubSystemUpdate.bootstrapMCPs.datahubGC.values.schedule.interval | quote }}
+{{- end }}
+{{- end -}}
+
+{{/*
+datahubGC timezone
+*/}}
+{{- define "datahub.systemUpdate.datahubGC.timezone" -}}
+schedule:
+  timezone: {{ .Values.global.datahub.timezone | default .Values.datahubSystemUpdate.bootstrapMCPs.datahubGC.values.schedule.timezone | quote }}
+{{- end -}}
\ No newline at end of file
diff --git a/charts/datahub/templates/datahub-upgrade/datahub-system-update-job.yml b/charts/datahub/templates/datahub-upgrade/datahub-system-update-job.yml
index 68a79afdd..35db77723 100644
--- a/charts/datahub/templates/datahub-upgrade/datahub-system-update-job.yml
+++ b/charts/datahub/templates/datahub-upgrade/datahub-system-update-job.yml
@@ -296,6 +296,16 @@ spec:
             - name: ELASTICSEARCH_BUILD_INDICES_ALLOW_DOC_COUNT_MISMATCH
               value: {{ . | quote }}
             {{- end }}
+            {{- range $k, $v := .Values.datahubSystemUpdate.bootstrapMCPs }}
+            {{- $result := dict }}
+            {{- $result = merge $result $v.values }}
+            {{- range $v.values_generated_configs }}
+            {{- $funcOutput := include . $ | fromYaml }}
+            {{- $result = include "deepMerge" (dict "dst" $result "src" $funcOutput) | fromYaml }}
+            {{- end }}
+            - name: {{ $v.values_env }}
+              value: {{ $result | toJson | quote }}
+            {{- end }}
             {{- with .Values.datahubSystemUpdate.extraEnvs }}
             {{- toYaml . | nindent 12 }}
             {{- end }}
diff --git a/charts/datahub/values.yaml b/charts/datahub/values.yaml
index 786b5cc23..ab20bf2d3 100644
--- a/charts/datahub/values.yaml
+++ b/charts/datahub/values.yaml
@@ -380,7 +380,7 @@ datahubSystemUpdate:
   # steps are completed, the non-blocking job runs while the rest of the
   # system is starting.
   nonblocking:
-    enabled: false
+    enabled: true
     # When mode = 'nonblocking' the nonblocking job should not include the above helm.sh/hook annotations
     annotations:
       # This is what defines this resource as a hook. Without this line, the
@@ -393,6 +393,40 @@ datahubSystemUpdate:
   # to run
   # command: customCommand
   # args: []
+  # Requires DataHub v0.14.2 or greater
+  bootstrapMCPs:
+    datahubGC:
+      # For information about this recipe, see https://datahubproject.io/docs/0.14.0/generated/ingestion/sources/datahubgc/#install-the-plugin
+      # Overrides values.schedule.interval below with a jitter window using a generated config
+      dailyCronWindow:
+        enabled: true
+        startHour: 18
+        endHour: 5
+      # dynamic overrides; the output of each function is deepMerged with values
+      values_generated_configs:
+        - "datahub.systemUpdate.datahubGC.dailyCronWindow"
+        - "datahub.systemUpdate.datahubGC.timezone"
+      # Environment variable containing the JSON value for the template MCP
+      values_env: DATAHUB_GC_BOOTSTRAP_VALUES
+      # Base values for the template MCP
+      values:
+        ingestion:
+          name: datahub-gc
+        schedule:
+          # overridden by global.datahub.timezone
+          timezone: "UTC"
+          # overridden if dailyCronWindow.enabled is true
+          interval: "0 1 * * *"
+        cleanup_expired_tokens: false
+        truncate_indices: true
+        dataprocess_cleanup:
+          retention_days: 30
+          delete_empty_data_jobs: true
+          delete_empty_data_flows: true
+          hard_delete_entities: false
+          keep_last_n: 10
+        soft_deleted_entities_cleanup:
+          retention_days: 30
   podAnnotations: {}
   resources:
     limits:
@@ -700,6 +734,9 @@ global:
       port: "8080"
      nodePort: "30001"
 
+    # Used for scheduled tasks
+    timezone: "UTC"
+
     frontend:
       validateSignUpEmail: true
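
To pin a fixed schedule instead of the randomized window, or to change the
timezone used by scheduled tasks, an override along these lines should work
(key paths come from the defaults added above; the interval and timezone values
are only examples):

    datahubSystemUpdate:
      bootstrapMCPs:
        datahubGC:
          dailyCronWindow:
            enabled: false
          values:
            schedule:
              interval: "30 2 * * *"
    global:
      datahub:
        timezone: "America/Chicago"

Per the datahub.systemUpdate.datahubGC.timezone helper, global.datahub.timezone
takes precedence over the per-recipe schedule.timezone whenever it is set.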