From 200454971a8904f8f57d12ff26cb46c4c86f1674 Mon Sep 17 00:00:00 2001
From: Tim Vaillancourt
Date: Sat, 19 Oct 2024 00:02:50 +0200
Subject: [PATCH] `slack-19.0`: add flag to control `vtorc` recoveries

Signed-off-by: Tim Vaillancourt
---
 go/flags/endtoend/vtorc.txt  | 1 +
 go/vt/vtorc/config/config.go | 4 ++++
 go/vt/vtorc/logic/vtorc.go   | 3 +++
 3 files changed, 8 insertions(+)

diff --git a/go/flags/endtoend/vtorc.txt b/go/flags/endtoend/vtorc.txt
index 2bc5916455c..8b2646d27da 100644
--- a/go/flags/endtoend/vtorc.txt
+++ b/go/flags/endtoend/vtorc.txt
@@ -17,6 +17,7 @@ vtorc \
 Flags:
   --allow-emergency-reparent        Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary (default true)
+  --allow-recovery                  Allow recovery actions (default true)
   --alsologtostderr                 log to standard error as well as files
   --audit-file-location string      File location where the audit logs are to be stored
   --audit-purge-duration duration   Duration for which audit logs are held before being purged. Should be in multiples of days (default 168h0m0s)
diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go
index ba3c41ddc61..e8286c1a40a 100644
--- a/go/vt/vtorc/config/config.go
+++ b/go/vt/vtorc/config/config.go
@@ -56,6 +56,7 @@ var (
 	auditToBackend              = false
 	auditToSyslog               = false
 	auditPurgeDuration          = 7 * 24 * time.Hour // Equivalent of 7 days
+	allowRecovery               = true
 	recoveryPeriodBlockDuration = 30 * time.Second
 	preventCrossCellFailover    = false
 	waitReplicasTimeout         = 30 * time.Second
@@ -76,6 +77,7 @@ func RegisterFlags(fs *pflag.FlagSet) {
 	fs.BoolVar(&auditToBackend, "audit-to-backend", auditToBackend, "Whether to store the audit log in the VTOrc database")
 	fs.BoolVar(&auditToSyslog, "audit-to-syslog", auditToSyslog, "Whether to store the audit log in the syslog")
 	fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. Should be in multiples of days")
+	fs.BoolVar(&allowRecovery, "allow-recovery", allowRecovery, "Allow recovery actions")
 	fs.DurationVar(&recoveryPeriodBlockDuration, "recovery-period-block-duration", recoveryPeriodBlockDuration, "Duration for which a new recovery is blocked on an instance after running a recovery")
 	fs.BoolVar(&preventCrossCellFailover, "prevent-cross-cell-failover", preventCrossCellFailover, "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover")
 	fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs")
@@ -104,6 +106,7 @@ type Configuration struct {
 	WaitReplicasTimeoutSeconds     int  // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS.
 	TolerableReplicationLagSeconds int  // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS.
 	TopoInformationRefreshSeconds  int  // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server.
+	AllowRecovery                  bool // Allow recoveries.
 	RecoveryPollSeconds            int  // Timer duration on which VTOrc recovery analysis runs
 }
@@ -134,6 +137,7 @@ func UpdateConfigValuesFromFlags() {
 	Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second)
 	Config.TolerableReplicationLagSeconds = int(tolerableReplicationLag / time.Second)
 	Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second)
+	Config.AllowRecovery = allowRecovery
 	Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second)
 }
diff --git a/go/vt/vtorc/logic/vtorc.go b/go/vt/vtorc/logic/vtorc.go
index f637956fbfd..abaeebd2afd 100644
--- a/go/vt/vtorc/logic/vtorc.go
+++ b/go/vt/vtorc/logic/vtorc.go
@@ -385,6 +385,9 @@ func ContinuousDiscovery() {
 			}
 		}()
 	case <-recoveryTick:
+		if !config.Config.AllowRecovery {
+			continue
+		}
 		go func() {
 			if IsLeaderOrActive() {
 				go ClearActiveFailureDetections()
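
Note on the config wiring: the patch follows the existing split between flag
registration (RegisterFlags) and copying parsed values into the long-lived
Config struct (UpdateConfigValuesFromFlags). Below is a minimal,
self-contained sketch of that pattern, assuming only github.com/spf13/pflag;
the names mirror the patch but this is an illustration, not the real vtorc
package layout:

    package main

    import (
        "fmt"

        "github.com/spf13/pflag"
    )

    // allowRecovery holds the parsed flag value; true matches the patch's
    // default, so recoveries stay enabled unless explicitly disabled.
    var allowRecovery = true

    type Configuration struct {
        AllowRecovery bool
    }

    var Config Configuration

    func RegisterFlags(fs *pflag.FlagSet) {
        // The third argument supplies the default shown in --help output.
        fs.BoolVar(&allowRecovery, "allow-recovery", allowRecovery, "Allow recovery actions")
    }

    func UpdateConfigValuesFromFlags() {
        // Copy the parsed flag value into the long-lived config struct.
        Config.AllowRecovery = allowRecovery
    }

    func main() {
        fs := pflag.NewFlagSet("vtorc-sketch", pflag.ExitOnError)
        RegisterFlags(fs)
        _ = fs.Parse([]string{"--allow-recovery=false"})
        UpdateConfigValuesFromFlags()
        fmt.Println("AllowRecovery:", Config.AllowRecovery) // prints: AllowRecovery: false
    }

With this wiring an operator disables recoveries by starting the daemon with
--allow-recovery=false; the default is true, so existing deployments keep
their current behavior.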
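The guard added to ContinuousDiscovery consumes the recovery tick but skips
scheduling any recovery work when the flag is off. A minimal sketch of that
pattern, under the assumption that the loop is a select over ticker channels
(the channel and function names below are illustrative, not vtorc's actual
loop):

    package main

    import (
        "fmt"
        "time"
    )

    var allowRecovery = false // as if --allow-recovery=false were passed

    func continuousDiscoverySketch(stop <-chan struct{}) {
        recoveryTick := time.Tick(50 * time.Millisecond)
        for {
            select {
            case <-stop:
                return
            case <-recoveryTick:
                if !allowRecovery {
                    // The tick is consumed, but no recovery goroutine is
                    // started, mirroring the patch's early continue.
                    continue
                }
                fmt.Println("running recovery analysis")
            }
        }
    }

    func main() {
        stop := make(chan struct{})
        go continuousDiscoverySketch(stop)
        time.Sleep(200 * time.Millisecond)
        close(stop)
    }

Placing the check before the goroutine launch keeps the select loop alive and
responsive to other ticks while recoveries are disabled.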