Skip to content

Commit

Permalink
Ruler: Make the sync queue polling intervals configurable. (#9268)
Browse files Browse the repository at this point in the history
* Ruler: Make the sync queue polling intervals configurable.

* Tweak doc string

* Update about-versioning.md
  • Loading branch information
stevesg authored Sep 20, 2024
1 parent 3e06d8a commit 61c1080
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 18 deletions.
22 changes: 22 additions & 0 deletions cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -12684,6 +12684,28 @@
"fieldValue": null,
"fieldDefaultValue": null
},
{
"kind": "field",
"name": "outbound_sync_queue_poll_interval",
"required": false,
"desc": "Interval between sending queued rule sync requests to ruler replicas.",
"fieldValue": null,
"fieldDefaultValue": 10000000000,
"fieldFlag": "ruler.outbound-sync-queue-poll-interval",
"fieldType": "duration",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "inbound_sync_queue_poll_interval",
"required": false,
"desc": "Interval between applying queued incoming rule sync requests.",
"fieldValue": null,
"fieldDefaultValue": 10000000000,
"fieldFlag": "ruler.inbound-sync-queue-poll-interval",
"fieldType": "duration",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "max_independent_rule_evaluation_concurrency",
Expand Down
4 changes: 4 additions & 0 deletions cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -2669,6 +2669,8 @@ Usage of ./cmd/mimir/mimir:
This grace period controls which alerts the ruler restores after a restart. Alerts with "for" duration lower than this grace period are not restored after a ruler restart. This means that if the alerts have been firing before the ruler restarted, they will now go to pending state and then to firing again after their "for" duration expires. Alerts with "for" duration greater than or equal to this grace period that have been pending before the ruler restart will remain in pending state for at least this grace period. Alerts with "for" duration greater than or equal to this grace period that have been firing before the ruler restart will continue to be firing after the restart. (default 2m0s)
-ruler.for-outage-tolerance duration
Max time to tolerate outage for restoring "for" state of alert. (default 1h0m0s)
-ruler.inbound-sync-queue-poll-interval duration
[experimental] Interval between applying queued incoming rule sync requests. (default 10s)
-ruler.independent-rule-evaluation-concurrency-min-duration-percentage float
[experimental] Minimum threshold of the interval to last rule group runtime duration to allow a rule to be evaluated concurrently. By default, the rule group runtime duration must exceed 50.0% of the evaluation interval. (default 50)
-ruler.max-independent-rule-evaluation-concurrency int
Expand All @@ -2687,6 +2689,8 @@ Usage of ./cmd/mimir/mimir:
Capacity of the queue for notifications to be sent to the Alertmanager. (default 10000)
-ruler.notification-timeout duration
HTTP timeout duration when sending notifications to the Alertmanager. (default 10s)
-ruler.outbound-sync-queue-poll-interval duration
[experimental] Interval between sending queued rule sync requests to ruler replicas. (default 10s)
-ruler.poll-interval duration
How frequently the configured rule groups are re-synced from the object storage. (default 10m0s)
-ruler.protected-namespaces comma-separated-list-of-strings
Expand Down
3 changes: 3 additions & 0 deletions docs/sources/mimir/configure/about-versioning.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ The following features are currently experimental:
- `-ruler.max-independent-rule-evaluation-concurrency-per-tenant`
- `-ruler.independent-rule-evaluation-concurrency-min-duration-percentage`
- `-ruler.rule-evaluation-write-enabled`
- Allow control over rule sync intervals.
- `-ruler.outbound-sync-queue-poll-interval`
- `-ruler.inbound-sync-queue-poll-interval`
- Distributor
- Metrics relabeling
- `-distributor.metric-relabeling-enabled`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2093,6 +2093,15 @@ tenant_federation:
# CLI flag: -ruler.tenant-federation.enabled
[enabled: <boolean> | default = false]
# (experimental) Interval between sending queued rule sync requests to ruler
# replicas.
# CLI flag: -ruler.outbound-sync-queue-poll-interval
[outbound_sync_queue_poll_interval: <duration> | default = 10s]
# (experimental) Interval between applying queued incoming rule sync requests.
# CLI flag: -ruler.inbound-sync-queue-poll-interval
[inbound_sync_queue_poll_interval: <duration> | default = 10s]
# (experimental) Number of rules that don't have dependencies that we
# allow to be evaluated concurrently across all tenants. 0 to disable.
# CLI flag: -ruler.max-independent-rule-evaluation-concurrency
Expand Down
9 changes: 6 additions & 3 deletions pkg/ruler/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1096,7 +1096,8 @@ rules:
// Configure the ruler to only sync the rules based on notifications upon API changes.
rulerCfg := tt.cfg
rulerCfg.PollInterval = time.Hour
rulerCfg.rulerSyncQueuePollFrequency = 100 * time.Millisecond
rulerCfg.InboundSyncQueuePollInterval = 100 * time.Millisecond
rulerCfg.OutboundSyncQueuePollInterval = 100 * time.Millisecond

reg := prometheus.NewPedanticRegistry()
r := prepareRuler(t, rulerCfg, newMockRuleStore(make(map[string]rulespb.RuleGroupList)), withStart(), withRulerAddrAutomaticMapping(), withPrometheusRegisterer(reg))
Expand Down Expand Up @@ -1139,7 +1140,8 @@ func TestAPI_DeleteNamespace(t *testing.T) {
// Configure the ruler to only sync the rules based on notifications upon API changes.
cfg := defaultRulerConfig(t)
cfg.PollInterval = time.Hour
cfg.rulerSyncQueuePollFrequency = 100 * time.Millisecond
cfg.OutboundSyncQueuePollInterval = 100 * time.Millisecond
cfg.InboundSyncQueuePollInterval = 100 * time.Millisecond

// Keep this inside the test, not as global var, otherwise running tests with -count higher than 1 fails,
// as newMockRuleStore modifies the underlying map.
Expand Down Expand Up @@ -1207,7 +1209,8 @@ func TestAPI_DeleteRuleGroup(t *testing.T) {
// Configure the ruler to only sync the rules based on notifications upon API changes.
cfg := defaultRulerConfig(t)
cfg.PollInterval = time.Hour
cfg.rulerSyncQueuePollFrequency = 100 * time.Millisecond
cfg.OutboundSyncQueuePollInterval = 100 * time.Millisecond
cfg.InboundSyncQueuePollInterval = 100 * time.Millisecond

// Keep this inside the test, not as global var, otherwise running tests with -count higher than 1 fails,
// as newMockRuleStore modifies the underlying map.
Expand Down
20 changes: 9 additions & 11 deletions pkg/ruler/ruler.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,11 @@ type Config struct {

TenantFederation TenantFederationConfig `yaml:"tenant_federation"`

OutboundSyncQueuePollInterval time.Duration `yaml:"outbound_sync_queue_poll_interval" category:"experimental"`
InboundSyncQueuePollInterval time.Duration `yaml:"inbound_sync_queue_poll_interval" category:"experimental"`

// Allow to override timers for testing purposes.
RingCheckPeriod time.Duration `yaml:"-"`
rulerSyncQueuePollFrequency time.Duration `yaml:"-"`
RingCheckPeriod time.Duration `yaml:"-"`

MaxIndependentRuleEvaluationConcurrency int64 `yaml:"max_independent_rule_evaluation_concurrency" category:"experimental"`
IndependentRuleEvaluationConcurrencyMinDurationPercentage float64 `yaml:"independent_rule_evaluation_concurrency_min_duration_percentage" category:"experimental"`
Expand Down Expand Up @@ -201,14 +203,10 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) {

f.BoolVar(&cfg.RuleEvaluationWriteEnabled, "ruler.rule-evaluation-write-enabled", true, "Writes the results of rule evaluation to ingesters or ingest storage when enabled. Use this option for testing purposes. To disable, set to false.")

cfg.RingCheckPeriod = 5 * time.Second
}
f.DurationVar(&cfg.OutboundSyncQueuePollInterval, "ruler.outbound-sync-queue-poll-interval", defaultRulerSyncPollFrequency, `Interval between sending queued rule sync requests to ruler replicas.`)
f.DurationVar(&cfg.InboundSyncQueuePollInterval, "ruler.inbound-sync-queue-poll-interval", defaultRulerSyncPollFrequency, `Interval between applying queued incoming rule sync requests.`)

func (cfg *Config) syncQueuePollFrequency() time.Duration {
if cfg.rulerSyncQueuePollFrequency > 0 {
return cfg.rulerSyncQueuePollFrequency
}
return defaultRulerSyncPollFrequency
cfg.RingCheckPeriod = 5 * time.Second
}

type rulerMetrics struct {
Expand Down Expand Up @@ -365,8 +363,8 @@ func newRuler(cfg Config, manager MultiTenantManager, reg prometheus.Registerer,
logger: logger,
limits: limits,
clientsPool: clientPool,
outboundSyncQueue: newRulerSyncQueue(cfg.syncQueuePollFrequency()),
inboundSyncQueue: newRulerSyncQueue(cfg.syncQueuePollFrequency()),
outboundSyncQueue: newRulerSyncQueue(cfg.OutboundSyncQueuePollInterval),
inboundSyncQueue: newRulerSyncQueue(cfg.InboundSyncQueuePollInterval),
allowedTenants: util.NewAllowedTenants(cfg.EnabledTenants, cfg.DisabledTenants),
metrics: newRulerMetrics(reg),
}
Expand Down
12 changes: 8 additions & 4 deletions pkg/ruler/ruler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1010,7 +1010,8 @@ func TestRuler_NotifySyncRulesAsync_ShouldTriggerRulesSyncingOnAllRulersWhenEnab

rulerCfg := defaultRulerConfig(t)
rulerCfg.PollInterval = time.Hour
rulerCfg.rulerSyncQueuePollFrequency = 100 * time.Millisecond
rulerCfg.OutboundSyncQueuePollInterval = 100 * time.Millisecond
rulerCfg.InboundSyncQueuePollInterval = 100 * time.Millisecond
rulerCfg.Ring.NumTokens = 128
rulerCfg.Ring.Common.InstanceID = rulerAddr
rulerCfg.Ring.Common.InstanceAddr = rulerAddr
Expand Down Expand Up @@ -1158,7 +1159,8 @@ func TestRuler_NotifySyncRulesAsync_ShouldTriggerRulesSyncingAndCorrectlyHandleT

rulerCfg := defaultRulerConfig(t)
rulerCfg.PollInterval = time.Hour
rulerCfg.rulerSyncQueuePollFrequency = 100 * time.Millisecond
rulerCfg.OutboundSyncQueuePollInterval = 100 * time.Millisecond
rulerCfg.InboundSyncQueuePollInterval = 100 * time.Millisecond
rulerCfg.Ring.NumTokens = 128
rulerCfg.Ring.Common.InstanceID = rulerAddr
rulerCfg.Ring.Common.InstanceAddr = rulerAddr
Expand Down Expand Up @@ -1304,7 +1306,8 @@ func TestRuler_NotifySyncRulesAsync_ShouldNotTriggerRulesSyncingOnAllRulersWhenD

rulerCfg := defaultRulerConfig(t)
rulerCfg.PollInterval = time.Hour
rulerCfg.rulerSyncQueuePollFrequency = 100 * time.Millisecond
rulerCfg.OutboundSyncQueuePollInterval = 100 * time.Millisecond
rulerCfg.InboundSyncQueuePollInterval = 100 * time.Millisecond
rulerCfg.Ring.NumTokens = 128
rulerCfg.Ring.Common.InstanceID = rulerAddr
rulerCfg.Ring.Common.InstanceAddr = rulerAddr
Expand Down Expand Up @@ -1401,7 +1404,8 @@ func TestRuler_DeleteTenantConfiguration_ShouldDeleteTenantConfigurationAndTrigg
// once explicitly triggered by the change via API.
cfg := defaultRulerConfig(t)
cfg.PollInterval = time.Hour
cfg.rulerSyncQueuePollFrequency = 100 * time.Millisecond
cfg.OutboundSyncQueuePollInterval = 100 * time.Millisecond
cfg.InboundSyncQueuePollInterval = 100 * time.Millisecond
cfg.Ring.Common.InstanceAddr = "ruler-1"

reg := prometheus.NewPedanticRegistry()
Expand Down

0 comments on commit 61c1080

Please sign in to comment.