From ca837181f360c1b0f859adbb490f965aecd5a4df Mon Sep 17 00:00:00 2001 From: Patrick Mueller Date: Wed, 23 Mar 2022 12:03:53 -0400 Subject: [PATCH] [ResponseOps][task manager] log event loop delay for tasks when over configured limit (#126300) (#128377) resolves https://github.com/elastic/kibana/issues/124366 Adds new task manager configuration keys. - `xpack.task_manager.event_loop_delay.monitor` - whether to monitor event loop delay or not; added in case this specific monitoring causes other issues and we'd want to disable it. We don't know of any cases where we'd need this today - `xpack.task_manager.event_loop_delay.warn_threshold` - the number of milliseconds of event loop delay before logging a warning This code uses the `perf_hooks.monitorEventLoopDelay()` API[1] to collect the event loop delay while a task is running. [1] https://nodejs.org/api/perf_hooks.html#perf_hooksmonitoreventloopdelayoptions When a significant event loop delay is encountered, it's very likely that other tasks running at the same time will be affected, and so will also end up having a long event loop delay value, and warnings will be logged on those. Over time, though, tasks which have consistently long event loop delays will outnumber those unfortunate peer tasks, and be obvious from the volume in the logs. To make it a bit easier to find these when viewing Kibana logs in Discover, tags are added to the logged messages to make it easier to find them. One tag is `event-loop-blocked`, second is the task type, and the third is a string consisting of the task type and task id. (cherry picked from commit b028cf97ed9a554fef723be1f38ffabbfc41bac1) --- docs/settings/task-manager-settings.asciidoc | 5 ++ .../resources/base/bin/kibana-docker | 2 + .../task_manager/server/config.test.ts | 12 +++ x-pack/plugins/task_manager/server/config.ts | 10 +++ .../server/ephemeral_task_lifecycle.test.ts | 4 + .../managed_configuration.test.ts | 4 + .../configuration_statistics.test.ts | 4 + .../monitoring_stats_stream.test.ts | 4 + .../task_manager/server/plugin.test.ts | 12 +++ .../server/polling_lifecycle.test.ts | 4 + .../task_manager/server/polling_lifecycle.ts | 3 + .../task_manager/server/task_events.test.ts | 82 +++++++++++++++++++ .../task_manager/server/task_events.ts | 20 +++++ .../server/task_running/task_runner.test.ts | 10 ++- .../server/task_running/task_runner.ts | 21 ++++- 15 files changed, 194 insertions(+), 3 deletions(-) create mode 100644 x-pack/plugins/task_manager/server/task_events.test.ts diff --git a/docs/settings/task-manager-settings.asciidoc b/docs/settings/task-manager-settings.asciidoc index b7423d7c37b310..5f31c9adc879d2 100644 --- a/docs/settings/task-manager-settings.asciidoc +++ b/docs/settings/task-manager-settings.asciidoc @@ -40,6 +40,11 @@ These non-persisted action tasks have a risk that they won't be run at all if th `xpack.task_manager.ephemeral_tasks.request_capacity`:: Sets the size of the ephemeral queue defined above. Defaults to 10. +`xpack.task_manager.event_loop_delay.monitor`:: +Enables event loop delay monitoring, which will log a warning when a task causes an event loop delay which exceeds the `warn_threshold` setting. Defaults to true. + +`xpack.task_manager.event_loop_delay.warn_threshold`:: +Sets the amount of event loop delay during a task execution which will cause a warning to be logged. Defaults to 5000 milliseconds (5 seconds). [float] [[task-manager-health-settings]] diff --git a/src/dev/build/tasks/os_packages/docker_generator/resources/base/bin/kibana-docker b/src/dev/build/tasks/os_packages/docker_generator/resources/base/bin/kibana-docker index c46a41eb5c3a09..7c3966de2d011d 100755 --- a/src/dev/build/tasks/os_packages/docker_generator/resources/base/bin/kibana-docker +++ b/src/dev/build/tasks/os_packages/docker_generator/resources/base/bin/kibana-docker @@ -376,6 +376,8 @@ kibana_vars=( xpack.task_manager.poll_interval xpack.task_manager.request_capacity xpack.task_manager.version_conflict_threshold + xpack.task_manager.event_loop_delay.monitor + xpack.task_manager.event_loop_delay.warn_threshold xpack.uptime.index ) diff --git a/x-pack/plugins/task_manager/server/config.test.ts b/x-pack/plugins/task_manager/server/config.test.ts index 4c4db2aba71285..f5ba0a3bcee0a7 100644 --- a/x-pack/plugins/task_manager/server/config.test.ts +++ b/x-pack/plugins/task_manager/server/config.test.ts @@ -16,6 +16,10 @@ describe('config validation', () => { "enabled": false, "request_capacity": 10, }, + "event_loop_delay": Object { + "monitor": true, + "warn_threshold": 5000, + }, "max_attempts": 3, "max_poll_inactivity_cycles": 10, "max_workers": 10, @@ -62,6 +66,10 @@ describe('config validation', () => { "enabled": false, "request_capacity": 10, }, + "event_loop_delay": Object { + "monitor": true, + "warn_threshold": 5000, + }, "max_attempts": 3, "max_poll_inactivity_cycles": 10, "max_workers": 10, @@ -106,6 +114,10 @@ describe('config validation', () => { "enabled": false, "request_capacity": 10, }, + "event_loop_delay": Object { + "monitor": true, + "warn_threshold": 5000, + }, "max_attempts": 3, "max_poll_inactivity_cycles": 10, "max_workers": 10, diff --git a/x-pack/plugins/task_manager/server/config.ts b/x-pack/plugins/task_manager/server/config.ts index 5a58e45a70d96d..f650ed093cee0c 100644 --- a/x-pack/plugins/task_manager/server/config.ts +++ b/x-pack/plugins/task_manager/server/config.ts @@ -41,6 +41,14 @@ export const taskExecutionFailureThresholdSchema = schema.object( } ); +const eventLoopDelaySchema = schema.object({ + monitor: schema.boolean({ defaultValue: true }), + warn_threshold: schema.number({ + defaultValue: 5000, + min: 10, + }), +}); + export const configSchema = schema.object( { /* The maximum number of times a task will be attempted before being abandoned as failed */ @@ -118,6 +126,7 @@ export const configSchema = schema.object( max: DEFAULT_MAX_EPHEMERAL_REQUEST_CAPACITY, }), }), + event_loop_delay: eventLoopDelaySchema, /* These are not designed to be used by most users. Please use caution when changing these */ unsafe: schema.object({ exclude_task_types: schema.arrayOf(schema.string(), { defaultValue: [] }), @@ -138,3 +147,4 @@ export const configSchema = schema.object( export type TaskManagerConfig = TypeOf; export type TaskExecutionFailureThreshold = TypeOf; +export type EventLoopDelayConfig = TypeOf; diff --git a/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts b/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts index 639bb834eeb4c7..1d98e37a06a551 100644 --- a/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts +++ b/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts @@ -69,6 +69,10 @@ describe('EphemeralTaskLifecycle', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, ...config, }, elasticsearchAndSOAvailability$, diff --git a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts index 3442e69aab44a6..c5f03b17693857 100644 --- a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts +++ b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts @@ -57,6 +57,10 @@ describe.skip('managed configuration', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }); logger = context.logger.get('taskManager'); diff --git a/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts index 77fd9a8f11fabe..776f5bc9388f7b 100644 --- a/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts @@ -40,6 +40,10 @@ describe('Configuration Statistics Aggregator', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }; const managedConfig = { diff --git a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts index 8aa2d54d896238..a6ef665966ddd7 100644 --- a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts @@ -44,6 +44,10 @@ describe('createMonitoringStatsStream', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }; it('returns the initial config used to configure Task Manager', async () => { diff --git a/x-pack/plugins/task_manager/server/plugin.test.ts b/x-pack/plugins/task_manager/server/plugin.test.ts index 20e5f211a5b4ed..aa91533eabadf9 100644 --- a/x-pack/plugins/task_manager/server/plugin.test.ts +++ b/x-pack/plugins/task_manager/server/plugin.test.ts @@ -43,6 +43,10 @@ describe('TaskManagerPlugin', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }); pluginInitializerContext.env.instanceUuid = ''; @@ -84,6 +88,10 @@ describe('TaskManagerPlugin', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }); const taskManagerPlugin = new TaskManagerPlugin(pluginInitializerContext); @@ -154,6 +162,10 @@ describe('TaskManagerPlugin', () => { unsafe: { exclude_task_types: ['*'], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }); const logger = pluginInitializerContext.logger.get(); diff --git a/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts b/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts index b6a93b14f578b7..54320d9d90b5d7 100644 --- a/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts +++ b/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts @@ -67,6 +67,10 @@ describe('TaskPollingLifecycle', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }, taskStore: mockTaskStore, logger: taskManagerLogger, diff --git a/x-pack/plugins/task_manager/server/polling_lifecycle.ts b/x-pack/plugins/task_manager/server/polling_lifecycle.ts index b61891d732f5e3..4a92addc6f187b 100644 --- a/x-pack/plugins/task_manager/server/polling_lifecycle.ts +++ b/x-pack/plugins/task_manager/server/polling_lifecycle.ts @@ -90,6 +90,7 @@ export class TaskPollingLifecycle { private middleware: Middleware; private usageCounter?: UsageCounter; + private config: TaskManagerConfig; /** * Initializes the task manager, preventing any further addition of middleware, @@ -115,6 +116,7 @@ export class TaskPollingLifecycle { this.store = taskStore; this.executionContext = executionContext; this.usageCounter = usageCounter; + this.config = config; const emitEvent = (event: TaskLifecycleEvent) => this.events$.next(event); @@ -237,6 +239,7 @@ export class TaskPollingLifecycle { defaultMaxAttempts: this.taskClaiming.maxAttempts, executionContext: this.executionContext, usageCounter: this.usageCounter, + eventLoopDelayConfig: { ...this.config.event_loop_delay }, }); }; diff --git a/x-pack/plugins/task_manager/server/task_events.test.ts b/x-pack/plugins/task_manager/server/task_events.test.ts new file mode 100644 index 00000000000000..5d72120da725ca --- /dev/null +++ b/x-pack/plugins/task_manager/server/task_events.test.ts @@ -0,0 +1,82 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { startTaskTimer, startTaskTimerWithEventLoopMonitoring } from './task_events'; + +const DelayIterations = 4; +const DelayMillis = 250; +const DelayTotal = DelayIterations * DelayMillis; + +async function nonBlockingDelay(millis: number) { + await new Promise((resolve) => setTimeout(resolve, millis)); +} + +async function blockingDelay(millis: number) { + // get task in async queue + await nonBlockingDelay(0); + + const end = Date.now() + millis; + // eslint-disable-next-line no-empty + while (Date.now() < end) {} +} + +async function nonBlockingTask() { + for (let i = 0; i < DelayIterations; i++) { + await nonBlockingDelay(DelayMillis); + } +} + +async function blockingTask() { + for (let i = 0; i < DelayIterations; i++) { + await blockingDelay(DelayMillis); + } +} + +describe('task_events', () => { + test('startTaskTimer', async () => { + const stopTaskTimer = startTaskTimer(); + await nonBlockingTask(); + const result = stopTaskTimer(); + expect(result.stop - result.start).not.toBeLessThan(DelayTotal); + expect(result.eventLoopBlockMs).toBe(undefined); + }); + + describe('startTaskTimerWithEventLoopMonitoring', () => { + test('non-blocking', async () => { + const stopTaskTimer = startTaskTimerWithEventLoopMonitoring({ + monitor: true, + warn_threshold: 5000, + }); + await nonBlockingTask(); + const result = stopTaskTimer(); + expect(result.stop - result.start).not.toBeLessThan(DelayTotal); + expect(result.eventLoopBlockMs).toBeLessThan(DelayMillis); + }); + + test('blocking', async () => { + const stopTaskTimer = startTaskTimerWithEventLoopMonitoring({ + monitor: true, + warn_threshold: 5000, + }); + await blockingTask(); + const result = stopTaskTimer(); + expect(result.stop - result.start).not.toBeLessThan(DelayTotal); + expect(result.eventLoopBlockMs).not.toBeLessThan(DelayMillis); + }); + + test('not monitoring', async () => { + const stopTaskTimer = startTaskTimerWithEventLoopMonitoring({ + monitor: false, + warn_threshold: 5000, + }); + await blockingTask(); + const result = stopTaskTimer(); + expect(result.stop - result.start).not.toBeLessThan(DelayTotal); + expect(result.eventLoopBlockMs).toBe(0); + }); + }); +}); diff --git a/x-pack/plugins/task_manager/server/task_events.ts b/x-pack/plugins/task_manager/server/task_events.ts index 7c7845569a10b5..de2c9dc04acd27 100644 --- a/x-pack/plugins/task_manager/server/task_events.ts +++ b/x-pack/plugins/task_manager/server/task_events.ts @@ -5,6 +5,8 @@ * 2.0. */ +import { monitorEventLoopDelay } from 'perf_hooks'; + import { Option } from 'fp-ts/lib/Option'; import { ConcreteTaskInstance } from './task'; @@ -14,6 +16,7 @@ import { ClaimAndFillPoolResult } from './lib/fill_pool'; import { PollingError } from './polling'; import { TaskRunResult } from './task_running'; import { EphemeralTaskInstanceRequest } from './ephemeral_task_lifecycle'; +import type { EventLoopDelayConfig } from './config'; export enum TaskPersistence { Recurring = 'recurring', @@ -40,6 +43,7 @@ export enum TaskClaimErrorType { export interface TaskTiming { start: number; stop: number; + eventLoopBlockMs?: number; } export type WithTaskTiming = T & { timing: TaskTiming }; @@ -48,6 +52,22 @@ export function startTaskTimer(): () => TaskTiming { return () => ({ start, stop: Date.now() }); } +export function startTaskTimerWithEventLoopMonitoring( + eventLoopDelayConfig: EventLoopDelayConfig +): () => TaskTiming { + const stopTaskTimer = startTaskTimer(); + const eldHistogram = eventLoopDelayConfig.monitor ? monitorEventLoopDelay() : null; + eldHistogram?.enable(); + + return () => { + const { start, stop } = stopTaskTimer(); + eldHistogram?.disable(); + const eldMax = eldHistogram?.max ?? 0; + const eventLoopBlockMs = Math.round(eldMax / 1000 / 1000); // original in nanoseconds + return { start, stop, eventLoopBlockMs }; + }; +} + export interface TaskEvent { id?: ID; timing?: TaskTiming; diff --git a/x-pack/plugins/task_manager/server/task_running/task_runner.test.ts b/x-pack/plugins/task_manager/server/task_running/task_runner.test.ts index 09af125884fe92..ece82099728e3b 100644 --- a/x-pack/plugins/task_manager/server/task_running/task_runner.test.ts +++ b/x-pack/plugins/task_manager/server/task_running/task_runner.test.ts @@ -1528,7 +1528,11 @@ describe('TaskManagerRunner', () => { function withAnyTiming(taskRun: TaskRun) { return { ...taskRun, - timing: { start: expect.any(Number), stop: expect.any(Number) }, + timing: { + start: expect.any(Number), + stop: expect.any(Number), + eventLoopBlockMs: expect.any(Number), + }, }; } @@ -1590,6 +1594,10 @@ describe('TaskManagerRunner', () => { onTaskEvent: opts.onTaskEvent, executionContext, usageCounter, + eventLoopDelayConfig: { + monitor: true, + warn_threshold: 5000, + }, }); if (stage === TaskRunningStage.READY_TO_RUN) { diff --git a/x-pack/plugins/task_manager/server/task_running/task_runner.ts b/x-pack/plugins/task_manager/server/task_running/task_runner.ts index 48927435c4bdf9..778a834c168a14 100644 --- a/x-pack/plugins/task_manager/server/task_running/task_runner.ts +++ b/x-pack/plugins/task_manager/server/task_running/task_runner.ts @@ -38,7 +38,7 @@ import { TaskMarkRunning, asTaskRunEvent, asTaskMarkRunningEvent, - startTaskTimer, + startTaskTimerWithEventLoopMonitoring, TaskTiming, TaskPersistence, } from '../task_events'; @@ -56,6 +56,7 @@ import { } from '../task'; import { TaskTypeDictionary } from '../task_type_dictionary'; import { isUnrecoverableError } from './errors'; +import type { EventLoopDelayConfig } from '../config'; const defaultBackoffPerFailure = 5 * 60 * 1000; export const EMPTY_RUN_RESULT: SuccessfulRunResult = { state: {} }; @@ -105,6 +106,7 @@ type Opts = { defaultMaxAttempts: number; executionContext: ExecutionContextStart; usageCounter?: UsageCounter; + eventLoopDelayConfig: EventLoopDelayConfig; } & Pick; export enum TaskRunResult { @@ -152,6 +154,7 @@ export class TaskManagerRunner implements TaskRunner { private uuid: string; private readonly executionContext: ExecutionContextStart; private usageCounter?: UsageCounter; + private eventLoopDelayConfig: EventLoopDelayConfig; /** * Creates an instance of TaskManagerRunner. @@ -174,6 +177,7 @@ export class TaskManagerRunner implements TaskRunner { onTaskEvent = identity, executionContext, usageCounter, + eventLoopDelayConfig, }: Opts) { this.instance = asPending(sanitizeInstance(instance)); this.definitions = definitions; @@ -186,6 +190,7 @@ export class TaskManagerRunner implements TaskRunner { this.executionContext = executionContext; this.usageCounter = usageCounter; this.uuid = uuid.v4(); + this.eventLoopDelayConfig = eventLoopDelayConfig; } /** @@ -292,7 +297,7 @@ export class TaskManagerRunner implements TaskRunner { taskInstance: this.instance.task, }); - const stopTaskTimer = startTaskTimer(); + const stopTaskTimer = startTaskTimerWithEventLoopMonitoring(this.eventLoopDelayConfig); try { this.task = this.definition.createTaskRunner(modifiedContext); @@ -617,6 +622,18 @@ export class TaskManagerRunner implements TaskRunner { ); } ); + + const { eventLoopBlockMs = 0 } = taskTiming; + const taskLabel = `${this.taskType} ${this.instance.task.id}`; + if (eventLoopBlockMs > this.eventLoopDelayConfig.warn_threshold) { + this.logger.warn( + `event loop blocked for at least ${eventLoopBlockMs} ms while running task ${taskLabel}`, + { + tags: [this.taskType, taskLabel, 'event-loop-blocked'], + } + ); + } + return result; }