diff --git a/docs/settings/task-manager-settings.asciidoc b/docs/settings/task-manager-settings.asciidoc index b7423d7c37b310..5f31c9adc879d2 100644 --- a/docs/settings/task-manager-settings.asciidoc +++ b/docs/settings/task-manager-settings.asciidoc @@ -40,6 +40,11 @@ These non-persisted action tasks have a risk that they won't be run at all if th `xpack.task_manager.ephemeral_tasks.request_capacity`:: Sets the size of the ephemeral queue defined above. Defaults to 10. +`xpack.task_manager.event_loop_delay.monitor`:: +Enables event loop delay monitoring, which will log a warning when a task causes an event loop delay which exceeds the `warn_threshold` setting. Defaults to true. + +`xpack.task_manager.event_loop_delay.warn_threshold`:: +Sets the amount of event loop delay during a task execution which will cause a warning to be logged. Defaults to 5000 milliseconds (5 seconds). [float] [[task-manager-health-settings]] diff --git a/src/dev/build/tasks/os_packages/docker_generator/resources/base/bin/kibana-docker b/src/dev/build/tasks/os_packages/docker_generator/resources/base/bin/kibana-docker index 01d27a345378bf..83a542c93d12b9 100755 --- a/src/dev/build/tasks/os_packages/docker_generator/resources/base/bin/kibana-docker +++ b/src/dev/build/tasks/os_packages/docker_generator/resources/base/bin/kibana-docker @@ -379,6 +379,8 @@ kibana_vars=( xpack.task_manager.poll_interval xpack.task_manager.request_capacity xpack.task_manager.version_conflict_threshold + xpack.task_manager.event_loop_delay.monitor + xpack.task_manager.event_loop_delay.warn_threshold xpack.uptime.index ) diff --git a/x-pack/plugins/task_manager/server/config.test.ts b/x-pack/plugins/task_manager/server/config.test.ts index 4c4db2aba71285..f5ba0a3bcee0a7 100644 --- a/x-pack/plugins/task_manager/server/config.test.ts +++ b/x-pack/plugins/task_manager/server/config.test.ts @@ -16,6 +16,10 @@ describe('config validation', () => { "enabled": false, "request_capacity": 10, }, + "event_loop_delay": Object { + "monitor": true, + "warn_threshold": 5000, + }, "max_attempts": 3, "max_poll_inactivity_cycles": 10, "max_workers": 10, @@ -62,6 +66,10 @@ describe('config validation', () => { "enabled": false, "request_capacity": 10, }, + "event_loop_delay": Object { + "monitor": true, + "warn_threshold": 5000, + }, "max_attempts": 3, "max_poll_inactivity_cycles": 10, "max_workers": 10, @@ -106,6 +114,10 @@ describe('config validation', () => { "enabled": false, "request_capacity": 10, }, + "event_loop_delay": Object { + "monitor": true, + "warn_threshold": 5000, + }, "max_attempts": 3, "max_poll_inactivity_cycles": 10, "max_workers": 10, diff --git a/x-pack/plugins/task_manager/server/config.ts b/x-pack/plugins/task_manager/server/config.ts index 5a58e45a70d96d..f650ed093cee0c 100644 --- a/x-pack/plugins/task_manager/server/config.ts +++ b/x-pack/plugins/task_manager/server/config.ts @@ -41,6 +41,14 @@ export const taskExecutionFailureThresholdSchema = schema.object( } ); +const eventLoopDelaySchema = schema.object({ + monitor: schema.boolean({ defaultValue: true }), + warn_threshold: schema.number({ + defaultValue: 5000, + min: 10, + }), +}); + export const configSchema = schema.object( { /* The maximum number of times a task will be attempted before being abandoned as failed */ @@ -118,6 +126,7 @@ export const configSchema = schema.object( max: DEFAULT_MAX_EPHEMERAL_REQUEST_CAPACITY, }), }), + event_loop_delay: eventLoopDelaySchema, /* These are not designed to be used by most users. Please use caution when changing these */ unsafe: schema.object({ exclude_task_types: schema.arrayOf(schema.string(), { defaultValue: [] }), @@ -138,3 +147,4 @@ export const configSchema = schema.object( export type TaskManagerConfig = TypeOf; export type TaskExecutionFailureThreshold = TypeOf; +export type EventLoopDelayConfig = TypeOf; diff --git a/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts b/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts index 639bb834eeb4c7..1d98e37a06a551 100644 --- a/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts +++ b/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts @@ -69,6 +69,10 @@ describe('EphemeralTaskLifecycle', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, ...config, }, elasticsearchAndSOAvailability$, diff --git a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts index 3442e69aab44a6..c5f03b17693857 100644 --- a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts +++ b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts @@ -57,6 +57,10 @@ describe.skip('managed configuration', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }); logger = context.logger.get('taskManager'); diff --git a/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts index 77fd9a8f11fabe..776f5bc9388f7b 100644 --- a/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts @@ -40,6 +40,10 @@ describe('Configuration Statistics Aggregator', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }; const managedConfig = { diff --git a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts index 8aa2d54d896238..a6ef665966ddd7 100644 --- a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts @@ -44,6 +44,10 @@ describe('createMonitoringStatsStream', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }; it('returns the initial config used to configure Task Manager', async () => { diff --git a/x-pack/plugins/task_manager/server/plugin.test.ts b/x-pack/plugins/task_manager/server/plugin.test.ts index 20e5f211a5b4ed..aa91533eabadf9 100644 --- a/x-pack/plugins/task_manager/server/plugin.test.ts +++ b/x-pack/plugins/task_manager/server/plugin.test.ts @@ -43,6 +43,10 @@ describe('TaskManagerPlugin', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }); pluginInitializerContext.env.instanceUuid = ''; @@ -84,6 +88,10 @@ describe('TaskManagerPlugin', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }); const taskManagerPlugin = new TaskManagerPlugin(pluginInitializerContext); @@ -154,6 +162,10 @@ describe('TaskManagerPlugin', () => { unsafe: { exclude_task_types: ['*'], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }); const logger = pluginInitializerContext.logger.get(); diff --git a/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts b/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts index cf29d1f475c6ce..7cbaa5a1655449 100644 --- a/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts +++ b/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts @@ -67,6 +67,10 @@ describe('TaskPollingLifecycle', () => { unsafe: { exclude_task_types: [], }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, }, taskStore: mockTaskStore, logger: taskManagerLogger, diff --git a/x-pack/plugins/task_manager/server/polling_lifecycle.ts b/x-pack/plugins/task_manager/server/polling_lifecycle.ts index a452c8a3f82fbe..ee7e2ec32932ee 100644 --- a/x-pack/plugins/task_manager/server/polling_lifecycle.ts +++ b/x-pack/plugins/task_manager/server/polling_lifecycle.ts @@ -91,6 +91,7 @@ export class TaskPollingLifecycle { private middleware: Middleware; private usageCounter?: UsageCounter; + private config: TaskManagerConfig; /** * Initializes the task manager, preventing any further addition of middleware, @@ -117,6 +118,7 @@ export class TaskPollingLifecycle { this.store = taskStore; this.executionContext = executionContext; this.usageCounter = usageCounter; + this.config = config; const emitEvent = (event: TaskLifecycleEvent) => this.events$.next(event); @@ -240,6 +242,7 @@ export class TaskPollingLifecycle { defaultMaxAttempts: this.taskClaiming.maxAttempts, executionContext: this.executionContext, usageCounter: this.usageCounter, + eventLoopDelayConfig: { ...this.config.event_loop_delay }, }); }; diff --git a/x-pack/plugins/task_manager/server/task_events.test.ts b/x-pack/plugins/task_manager/server/task_events.test.ts new file mode 100644 index 00000000000000..5d72120da725ca --- /dev/null +++ b/x-pack/plugins/task_manager/server/task_events.test.ts @@ -0,0 +1,82 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { startTaskTimer, startTaskTimerWithEventLoopMonitoring } from './task_events'; + +const DelayIterations = 4; +const DelayMillis = 250; +const DelayTotal = DelayIterations * DelayMillis; + +async function nonBlockingDelay(millis: number) { + await new Promise((resolve) => setTimeout(resolve, millis)); +} + +async function blockingDelay(millis: number) { + // get task in async queue + await nonBlockingDelay(0); + + const end = Date.now() + millis; + // eslint-disable-next-line no-empty + while (Date.now() < end) {} +} + +async function nonBlockingTask() { + for (let i = 0; i < DelayIterations; i++) { + await nonBlockingDelay(DelayMillis); + } +} + +async function blockingTask() { + for (let i = 0; i < DelayIterations; i++) { + await blockingDelay(DelayMillis); + } +} + +describe('task_events', () => { + test('startTaskTimer', async () => { + const stopTaskTimer = startTaskTimer(); + await nonBlockingTask(); + const result = stopTaskTimer(); + expect(result.stop - result.start).not.toBeLessThan(DelayTotal); + expect(result.eventLoopBlockMs).toBe(undefined); + }); + + describe('startTaskTimerWithEventLoopMonitoring', () => { + test('non-blocking', async () => { + const stopTaskTimer = startTaskTimerWithEventLoopMonitoring({ + monitor: true, + warn_threshold: 5000, + }); + await nonBlockingTask(); + const result = stopTaskTimer(); + expect(result.stop - result.start).not.toBeLessThan(DelayTotal); + expect(result.eventLoopBlockMs).toBeLessThan(DelayMillis); + }); + + test('blocking', async () => { + const stopTaskTimer = startTaskTimerWithEventLoopMonitoring({ + monitor: true, + warn_threshold: 5000, + }); + await blockingTask(); + const result = stopTaskTimer(); + expect(result.stop - result.start).not.toBeLessThan(DelayTotal); + expect(result.eventLoopBlockMs).not.toBeLessThan(DelayMillis); + }); + + test('not monitoring', async () => { + const stopTaskTimer = startTaskTimerWithEventLoopMonitoring({ + monitor: false, + warn_threshold: 5000, + }); + await blockingTask(); + const result = stopTaskTimer(); + expect(result.stop - result.start).not.toBeLessThan(DelayTotal); + expect(result.eventLoopBlockMs).toBe(0); + }); + }); +}); diff --git a/x-pack/plugins/task_manager/server/task_events.ts b/x-pack/plugins/task_manager/server/task_events.ts index 7c7845569a10b5..de2c9dc04acd27 100644 --- a/x-pack/plugins/task_manager/server/task_events.ts +++ b/x-pack/plugins/task_manager/server/task_events.ts @@ -5,6 +5,8 @@ * 2.0. */ +import { monitorEventLoopDelay } from 'perf_hooks'; + import { Option } from 'fp-ts/lib/Option'; import { ConcreteTaskInstance } from './task'; @@ -14,6 +16,7 @@ import { ClaimAndFillPoolResult } from './lib/fill_pool'; import { PollingError } from './polling'; import { TaskRunResult } from './task_running'; import { EphemeralTaskInstanceRequest } from './ephemeral_task_lifecycle'; +import type { EventLoopDelayConfig } from './config'; export enum TaskPersistence { Recurring = 'recurring', @@ -40,6 +43,7 @@ export enum TaskClaimErrorType { export interface TaskTiming { start: number; stop: number; + eventLoopBlockMs?: number; } export type WithTaskTiming = T & { timing: TaskTiming }; @@ -48,6 +52,22 @@ export function startTaskTimer(): () => TaskTiming { return () => ({ start, stop: Date.now() }); } +export function startTaskTimerWithEventLoopMonitoring( + eventLoopDelayConfig: EventLoopDelayConfig +): () => TaskTiming { + const stopTaskTimer = startTaskTimer(); + const eldHistogram = eventLoopDelayConfig.monitor ? monitorEventLoopDelay() : null; + eldHistogram?.enable(); + + return () => { + const { start, stop } = stopTaskTimer(); + eldHistogram?.disable(); + const eldMax = eldHistogram?.max ?? 0; + const eventLoopBlockMs = Math.round(eldMax / 1000 / 1000); // original in nanoseconds + return { start, stop, eventLoopBlockMs }; + }; +} + export interface TaskEvent { id?: ID; timing?: TaskTiming; diff --git a/x-pack/plugins/task_manager/server/task_running/task_runner.test.ts b/x-pack/plugins/task_manager/server/task_running/task_runner.test.ts index 09af125884fe92..ece82099728e3b 100644 --- a/x-pack/plugins/task_manager/server/task_running/task_runner.test.ts +++ b/x-pack/plugins/task_manager/server/task_running/task_runner.test.ts @@ -1528,7 +1528,11 @@ describe('TaskManagerRunner', () => { function withAnyTiming(taskRun: TaskRun) { return { ...taskRun, - timing: { start: expect.any(Number), stop: expect.any(Number) }, + timing: { + start: expect.any(Number), + stop: expect.any(Number), + eventLoopBlockMs: expect.any(Number), + }, }; } @@ -1590,6 +1594,10 @@ describe('TaskManagerRunner', () => { onTaskEvent: opts.onTaskEvent, executionContext, usageCounter, + eventLoopDelayConfig: { + monitor: true, + warn_threshold: 5000, + }, }); if (stage === TaskRunningStage.READY_TO_RUN) { diff --git a/x-pack/plugins/task_manager/server/task_running/task_runner.ts b/x-pack/plugins/task_manager/server/task_running/task_runner.ts index 48927435c4bdf9..778a834c168a14 100644 --- a/x-pack/plugins/task_manager/server/task_running/task_runner.ts +++ b/x-pack/plugins/task_manager/server/task_running/task_runner.ts @@ -38,7 +38,7 @@ import { TaskMarkRunning, asTaskRunEvent, asTaskMarkRunningEvent, - startTaskTimer, + startTaskTimerWithEventLoopMonitoring, TaskTiming, TaskPersistence, } from '../task_events'; @@ -56,6 +56,7 @@ import { } from '../task'; import { TaskTypeDictionary } from '../task_type_dictionary'; import { isUnrecoverableError } from './errors'; +import type { EventLoopDelayConfig } from '../config'; const defaultBackoffPerFailure = 5 * 60 * 1000; export const EMPTY_RUN_RESULT: SuccessfulRunResult = { state: {} }; @@ -105,6 +106,7 @@ type Opts = { defaultMaxAttempts: number; executionContext: ExecutionContextStart; usageCounter?: UsageCounter; + eventLoopDelayConfig: EventLoopDelayConfig; } & Pick; export enum TaskRunResult { @@ -152,6 +154,7 @@ export class TaskManagerRunner implements TaskRunner { private uuid: string; private readonly executionContext: ExecutionContextStart; private usageCounter?: UsageCounter; + private eventLoopDelayConfig: EventLoopDelayConfig; /** * Creates an instance of TaskManagerRunner. @@ -174,6 +177,7 @@ export class TaskManagerRunner implements TaskRunner { onTaskEvent = identity, executionContext, usageCounter, + eventLoopDelayConfig, }: Opts) { this.instance = asPending(sanitizeInstance(instance)); this.definitions = definitions; @@ -186,6 +190,7 @@ export class TaskManagerRunner implements TaskRunner { this.executionContext = executionContext; this.usageCounter = usageCounter; this.uuid = uuid.v4(); + this.eventLoopDelayConfig = eventLoopDelayConfig; } /** @@ -292,7 +297,7 @@ export class TaskManagerRunner implements TaskRunner { taskInstance: this.instance.task, }); - const stopTaskTimer = startTaskTimer(); + const stopTaskTimer = startTaskTimerWithEventLoopMonitoring(this.eventLoopDelayConfig); try { this.task = this.definition.createTaskRunner(modifiedContext); @@ -617,6 +622,18 @@ export class TaskManagerRunner implements TaskRunner { ); } ); + + const { eventLoopBlockMs = 0 } = taskTiming; + const taskLabel = `${this.taskType} ${this.instance.task.id}`; + if (eventLoopBlockMs > this.eventLoopDelayConfig.warn_threshold) { + this.logger.warn( + `event loop blocked for at least ${eventLoopBlockMs} ms while running task ${taskLabel}`, + { + tags: [this.taskType, taskLabel, 'event-loop-blocked'], + } + ); + } + return result; }