Skip to content

Commit

Permalink
[Auto Import] Use larger number of samples on the backend (elastic#19…
Browse files Browse the repository at this point in the history
…6233)

## Release Notes

Automatic Import now analyses a larger number of samples to generate an
integration.

## Summary

Closes elastic/security-team#9844

**Added: Backend Sampling**

We pass 100 rows (these numeric values are adjustable) to the backend
[^1]

[^1]: As before, deterministically selected on the frontend, see
elastic#191598


The Categorization chain now processes the samples in batches,
performing after initial categorization a number of review cycles (but
not more than 5, tuned so that we stay under the 2 minute limit for a
single API call).

To decide when to stop processing we keep the list of _stable_ samples
as follows:

1. The list is initially empty.
2. For each review we select a random subset of 40 samples, preferring
to pick up the not-stable samples.
3. After each review – when the LLM potentially gives us new processors
or changes the old ones – we compare the new pipeline results with the
old pipeline results.
4. Those reviewed samples that did not change their categorization are
added to the stable list.
5. Any samples that have changed their categorization are removed from
the stable list.
6. If all samples are stable, we finish processing.

**Removed: User Notification**

Using 100 samples provides a balance between the expected complexity and
the time budget we work with. We might want to change it in the future,
possibly dynamically, making the specific number unimportant to the
user. Thus we remove the truncation notification.

**Unchanged:**

- No batching is done in the related chain: it seems to work as-is.

**Refactored:**

- We centralize the sizing constants in the
`x-pack/plugins/integration_assistant/common/constants.ts` file.
- We remove the unused state key `formattedSamples` and combine
`modelJSONInput` back into `modelInput`.

> [!NOTE]  
> I had difficulty generating new graph diagrams, so they remain
unchanged.
  • Loading branch information
ilyannn authored Oct 15, 2024
1 parent c119a6a commit fc3ce54
Show file tree
Hide file tree
Showing 31 changed files with 534 additions and 190 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,6 @@ export const testPipelineInvalidEcs: { pipelineResults: object[]; errors: object
export const categorizationTestState = {
rawSamples: ['{"test1": "test1"}'],
samples: ['{ "test1": "test1" }'],
formattedSamples: '{"test1": "test1"}',
ecsTypes: 'testtypes',
ecsCategories: 'testcategories',
exAnswer: 'testanswer',
Expand All @@ -173,9 +172,8 @@ export const categorizationTestState = {
previousError: 'testprevious',
previousInvalidCategorization: 'testinvalid',
pipelineResults: [{ test: 'testresult' }],
finalized: false,
hasTriedOnce: false,
reviewed: false,
previousPipelineResults: [{ test: 'testresult' }],
lastReviewedSamples: [],
currentPipeline: { test: 'testpipeline' },
currentProcessors: [
{
Expand All @@ -193,6 +191,9 @@ export const categorizationTestState = {
initialPipeline: categorizationInitialPipeline,
results: { test: 'testresults' },
samplesFormat: { name: SamplesFormatName.Values.json },
stableSamples: [],
reviewCount: 0,
finalized: false,
};

export const categorizationMockProcessors = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ export const testPipelineValidResult: { pipelineResults: object[]; errors: objec
export const relatedTestState = {
rawSamples: ['{"test1": "test1"}'],
samples: ['{ "test1": "test1" }'],
formattedSamples: '{"test1": "test1"}',
ecs: 'testtypes',
exAnswer: 'testanswer',
packageName: 'testpackage',
Expand Down
8 changes: 8 additions & 0 deletions x-pack/plugins/integration_assistant/common/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,11 @@ export enum GenerationErrorCode {
UNSUPPORTED_LOG_SAMPLES_FORMAT = 'unsupported-log-samples-format',
UNPARSEABLE_CSV_DATA = 'unparseable-csv-data',
}

// Size limits
export const FRONTEND_SAMPLE_ROWS = 100;
export const LOG_FORMAT_DETECTION_SAMPLE_ROWS = 5;
export const CATEGORIZATION_INITIAL_BATCH_SIZE = 60;
export const CATEROGIZATION_REVIEW_BATCH_SIZE = 40;
export const CATEGORIZATION_REVIEW_MAX_CYCLES = 5;
export const CATEGORIZATION_RECURSION_LIMIT = 50;
2 changes: 2 additions & 0 deletions x-pack/plugins/integration_assistant/common/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ export {
} from './api/analyze_logs/analyze_logs_route.gen';
export { CelInputRequestBody, CelInputResponse } from './api/cel/cel_input_route.gen';

export { partialShuffleArray } from './utils';

export type {
DataStream,
InputType,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import { TestProvider } from '../../../../../mocks/test_provider';
import { parseNDJSON, parseJSONArray, SampleLogsInput } from './sample_logs_input';
import { ActionsProvider } from '../../state';
import { mockActions } from '../../mocks/state';
import { mockServices } from '../../../../../services/mocks/services';

const wrapper: React.FC<React.PropsWithChildren<{}>> = ({ children }) => (
<TestProvider>
Expand Down Expand Up @@ -165,25 +164,6 @@ describe('SampleLogsInput', () => {
samplesFormat: { name: 'json', json_path: [] },
});
});

describe('when the file has too many rows', () => {
const tooLargeLogsSample = Array(6).fill(logsSampleRaw).join(','); // 12 entries
beforeEach(async () => {
await changeFile(input, new File([`[${tooLargeLogsSample}]`], 'test.json', { type }));
});

it('should truncate the logs sample', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logSamples: tooLargeLogsSample.split(',').slice(0, 2),
samplesFormat: { name: 'json', json_path: [] },
});
});
it('should add a notification toast', () => {
expect(mockServices.notifications.toasts.addInfo).toBeCalledWith(
`The logs sample has been truncated to 10 rows.`
);
});
});
});

describe('when the file is a json array under a key', () => {
Expand Down Expand Up @@ -236,25 +216,6 @@ describe('SampleLogsInput', () => {
samplesFormat: { name: 'ndjson', multiline: false },
});
});

describe('when the file has too many rows', () => {
const tooLargeLogsSample = Array(6).fill(simpleNDJSON).join('\n'); // 12 entries
beforeEach(async () => {
await changeFile(input, new File([tooLargeLogsSample], 'test.json', { type }));
});

it('should truncate the logs sample', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logSamples: tooLargeLogsSample.split('\n').slice(0, 2),
samplesFormat: { name: 'ndjson', multiline: false },
});
});
it('should add a notification toast', () => {
expect(mockServices.notifications.toasts.addInfo).toBeCalledWith(
`The logs sample has been truncated to 10 rows.`
);
});
});
});

describe('when the file is a an ndjson with a single record', () => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,12 @@
import React, { useCallback, useState } from 'react';
import { EuiCallOut, EuiFilePicker, EuiFormRow, EuiSpacer, EuiText } from '@elastic/eui';
import { isPlainObject } from 'lodash/fp';
import { useKibana } from '@kbn/kibana-react-plugin/public';
import type { IntegrationSettings } from '../../types';
import * as i18n from './translations';
import { useActions } from '../../state';
import type { SamplesFormat } from '../../../../../../common';
import { partialShuffleArray } from './utils';

const MaxLogsSampleRows = 10;
import { partialShuffleArray } from '../../../../../../common';
import { FRONTEND_SAMPLE_ROWS } from '../../../../../../common/constants';

/**
* Parse the logs sample file content as newline-delimited JSON (NDJSON).
Expand Down Expand Up @@ -83,8 +81,8 @@ export const parseJSONArray = (
* @returns Whether the array was truncated.
*/
function trimShuffleLogsSample<T>(array: T[]): boolean {
const willTruncate = array.length > MaxLogsSampleRows;
const numElements = willTruncate ? MaxLogsSampleRows : array.length;
const willTruncate = array.length > FRONTEND_SAMPLE_ROWS;
const numElements = willTruncate ? FRONTEND_SAMPLE_ROWS : array.length;

partialShuffleArray(array, 1, numElements);

Expand Down Expand Up @@ -215,7 +213,6 @@ interface SampleLogsInputProps {
}

export const SampleLogsInput = React.memo<SampleLogsInputProps>(({ integrationSettings }) => {
const { notifications } = useKibana().services;
const { setIntegrationSettings } = useActions();
const [isParsing, setIsParsing] = useState(false);
const [sampleFileError, setSampleFileError] = useState<string>();
Expand Down Expand Up @@ -266,11 +263,7 @@ export const SampleLogsInput = React.memo<SampleLogsInputProps>(({ integrationSe
return;
}

const { samplesFormat, logSamples, isTruncated } = prepareResult;

if (isTruncated) {
notifications?.toasts.addInfo(i18n.LOGS_SAMPLE_TRUNCATED(MaxLogsSampleRows));
}
const { samplesFormat, logSamples } = prepareResult;

setIntegrationSettings({
...integrationSettings,
Expand All @@ -293,7 +286,7 @@ export const SampleLogsInput = React.memo<SampleLogsInputProps>(({ integrationSe

reader.readAsText(logsSampleFile);
},
[integrationSettings, setIntegrationSettings, notifications?.toasts, setIsParsing]
[integrationSettings, setIntegrationSettings, setIsParsing]
);
return (
<EuiFormRow
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,6 @@ export const LOGS_SAMPLE_DESCRIPTION = i18n.translate(
defaultMessage: 'Drag and drop a file or Browse files.',
}
);
export const LOGS_SAMPLE_TRUNCATED = (maxRows: number) =>
i18n.translate('xpack.integrationAssistant.step.dataStream.logsSample.truncatedWarning', {
values: { maxRows },
defaultMessage: `The logs sample has been truncated to {maxRows} rows.`,
});
export const LOGS_SAMPLE_ERROR = {
CAN_NOT_READ: i18n.translate(
'xpack.integrationAssistant.step.dataStream.logsSample.errorCanNotRead',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import { combineProcessors } from '../../util/processors';
import { CATEGORIZATION_EXAMPLE_PROCESSORS } from './constants';
import { CATEGORIZATION_MAIN_PROMPT } from './prompts';
import type { CategorizationNodeParams } from './types';
import { selectResults } from './util';
import { CATEGORIZATION_INITIAL_BATCH_SIZE } from '../../../common/constants';

export async function handleCategorization({
state,
Expand All @@ -19,8 +21,15 @@ export async function handleCategorization({
const categorizationMainPrompt = CATEGORIZATION_MAIN_PROMPT;
const outputParser = new JsonOutputParser();
const categorizationMainGraph = categorizationMainPrompt.pipe(model).pipe(outputParser);

const [pipelineResults, _] = selectResults(
state.pipelineResults,
CATEGORIZATION_INITIAL_BATCH_SIZE,
new Set(state.stableSamples)
);

const currentProcessors = (await categorizationMainGraph.invoke({
pipeline_results: JSON.stringify(state.pipelineResults, null, 2),
pipeline_results: JSON.stringify(pipelineResults, null, 2),
example_processors: CATEGORIZATION_EXAMPLE_PROCESSORS,
ex_answer: state?.exAnswer,
ecs_categories: state?.ecsCategories,
Expand All @@ -36,7 +45,7 @@ export async function handleCategorization({
return {
currentPipeline,
currentProcessors,
hasTriedOnce: true,
lastReviewedSamples: [],
lastExecutedChain: 'categorization',
};
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

export const ECS_CATEGORIES = {
api: 'Covers events from API calls, including those from OS and network protocols. Allowed event.type combinations: access, admin, allowed, change, creation, deletion, denied, end, info, start, user',
authentication:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ export async function handleErrors({
return {
currentPipeline,
currentProcessors,
reviewed: false,
lastExecutedChain: 'error',
};
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import { handleReview } from './review';
import { handleCategorization } from './categorization';
import { handleErrors } from './errors';
import { handleInvalidCategorization } from './invalid';
import { handleUpdateStableSamples } from './stable';
import { testPipeline, combineProcessors } from '../../util';
import {
ActionsClientChatOpenAI,
Expand All @@ -39,6 +40,7 @@ jest.mock('./errors');
jest.mock('./review');
jest.mock('./categorization');
jest.mock('./invalid');
jest.mock('./stable');

jest.mock('../../util/pipeline', () => ({
testPipeline: jest.fn(),
Expand Down Expand Up @@ -74,7 +76,8 @@ describe('runCategorizationGraph', () => {
return {
currentPipeline,
currentProcessors,
reviewed: false,
stableSamples: [],
reviewCount: 0,
finalized: false,
lastExecutedChain: 'categorization',
};
Expand All @@ -90,7 +93,8 @@ describe('runCategorizationGraph', () => {
return {
currentPipeline,
currentProcessors,
reviewed: false,
stableSamples: [],
reviewCount: 0,
finalized: false,
lastExecutedChain: 'error',
};
Expand All @@ -106,7 +110,8 @@ describe('runCategorizationGraph', () => {
return {
currentPipeline,
currentProcessors,
reviewed: false,
stableSamples: [],
reviewCount: 0,
finalized: false,
lastExecutedChain: 'invalidCategorization',
};
Expand All @@ -122,11 +127,29 @@ describe('runCategorizationGraph', () => {
return {
currentProcessors,
currentPipeline,
reviewed: true,
stableSamples: [],
reviewCount: 0,
finalized: false,
lastExecutedChain: 'review',
};
});
// After the review it should route to modelOutput and finish.
(handleUpdateStableSamples as jest.Mock)
.mockResolvedValueOnce({
stableSamples: [],
finalized: false,
lastExecutedChain: 'handleUpdateStableSamples',
})
.mockResolvedValueOnce({
stableSamples: [],
finalized: false,
lastExecutedChain: 'handleUpdateStableSamples',
})
.mockResolvedValueOnce({
stableSamples: [0],
finalized: false,
lastExecutedChain: 'handleUpdateStableSamples',
});
});

it('Ensures that the graph compiles', async () => {
Expand Down
Loading

0 comments on commit fc3ce54

Please sign in to comment.