Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Automatic Import] Reproducible sampling of log entries #191598

Merged
merged 37 commits into from
Sep 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
9bf9e11
More robust error handling when uploading a file
ilyannn Aug 26, 2024
a46dac2
Clear the error and state on any changes to FileReader
ilyannn Aug 26, 2024
40cb9d1
Handle special Chrome behaviour
ilyannn Aug 26, 2024
dff63a0
Display longer message with onabort as well
ilyannn Aug 26, 2024
1711427
Merge branch 'main' into auto-import/better-errors
ilyannn Aug 26, 2024
56f878d
Use a different ID for the new string
ilyannn Aug 27, 2024
c1b77bc
Merge branch 'main' into auto-import/better-errors
elasticmachine Aug 27, 2024
eb0d7ef
Fix code duplication
ilyannn Aug 27, 2024
ed3535b
Merge branch 'auto-import/better-errors' of github.com:ilyannn/kibana…
ilyannn Aug 27, 2024
b49ca07
Extract a named function for handling an error
ilyannn Aug 27, 2024
9718a4d
Add partialShuffleArray
ilyannn Aug 28, 2024
3f2d711
Fix the function and add tests
ilyannn Aug 28, 2024
6fce500
Use a better default seed
ilyannn Aug 28, 2024
8541db9
Apply the partial shuffle
ilyannn Aug 29, 2024
b26ca57
Make tests pass, but how?
ilyannn Aug 29, 2024
a6b245e
Remove console.log statements
ilyannn Aug 29, 2024
2d1d412
Always shuffle samples around
ilyannn Aug 29, 2024
fdc3b5d
Extract partialShuffleArray into a separate file
ilyannn Aug 30, 2024
571eb5a
Merge branch 'main' of github.com:elastic/kibana into auto-import/bet…
ilyannn Aug 31, 2024
ac196e8
Improve how the EMPTY error is handled
ilyannn Aug 31, 2024
97413f3
Merge branch 'main' into auto-import/better-errors
elasticmachine Aug 31, 2024
27f5a55
Merge branch 'auto-import/better-errors' of github.com:ilyannn/kibana…
ilyannn Sep 1, 2024
fcbb8ed
Fix the empty sample issues
ilyannn Sep 1, 2024
2c87f61
Merge branch 'main' into auto-import/sampling
elasticmachine Sep 1, 2024
7253496
Merge branch 'main' into auto-import/sampling
elasticmachine Sep 3, 2024
698c71e
Merge branch 'main' of github.com:elastic/kibana into auto-import/sam…
ilyannn Sep 3, 2024
94dc786
Remove duplicated lines
ilyannn Sep 3, 2024
6a831c4
Remove more duplication
ilyannn Sep 3, 2024
feb7543
Make non-optionaly more clear
ilyannn Sep 3, 2024
e6232f4
Rename a function
ilyannn Sep 3, 2024
27c8b1e
Rename to prepareLogsContent
ilyannn Sep 3, 2024
e354a9f
Improve some docs
ilyannn Sep 3, 2024
5393ccd
Merge branch 'main' into auto-import/sampling
elasticmachine Sep 3, 2024
c045d69
Refactor selectFromLogsSample into trimShuffleLogsSample
ilyannn Sep 4, 2024
2e92a73
Merge branch 'auto-import/sampling' of github.com:ilyannn/kibana into…
ilyannn Sep 4, 2024
5f68ac3
Update the docs
ilyannn Sep 5, 2024
d0118d3
Improve the docs some more
ilyannn Sep 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ describe('SampleLogsInput', () => {

it('should truncate the logs sample', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logSamples: tooLargeLogsSample.split(',').slice(0, 10),
logSamples: tooLargeLogsSample.split(',').slice(0, 2),
ilyannn marked this conversation as resolved.
Show resolved Hide resolved
samplesFormat: { name: 'json', json_path: [] },
});
});
Expand Down Expand Up @@ -245,7 +245,7 @@ describe('SampleLogsInput', () => {

it('should truncate the logs sample', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logSamples: tooLargeLogsSample.split('\n').slice(0, 10),
logSamples: tooLargeLogsSample.split('\n').slice(0, 2),
samplesFormat: { name: 'ndjson', multiline: false },
});
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import type { IntegrationSettings } from '../../types';
import * as i18n from './translations';
import { useActions } from '../../state';
import type { SamplesFormat } from '../../../../../../common';
import { partialShuffleArray } from './utils';

const MaxLogsSampleRows = 10;

Expand Down Expand Up @@ -64,46 +65,92 @@ export const parseJSONArray = (
return { errorNoArrayFound: true, entries: [], pathToEntries: [] };
};

interface ParseLogsErrorResult {
/**
 * Trims and shuffles, in-place, an array of log samples to send to the backend.
 *
 * This is a generic function that can be applied to arrays of any element type.
 *
 * After the call the array:
 * - contains no more than MaxLogsSampleRows elements;
 * - is shuffled using the reproducible shuffle algorithm; and
 * - still has its original first element in the first position.
 *
 * The same amount of work is performed regardless of the array size and no
 * extra memory is allocated.
 *
 * @param array - The array to trim and shuffle (cannot be empty).
 * @template T - The type of elements in the array.
 * @returns Whether the array was truncated.
 */
function trimShuffleLogsSample<T>(array: T[]): boolean {
  // Number of elements we intend to keep.
  const keepCount = Math.min(array.length, MaxLogsSampleRows);

  // Shuffle the kept range; index 0 is excluded so the first element stays put.
  partialShuffleArray(array, 1, keepCount);

  if (keepCount < array.length) {
    // Drop the tail in-place by shrinking the array's length.
    array.length = keepCount;
    return true;
  }

  return false;
}

// The error result structure. Returned when the file content cannot be parsed
// into log samples; `error` holds the message passed to setSampleFileError.
interface PrepareLogsErrorResult {
  error: string;
}

interface ParseLogsSuccessResult {
// The parsed logs sample structure. Returned on successful parsing.
interface PrepareLogsSuccessResult {
  // Format of the samples, if able to be determined; undefined means unknown.
  samplesFormat?: SamplesFormat;
  // The parsed log samples. If samplesFormat is (ND)JSON, these are JSON strings.
  logSamples: string[];
  // Whether the log samples were truncated to MaxLogsSampleRows entries.
  isTruncated: boolean;
}

type ParseLogsResult = ParseLogsErrorResult | ParseLogsSuccessResult;
type PrepareLogsResult = PrepareLogsErrorResult | PrepareLogsSuccessResult;

/**
* Parse the logs sample file content and return the parsed logs sample.
* Prepares the logs sample to send to the backend from the user-provided file.
*
* This function will return an error message if the file content is not valid, that is:
* - it is too large to parse (the memory required is 2-3x of the file size); or
* - it looks like a JSON format, but there is no array; or
* - it looks like (ND)JSON format, but the items are not JSON dictionaries.
* - it looks like (ND)JSON format, but the items are not JSON dictionaries; or
* - the list of entries is empty.
* In other cases it will parse and return the `logSamples` array of strings.
*
* Additionally if the format was (ND)JSON:
* - the `samplesFormat` field will be filled out with the format description; and
* - the samples will be serialized back to JSON strings;
* otherwise:
* - the `samplesFormat` field will be undefined; and
* - the samples will be strings with unknown structure.
*
* Otherwise it is guaranteed to parse and return (possibly empty) `logSamples` array.
* If the file content is (ND)JSON, it will additionally fill out the `samplesFormat`
* field with name 'json' or 'ndjson'; otherwise it will be undefined.
* In all cases it will also:
* - shuffle the parsed logs sample using the reproducible shuffle algorithm;
* - return no more than MaxLogsSampleRows entries.
*
* @param fileContent The content of the user-provided logs sample file.
* @returns The parsed logs sample structure or an error message.
*/
const parseLogsContent = (fileContent: string): ParseLogsResult => {
let parsedContent: unknown[];
let samplesFormat: SamplesFormat;
const prepareLogsContent = (fileContent: string): PrepareLogsResult => {
let parsedJSONContent: unknown[];
let jsonSamplesFormat: SamplesFormat;

try {
parsedContent = parseNDJSON(fileContent);
parsedJSONContent = parseNDJSON(fileContent);

// Special case for files that can be parsed as both JSON and NDJSON:
// for a one-line array [] -> extract its contents (it's a JSON)
// for a one-line object {} -> do nothing (keep as NDJSON)
if (parsedContent.length === 1 && Array.isArray(parsedContent[0])) {
parsedContent = parsedContent[0];
samplesFormat = { name: 'json', json_path: [] };
if (parsedJSONContent.length === 1 && Array.isArray(parsedJSONContent[0])) {
parsedJSONContent = parsedJSONContent[0];
jsonSamplesFormat = { name: 'json', json_path: [] };
} else {
samplesFormat = { name: 'ndjson', multiline: false };
jsonSamplesFormat = { name: 'ndjson', multiline: false };
}
} catch (parseNDJSONError) {
if (parseNDJSONError instanceof RangeError) {
Expand All @@ -114,33 +161,53 @@ const parseLogsContent = (fileContent: string): ParseLogsResult => {
if (errorNoArrayFound) {
return { error: i18n.LOGS_SAMPLE_ERROR.NOT_ARRAY };
}
parsedContent = entries;
samplesFormat = { name: 'json', json_path: pathToEntries };
parsedJSONContent = entries;
jsonSamplesFormat = { name: 'json', json_path: pathToEntries };
} catch (parseJSONError) {
if (parseJSONError instanceof RangeError) {
return { error: i18n.LOGS_SAMPLE_ERROR.TOO_LARGE_TO_PARSE };
}
try {
parsedContent = parseNDJSON(fileContent, true);
samplesFormat = { name: 'ndjson', multiline: true };
parsedJSONContent = parseNDJSON(fileContent, true);
jsonSamplesFormat = { name: 'ndjson', multiline: true };
} catch (parseMultilineNDJSONError) {
if (parseMultilineNDJSONError instanceof RangeError) {
return { error: i18n.LOGS_SAMPLE_ERROR.TOO_LARGE_TO_PARSE };
}
// This is an unknown format, so split into lines and return no samplesFormat.
const fileLines = fileContent.split('\n').filter((line) => line.trim() !== '');
if (fileLines.length === 0) {
return { error: i18n.LOGS_SAMPLE_ERROR.EMPTY };
}

const isTruncated = trimShuffleLogsSample(fileLines);

return {
logSamples: fileContent.split('\n').filter((line) => line.trim() !== ''),
samplesFormat: undefined, // Signifies that the format is unknown.
samplesFormat: undefined,
logSamples: fileLines,
isTruncated,
};
}
}
}

if (parsedContent.some((log) => !isPlainObject(log))) {
// This seems to be an ND(JSON), so perform additional checks and return jsonSamplesFormat.

if (parsedJSONContent.some((log) => !isPlainObject(log))) {
return { error: i18n.LOGS_SAMPLE_ERROR.NOT_OBJECT };
}

const logSamples = parsedContent.map((log) => JSON.stringify(log));
return { logSamples, samplesFormat };
if (parsedJSONContent.length === 0) {
return { error: i18n.LOGS_SAMPLE_ERROR.EMPTY };
}

const isTruncated = trimShuffleLogsSample(parsedJSONContent);

return {
samplesFormat: jsonSamplesFormat,
logSamples: parsedJSONContent.map((line) => JSON.stringify(line)),
isTruncated,
};
};

interface SampleLogsInputProps {
Expand Down Expand Up @@ -192,26 +259,17 @@ export const SampleLogsInput = React.memo<SampleLogsInputProps>(({ integrationSe
return;
}

const result = parseLogsContent(fileContent);
const prepareResult = prepareLogsContent(fileContent);

if ('error' in result) {
setSampleFileError(result.error);
if ('error' in prepareResult) {
setSampleFileError(prepareResult.error);
return;
}

const { logSamples: possiblyLargeLogSamples, samplesFormat } = result;

if (possiblyLargeLogSamples.length === 0) {
setSampleFileError(i18n.LOGS_SAMPLE_ERROR.EMPTY);
return;
}
const { samplesFormat, logSamples, isTruncated } = prepareResult;

let logSamples;
if (possiblyLargeLogSamples.length > MaxLogsSampleRows) {
logSamples = possiblyLargeLogSamples.slice(0, MaxLogsSampleRows);
if (isTruncated) {
notifications?.toasts.addInfo(i18n.LOGS_SAMPLE_TRUNCATED(MaxLogsSampleRows));
} else {
logSamples = possiblyLargeLogSamples;
}

setIntegrationSettings({
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { partialShuffleArray } from './utils';

// Tests for partialShuffleArray, the reproducible in-place partial shuffle used
// to sample log entries. The hard-coded expected arrays pin down the exact
// permutation produced by the default (and a custom) seed, so any change to the
// shuffle algorithm or its default seed will be caught here.
describe('partialShuffleArray', () => {
  const fixture = [1, 2, 3, 4, 5, 6, 7];

  it('should shuffle the array in reproducible manner when shuffling the whole array', () => {
    const arr = fixture.slice();
    partialShuffleArray(arr, 0, 7);
    expect(arr).toEqual([4, 5, 1, 3, 7, 6, 2]);
  });

  it('can sometimes keep the array the same by sheer coincidence', () => {
    const arr = [1, 2, 3, 4, 5];
    partialShuffleArray(arr, 1, 5, '1337');
    expect(arr).toEqual([1, 2, 3, 4, 5]);
  });

  it('should mostly return a different array', () => {
    const original = [1, 2, 3, 4, 5];
    let countSameArray = 0;
    let countDifferentArray = 0;

    // Try every valid (start, end) range and tally how often the shuffle
    // leaves the array unchanged vs. actually permutes it.
    for (let start = 0; start < original.length; start++) {
      for (let end = start + 1; end <= original.length; end++) {
        const arr = original.slice();
        partialShuffleArray(arr, start, end);
        countSameArray += arr.every((v, i) => v === original[i]) ? 1 : 0;
        countDifferentArray += arr.some((v, i) => v !== original[i]) ? 1 : 0;
      }
    }
    // Both outcomes occur, but a real shuffle should differ most of the time.
    expect(countSameArray).toBeTruthy();
    expect(countSameArray).toBeLessThan(countDifferentArray);
  });

  it('should shuffle the array in reproducible manner when providing a non-default seed', () => {
    const arr = fixture.slice();
    partialShuffleArray(arr, 0, 7, '1337');
    expect(arr).toEqual([3, 5, 1, 7, 2, 6, 4]);
  });

  it('should partially shuffle the array in reproducible manner when shuffling a subarray', () => {
    const arr = fixture.slice();
    partialShuffleArray(arr, 2, 5);
    // Elements outside the shuffled range must remain untouched.
    expect(arr).toEqual([1, 2, 7, 5, 3, 6, 4]);
  });

  it('should do nothing if start is at the end of the array', () => {
    const arr = fixture.slice();
    partialShuffleArray(arr, arr.length, arr.length);
    expect(arr).toEqual(fixture);
  });

  it('should do nothing if start is the same as end', () => {
    const arr = fixture.slice();
    // Use an integer midpoint: the previous `size / 2` evaluated to 3.5 for
    // the 7-element fixture, passing a fractional index to partialShuffleArray.
    const middle = Math.floor(arr.length / 2);
    partialShuffleArray(arr, middle, middle);
    expect(arr).toEqual(fixture);
  });

  it('should throw an error for invalid start index', () => {
    const arr = fixture.slice();
    expect(() => partialShuffleArray(arr, arr.length + 1, 4)).toThrow('Invalid start index');
    expect(() => partialShuffleArray(arr, -1, 4)).toThrow('Invalid start index');
  });

  it('should throw an error for invalid end index', () => {
    const arr = fixture.slice();
    expect(() => partialShuffleArray(arr, 1, 0)).toThrow('Invalid end index');
    expect(() => partialShuffleArray(arr, 1, arr.length + 1)).toThrow('Invalid end index');
  });

  it('should handle large arrays', () => {
    const size = 100000;
    const original = Array.from({ length: size }, (_, i) => i);
    const arr = original.slice();

    partialShuffleArray(arr, 2, 200);

    // The shuffle must not change the length, the untouched prefix, or the
    // multiset of elements — only their order within the shuffled range.
    expect(arr).toHaveLength(size);
    expect(arr[0]).toEqual(0);
    expect(arr[1]).toEqual(1);
    expect(arr === original).toBe(false);
    expect(new Set(arr)).toEqual(new Set(original));
  });
});
Loading