From 8d2ed1e5b6fc352c4720b3d8cb2445186fb48194 Mon Sep 17 00:00:00 2001 From: Aditi Khare Date: Wed, 24 Apr 2024 15:49:16 -0400 Subject: [PATCH 1/8] fix(NODE-6123): toUtf8 validation insufficiently strict --- .../require_vendor.mjs | 2 +- src/parser/deserializer.ts | 13 +-- src/utils/node_byte_utils.ts | 2 +- src/utils/web_byte_utils.ts | 10 +- src/validate_utf8.ts | 66 ++++++----- test/node/byte_utils.test.ts | 103 +++++++++++++++++- 6 files changed, 139 insertions(+), 57 deletions(-) diff --git a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs index 7d4fa4e91..659afe8d9 100644 --- a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs +++ b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs @@ -14,7 +14,7 @@ export class RequireVendor { * @returns {{ code: string; map: import('magic-string').SourceMap }} */ transform(code, id) { - if (!id.includes('web_byte_utils')) { + if (!id.includes('validate_utf8')) { return; } diff --git a/src/parser/deserializer.ts b/src/parser/deserializer.ts index ac2781903..a01a167d0 100644 --- a/src/parser/deserializer.ts +++ b/src/parser/deserializer.ts @@ -16,7 +16,6 @@ import { BSONSymbol } from '../symbol'; import { Timestamp } from '../timestamp'; import { ByteUtils } from '../utils/byte_utils'; import { NumberUtils } from '../utils/number_utils'; -import { validateUtf8 } from '../validate_utf8'; /** @public */ export interface DeserializeOptions { @@ -604,12 +603,12 @@ function deserializeObject( ) throw new BSONError('bad string length in bson'); // Namespace - if (validation != null && validation.utf8) { - if (!validateUtf8(buffer, index, index + stringSize - 1)) { - throw new BSONError('Invalid UTF-8 string in BSON document'); - } - } - const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false); + const namespace = ByteUtils.toUTF8( + buffer, + index, + index + stringSize - 1, + validation != null && (validation.utf8 as 
boolean) + ); // Update parse index position index = index + stringSize; diff --git a/src/utils/node_byte_utils.ts b/src/utils/node_byte_utils.ts index d6a641a47..d9487046a 100644 --- a/src/utils/node_byte_utils.ts +++ b/src/utils/node_byte_utils.ts @@ -139,7 +139,7 @@ export const nodeJsByteUtils = { // TODO(NODE-4930): Insufficiently strict BSON UTF8 validation for (let i = 0; i < string.length; i++) { if (string.charCodeAt(i) === 0xfffd) { - if (!validateUtf8(buffer, start, end)) { + if (!validateUtf8(buffer, start, end, fatal)) { throw new BSONError('Invalid UTF-8 string in BSON document'); } break; diff --git a/src/utils/web_byte_utils.ts b/src/utils/web_byte_utils.ts index 77a1f0f74..f2d1b09b1 100644 --- a/src/utils/web_byte_utils.ts +++ b/src/utils/web_byte_utils.ts @@ -1,5 +1,6 @@ import { BSONError } from '../error'; import { tryReadBasicLatin } from './latin'; +import { validateUtf8 } from '../validate_utf8'; type TextDecoder = { readonly encoding: string; @@ -179,14 +180,7 @@ export const webByteUtils = { return basicLatin; } - if (fatal) { - try { - return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end)); - } catch (cause) { - throw new BSONError('Invalid UTF-8 string in BSON document', { cause }); - } - } - return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end)); + return validateUtf8(uint8array, start, end, fatal); }, utf8ByteLength(input: string): number { diff --git a/src/validate_utf8.ts b/src/validate_utf8.ts index e1da934c6..1d2a81565 100644 --- a/src/validate_utf8.ts +++ b/src/validate_utf8.ts @@ -1,13 +1,26 @@ -const FIRST_BIT = 0x80; -const FIRST_TWO_BITS = 0xc0; -const FIRST_THREE_BITS = 0xe0; -const FIRST_FOUR_BITS = 0xf0; -const FIRST_FIVE_BITS = 0xf8; +import { BSONError } from './error'; -const TWO_BIT_CHAR = 0xc0; -const THREE_BIT_CHAR = 0xe0; -const FOUR_BIT_CHAR = 0xf0; -const CONTINUING_CHAR = 0x80; +type TextDecoder = { + readonly encoding: string; + readonly fatal: boolean; + readonly 
ignoreBOM: boolean; + decode(input?: Uint8Array): string; +}; +type TextDecoderConstructor = { + new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder; +}; + +type TextEncoder = { + readonly encoding: string; + encode(input?: string): Uint8Array; +}; +type TextEncoderConstructor = { + new (): TextEncoder; +}; + +// Node byte utils global +declare const TextDecoder: TextDecoderConstructor; +declare const TextEncoder: TextEncoderConstructor; /** * Determines if the passed in bytes are valid utf8 @@ -16,32 +29,17 @@ const CONTINUING_CHAR = 0x80; * @param end - The index to end validating */ export function validateUtf8( - bytes: { [index: number]: number }, + buffer: Uint8Array, start: number, - end: number -): boolean { - let continuation = 0; - - for (let i = start; i < end; i += 1) { - const byte = bytes[i]; - - if (continuation) { - if ((byte & FIRST_TWO_BITS) !== CONTINUING_CHAR) { - return false; - } - continuation -= 1; - } else if (byte & FIRST_BIT) { - if ((byte & FIRST_THREE_BITS) === TWO_BIT_CHAR) { - continuation = 1; - } else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR) { - continuation = 2; - } else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR) { - continuation = 3; - } else { - return false; - } + end: number, + fatal: boolean +): string { + if (fatal) { + try { + return new TextDecoder('utf8', { fatal }).decode(buffer.slice(start, end)); + } catch (cause) { + throw new BSONError('Invalid UTF-8 string in BSON document', { cause }); } } - - return !continuation; + return new TextDecoder('utf8', { fatal }).decode(buffer.slice(start, end)); } diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts index fa6d7f893..a41b19d3a 100644 --- a/test/node/byte_utils.test.ts +++ b/test/node/byte_utils.test.ts @@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils'; import * as sinon from 'sinon'; import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson'; 
import * as crypto from 'node:crypto'; +import { BSONError } from '../../src/error'; type ByteUtilTest = { name: string; @@ -399,6 +400,7 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [ } } ]; + const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ { name: 'should create utf8 string from buffer input', @@ -417,21 +419,57 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ } }, { - name: 'should throw an error if fatal is set and string is invalid', + name: 'should insert replacement character fatal is false and string is invalid', + inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false], + expectation({ error, output }) { + expect(error).to.not.exist; + expect(output).to.equal('abc\uFFFD'); + } + }, + { + name: 'should throw an error if fatal is set and string is a sequence that decodes to an invalid code point', inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true], expectation({ error }) { expect(error).to.match(/Invalid UTF-8 string in BSON document/i); } }, { - name: 'should insert replacement character fatal is false and string is invalid', - inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false], - expectation({ error, output }) { - expect(error).to.not.exist; - expect(output).to.equal('abc\uFFFD'); + name: 'throw an error if fatal is set and string contains overlong encoding', + inputs: [Buffer.from('11000000025f0005000000f08282ac0000', 'hex'), 0, 18, true], + expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { + name: 'throw an error if fatal is set and string contains invalid bytes', + inputs: [Buffer.from('abcff', 'hex'), 0, 2, true], + expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { + name: 'throw an error if fatal is set and string contains an unexpected continuation byte', + inputs: [Buffer.from('7F80', 'hex'), 0, 2, true], + expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + 
{ + name: 'throw an error if fatal is set and string contains a non-continuation byte before the end of the character', + inputs: [Buffer.from('c000', 'hex'), 0, 2, true], + expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { + name: 'throw an error if fatal is set and string ends before the end of the character', + inputs: [Buffer.from('c0', 'hex'), 0, 1, true], + expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); } } ]; + const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [ { name: 'should return zero for empty string', @@ -493,6 +531,51 @@ const randomBytesTests: ByteUtilTest<'randomBytes'>[] = [ } ]; +// extra error cases copied from Web platform specs +const toUTF8ErrorCaseTests = [ + { input: [0xff], name: 'invalid code' }, + { input: [0xc0], name: 'ends early' }, + { input: [0xe0], name: 'ends early 2' }, + { input: [0xc0, 0x00], name: 'invalid trail' }, + { input: [0xc0, 0xc0], name: 'invalid trail 2' }, + { input: [0xe0, 0x00], name: 'invalid trail 3' }, + { input: [0xe0, 0xc0], name: 'invalid trail 4' }, + { input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' }, + { input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' }, + { input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' }, + { input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' }, + + // Overlong encodings + { input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' }, + { input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' }, + { input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' }, + { input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' }, + { input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 6 bytes' }, + + { input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' }, + { input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' }, + { input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 
bytes' }, + { input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' }, + { input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 6 bytes' }, + + { input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' }, + { input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' }, + { input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' }, + { input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 6 bytes' }, + + { input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' }, + { input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' }, + { input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 6 bytes' }, + + { input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' }, + { input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 6 bytes' }, + + // UTf-16 surrogates encoded as code points in UTf-8 + { input: [0xed, 0xa0, 0x80], name: 'lead surrogate' }, + { input: [0xed, 0xb0, 0x80], name: 'trail surrogate' }, + { input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' } +]; + const utils = new Map([ ['nodeJsByteUtils', nodeJsByteUtils], ['webByteUtils', webByteUtils] @@ -798,6 +881,14 @@ describe('ByteUtils', () => { test.expectation({ web: byteUtilsName === 'webByteUtils', output, error }); }); } + if (utility === 'toUTF8') + for (const test of toUTF8ErrorCaseTests) { + it(`throws error when fatal is set and provided ${test.name} as input`, () => { + expect(() => + byteUtils[utility](Uint8Array.from(test.input), 0, test.input.length, true) + ).to.throw(BSONError, /Invalid UTF-8 string in BSON document/i); + }); + } }); } } From 9d3033af3b739d6f720bd5f3ef09e6ec6840beef Mon Sep 17 00:00:00 2001 From: Aditi Khare Date: Wed, 24 Apr 2024 16:55:56 -0400 Subject: [PATCH 2/8] store decoder instances, so not created upon each call --- src/validate_utf8.ts | 9 ++++++--- 1 file changed, 6 insertions(+), 3 
deletions(-) diff --git a/src/validate_utf8.ts b/src/validate_utf8.ts index 1d2a81565..42782e75c 100644 --- a/src/validate_utf8.ts +++ b/src/validate_utf8.ts @@ -18,10 +18,13 @@ type TextEncoderConstructor = { new (): TextEncoder; }; -// Node byte utils global +// validate utf8 globals declare const TextDecoder: TextDecoderConstructor; declare const TextEncoder: TextEncoderConstructor; +const TextDecoderFatal: TextDecoder = new TextDecoder('utf8', { fatal: true }); +const TextDecoderNonFatal: TextDecoder = new TextDecoder('utf8', { fatal: false }); + /** * Determines if the passed in bytes are valid utf8 * @param bytes - An array of 8-bit bytes. Must be indexable and have length property @@ -36,10 +39,10 @@ export function validateUtf8( ): string { if (fatal) { try { - return new TextDecoder('utf8', { fatal }).decode(buffer.slice(start, end)); + return TextDecoderFatal.decode(buffer.slice(start, end)); } catch (cause) { throw new BSONError('Invalid UTF-8 string in BSON document', { cause }); } } - return new TextDecoder('utf8', { fatal }).decode(buffer.slice(start, end)); + return TextDecoderNonFatal.decode(buffer.slice(start, end)); } From 8a07891c057a600d8fe3c0c2cb1dbbb5f5ed182d Mon Sep 17 00:00:00 2001 From: Aditi Khare Date: Thu, 25 Apr 2024 11:08:18 -0400 Subject: [PATCH 3/8] lazy load text decoder --- src/validate_utf8.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/validate_utf8.ts b/src/validate_utf8.ts index 42782e75c..dc9dfe353 100644 --- a/src/validate_utf8.ts +++ b/src/validate_utf8.ts @@ -22,8 +22,8 @@ type TextEncoderConstructor = { declare const TextDecoder: TextDecoderConstructor; declare const TextEncoder: TextEncoderConstructor; -const TextDecoderFatal: TextDecoder = new TextDecoder('utf8', { fatal: true }); -const TextDecoderNonFatal: TextDecoder = new TextDecoder('utf8', { fatal: false }); +let TextDecoderFatal: TextDecoder; +let TextDecoderNonFatal: TextDecoder; /** * Determines if the passed in bytes are valid 
utf8 @@ -39,10 +39,12 @@ export function validateUtf8( ): string { if (fatal) { try { + TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true }); return TextDecoderFatal.decode(buffer.slice(start, end)); } catch (cause) { throw new BSONError('Invalid UTF-8 string in BSON document', { cause }); } } + TextDecoderNonFatal ??= new TextDecoder('utf8', { fatal: false }); return TextDecoderNonFatal.decode(buffer.slice(start, end)); } From d6b87141f301131c6cfa9c18f8ef385290c423f2 Mon Sep 17 00:00:00 2001 From: Aditi Khare Date: Thu, 25 Apr 2024 16:41:13 -0400 Subject: [PATCH 4/8] fix require polyfill logic --- .../require_vendor.mjs | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs index 659afe8d9..abff43b84 100644 --- a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs +++ b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs @@ -1,9 +1,12 @@ import MagicString from 'magic-string'; -const REQUIRE_POLYFILLS = - `const { TextEncoder, TextDecoder } = require('../vendor/text-encoding'); +const REQUIRE_WEB_UTILS_POLYFILLS = + `const { TextEncoder } = require('../vendor/text-encoding'); const { encode: btoa, decode: atob } = require('../vendor/base64');\n` +const REQUIRE_VALIDATE_UTF8_POLYFILLS = + `const { TextEncoder } = require('../vendor/text-encoding');`; + export class RequireVendor { /** * Take the compiled source code input; types are expected to already have been removed. 
@@ -14,17 +17,24 @@ export class RequireVendor { * @returns {{ code: string; map: import('magic-string').SourceMap }} */ transform(code, id) { - if (!id.includes('validate_utf8')) { - return; - } + if (id.includes('validate_utf8')) { + // MagicString lets us edit the source code and still generate an accurate source map + const magicString = new MagicString(code); + magicString.prepend(REQUIRE_VALIDATE_UTF8_POLYFILLS); - // MagicString lets us edit the source code and still generate an accurate source map - const magicString = new MagicString(code); - magicString.prepend(REQUIRE_POLYFILLS); + return { + code: magicString.toString(), + map: magicString.generateMap({ hires: true }) + }; + } else if (id.includes('web_byte_utils')) { + // MagicString lets us edit the source code and still generate an accurate source map + const magicString = new MagicString(code); + magicString.prepend(REQUIRE_WEB_UTILS_POLYFILLS); - return { - code: magicString.toString(), - map: magicString.generateMap({ hires: true }) - }; + return { + code: magicString.toString(), + map: magicString.generateMap({ hires: true }) + }; + } } } From 75b9485663f6424edec017f99a90711eece4ffb8 Mon Sep 17 00:00:00 2001 From: Aditi Khare Date: Thu, 25 Apr 2024 16:41:57 -0400 Subject: [PATCH 5/8] fix require polyfill logic 2 --- etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs index abff43b84..9a6d3930b 100644 --- a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs +++ b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs @@ -5,7 +5,7 @@ const REQUIRE_WEB_UTILS_POLYFILLS = const { encode: btoa, decode: atob } = require('../vendor/base64');\n` const REQUIRE_VALIDATE_UTF8_POLYFILLS = - `const { TextEncoder } = require('../vendor/text-encoding');`; + `const { TextDecoder } = 
require('../vendor/text-encoding');`; export class RequireVendor { /** From 92c7770ecadb0b629e2c179be8eabe43fd739ea5 Mon Sep 17 00:00:00 2001 From: Aditi Khare Date: Thu, 25 Apr 2024 18:39:55 -0400 Subject: [PATCH 6/8] requested changes --- .../require_vendor.mjs | 8 +- src/{validate_utf8.ts => parse_utf8.ts} | 23 +--- src/parser/deserializer.ts | 7 +- src/utils/node_byte_utils.ts | 7 +- src/utils/web_byte_utils.ts | 4 +- test/node/byte_utils.test.ts | 101 +++++++++++------- test/node/release.test.ts | 2 +- 7 files changed, 75 insertions(+), 77 deletions(-) rename src/{validate_utf8.ts => parse_utf8.ts} (66%) diff --git a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs index 9a6d3930b..4819023dd 100644 --- a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs +++ b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs @@ -4,8 +4,8 @@ const REQUIRE_WEB_UTILS_POLYFILLS = `const { TextEncoder } = require('../vendor/text-encoding'); const { encode: btoa, decode: atob } = require('../vendor/base64');\n` -const REQUIRE_VALIDATE_UTF8_POLYFILLS = - `const { TextDecoder } = require('../vendor/text-encoding');`; +const REQUIRE_PARSE_UTF8_POLYFILLS = + `const { TextDecoder } = require('../vendor/text-encoding');\n`; export class RequireVendor { /** @@ -17,10 +17,10 @@ export class RequireVendor { * @returns {{ code: string; map: import('magic-string').SourceMap }} */ transform(code, id) { - if (id.includes('validate_utf8')) { + if (id.includes('parse_utf8')) { // MagicString lets us edit the source code and still generate an accurate source map const magicString = new MagicString(code); - magicString.prepend(REQUIRE_VALIDATE_UTF8_POLYFILLS); + magicString.prepend(REQUIRE_PARSE_UTF8_POLYFILLS); return { code: magicString.toString(), diff --git a/src/validate_utf8.ts b/src/parse_utf8.ts similarity index 66% rename from src/validate_utf8.ts rename to src/parse_utf8.ts index dc9dfe353..0e12793bf 
100644 --- a/src/validate_utf8.ts +++ b/src/parse_utf8.ts @@ -10,18 +10,8 @@ type TextDecoderConstructor = { new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder; }; -type TextEncoder = { - readonly encoding: string; - encode(input?: string): Uint8Array; -}; -type TextEncoderConstructor = { - new (): TextEncoder; -}; - -// validate utf8 globals +// parse utf8 globals declare const TextDecoder: TextDecoderConstructor; -declare const TextEncoder: TextEncoderConstructor; - let TextDecoderFatal: TextDecoder; let TextDecoderNonFatal: TextDecoder; @@ -31,20 +21,15 @@ let TextDecoderNonFatal: TextDecoder; * @param start - The index to start validating * @param end - The index to end validating */ -export function validateUtf8( - buffer: Uint8Array, - start: number, - end: number, - fatal: boolean -): string { +export function parseUtf8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string { if (fatal) { try { TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true }); - return TextDecoderFatal.decode(buffer.slice(start, end)); + return TextDecoderFatal.decode(buffer.subarray(start, end)); } catch (cause) { throw new BSONError('Invalid UTF-8 string in BSON document', { cause }); } } TextDecoderNonFatal ??= new TextDecoder('utf8', { fatal: false }); - return TextDecoderNonFatal.decode(buffer.slice(start, end)); + return TextDecoderNonFatal.decode(buffer.subarray(start, end)); } diff --git a/src/parser/deserializer.ts b/src/parser/deserializer.ts index a01a167d0..165a529cf 100644 --- a/src/parser/deserializer.ts +++ b/src/parser/deserializer.ts @@ -603,12 +603,7 @@ function deserializeObject( ) throw new BSONError('bad string length in bson'); // Namespace - const namespace = ByteUtils.toUTF8( - buffer, - index, - index + stringSize - 1, - validation != null && (validation.utf8 as boolean) - ); + const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey); // Update parse index position index 
= index + stringSize; diff --git a/src/utils/node_byte_utils.ts b/src/utils/node_byte_utils.ts index d9487046a..ca1482ca0 100644 --- a/src/utils/node_byte_utils.ts +++ b/src/utils/node_byte_utils.ts @@ -1,5 +1,5 @@ import { BSONError } from '../error'; -import { validateUtf8 } from '../validate_utf8'; +import { parseUtf8 } from '../parse_utf8'; import { tryReadBasicLatin, tryWriteBasicLatin } from './latin'; type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary'; @@ -136,12 +136,9 @@ export const nodeJsByteUtils = { const string = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end); if (fatal) { - // TODO(NODE-4930): Insufficiently strict BSON UTF8 validation for (let i = 0; i < string.length; i++) { if (string.charCodeAt(i) === 0xfffd) { - if (!validateUtf8(buffer, start, end, fatal)) { - throw new BSONError('Invalid UTF-8 string in BSON document'); - } + parseUtf8(buffer, start, end, true); break; } } diff --git a/src/utils/web_byte_utils.ts b/src/utils/web_byte_utils.ts index f2d1b09b1..0f79f0df3 100644 --- a/src/utils/web_byte_utils.ts +++ b/src/utils/web_byte_utils.ts @@ -1,6 +1,6 @@ import { BSONError } from '../error'; import { tryReadBasicLatin } from './latin'; -import { validateUtf8 } from '../validate_utf8'; +import { parseUtf8 } from '../parse_utf8'; type TextDecoder = { readonly encoding: string; @@ -180,7 +180,7 @@ export const webByteUtils = { return basicLatin; } - return validateUtf8(uint8array, start, end, fatal); + return parseUtf8(uint8array, start, end, fatal); }, utf8ByteLength(input: string): number { diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts index a41b19d3a..7141cfd93 100644 --- a/test/node/byte_utils.test.ts +++ b/test/node/byte_utils.test.ts @@ -531,49 +531,70 @@ const randomBytesTests: ByteUtilTest<'randomBytes'>[] = [ } ]; -// extra error cases copied from Web platform specs -const toUTF8ErrorCaseTests = [ - { input: [0xff], name: 'invalid code' }, - { input: [0xc0], name: 'ends 
early' }, - { input: [0xe0], name: 'ends early 2' }, - { input: [0xc0, 0x00], name: 'invalid trail' }, - { input: [0xc0, 0xc0], name: 'invalid trail 2' }, - { input: [0xe0, 0x00], name: 'invalid trail 3' }, - { input: [0xe0, 0xc0], name: 'invalid trail 4' }, - { input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' }, - { input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' }, - { input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' }, - { input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' }, +// extra error cases copied from wpt/encoding/textdecoder-fatal.any.js +// commit sha: 7c9f867 +const toUTF8WebPlatformSpecTests = [ + { encoding: 'utf-8', input: [0xff], name: 'invalid code' }, + { encoding: 'utf-8', input: [0xc0], name: 'ends early' }, + { encoding: 'utf-8', input: [0xe0], name: 'ends early 2' }, + { encoding: 'utf-8', input: [0xc0, 0x00], name: 'invalid trail' }, + { encoding: 'utf-8', input: [0xc0, 0xc0], name: 'invalid trail 2' }, + { encoding: 'utf-8', input: [0xe0, 0x00], name: 'invalid trail 3' }, + { encoding: 'utf-8', input: [0xe0, 0xc0], name: 'invalid trail 4' }, + { encoding: 'utf-8', input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' }, + { encoding: 'utf-8', input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' }, + { encoding: 'utf-8', input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' }, + { encoding: 'utf-8', input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' }, // Overlong encodings - { input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' }, - { input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' }, - { input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' }, - { input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' }, - { input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 6 bytes' }, - - { input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' }, - { input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' }, - { 
input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' }, - { input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' }, - { input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 6 bytes' }, - - { input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' }, - { input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' }, - { input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' }, - { input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 6 bytes' }, - - { input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' }, - { input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' }, - { input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 6 bytes' }, - - { input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' }, - { input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 6 bytes' }, + { encoding: 'utf-8', input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' }, + { encoding: 'utf-8', input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' }, + { encoding: 'utf-8', input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' }, + { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], + name: 'overlong U+0000 - 6 bytes' + }, + + { encoding: 'utf-8', input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' }, + { encoding: 'utf-8', input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' }, + { encoding: 'utf-8', input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' }, + { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf], + name: 'overlong U+007f - 6 bytes' + }, + + { encoding: 'utf-8', input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' }, + 
{ encoding: 'utf-8', input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' }, + { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf], + name: 'overlong U+07ff - 6 bytes' + }, + + { encoding: 'utf-8', input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' }, + { encoding: 'utf-8', input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf], + name: 'overlong U+ffff - 6 bytes' + }, + + { encoding: 'utf-8', input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf], + name: 'overlong U+10ffff - 6 bytes' + }, // UTf-16 surrogates encoded as code points in UTf-8 - { input: [0xed, 0xa0, 0x80], name: 'lead surrogate' }, - { input: [0xed, 0xb0, 0x80], name: 'trail surrogate' }, - { input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' } + { encoding: 'utf-8', input: [0xed, 0xa0, 0x80], name: 'lead surrogate' }, + { encoding: 'utf-8', input: [0xed, 0xb0, 0x80], name: 'trail surrogate' }, + { encoding: 'utf-8', input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' } ]; const utils = new Map([ @@ -882,7 +903,7 @@ describe('ByteUtils', () => { }); } if (utility === 'toUTF8') - for (const test of toUTF8ErrorCaseTests) { + for (const test of toUTF8WebPlatformSpecTests) { it(`throws error when fatal is set and provided ${test.name} as input`, () => { expect(() => byteUtils[utility](Uint8Array.from(test.input), 0, test.input.length, true) diff --git a/test/node/release.test.ts b/test/node/release.test.ts index da69230df..756305b38 100644 --- a/test/node/release.test.ts +++ b/test/node/release.test.ts @@ -50,7 +50,7 @@ const REQUIRED_FILES = [ 'src/utils/number_utils.ts', 'src/utils/web_byte_utils.ts', 'src/utils/latin.ts', - 
'src/validate_utf8.ts', + 'src/parse_utf8.ts', 'vendor/base64/base64.js', 'vendor/base64/package.json', 'vendor/base64/LICENSE-MIT.txt', From 4cb2c9b2a775ea868c0c9078668951cfe6cc9db6 Mon Sep 17 00:00:00 2001 From: Aditi Khare Date: Mon, 29 Apr 2024 16:52:55 -0400 Subject: [PATCH 7/8] part of requested changes --- src/parse_utf8.ts | 2 +- test/node/byte_utils.test.ts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/parse_utf8.ts b/src/parse_utf8.ts index 0e12793bf..045a9080b 100644 --- a/src/parse_utf8.ts +++ b/src/parse_utf8.ts @@ -23,8 +23,8 @@ let TextDecoderNonFatal: TextDecoder; */ export function parseUtf8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string { if (fatal) { + TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true }); try { - TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true }); return TextDecoderFatal.decode(buffer.subarray(start, end)); } catch (cause) { throw new BSONError('Invalid UTF-8 string in BSON document', { cause }); diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts index 7141cfd93..aff6b9a6d 100644 --- a/test/node/byte_utils.test.ts +++ b/test/node/byte_utils.test.ts @@ -533,6 +533,7 @@ const randomBytesTests: ByteUtilTest<'randomBytes'>[] = [ // extra error cases copied from wpt/encoding/textdecoder-fatal.any.js // commit sha: 7c9f867 +// link: https://github.com/web-platform-tests/wpt/commit/7c9f8674d9809731e8919073d957d6233f6e0544 const toUTF8WebPlatformSpecTests = [ { encoding: 'utf-8', input: [0xff], name: 'invalid code' }, { encoding: 'utf-8', input: [0xc0], name: 'ends early' }, From 521dd768891296e7cc001b60d9e766d2771068c6 Mon Sep 17 00:00:00 2001 From: Aditi Khare Date: Tue, 30 Apr 2024 14:36:18 -0400 Subject: [PATCH 8/8] updated tests and added to deserialize --- test/node/byte_utils.test.ts | 125 ++----------------------- test/node/data/utf8_wpt_error_cases.ts | 67 +++++++++++++ test/node/parser/deserializer.test.ts | 46 ++++++++- 3 files changed, 
122 insertions(+), 116 deletions(-) create mode 100644 test/node/data/utf8_wpt_error_cases.ts diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts index aff6b9a6d..67a4721fe 100644 --- a/test/node/byte_utils.test.ts +++ b/test/node/byte_utils.test.ts @@ -8,7 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils'; import * as sinon from 'sinon'; import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson'; import * as crypto from 'node:crypto'; -import { BSONError } from '../../src/error'; +import { utf8WebPlatformSpecTests } from './data/utf8_wpt_error_cases'; type ByteUtilTest = { name: string; @@ -426,48 +426,18 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ expect(output).to.equal('abc\uFFFD'); } }, - { - name: 'should throw an error if fatal is set and string is a sequence that decodes to an invalid code point', - inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true], - expectation({ error }) { - expect(error).to.match(/Invalid UTF-8 string in BSON document/i); - } - }, - { - name: 'throw an error if fatal is set and string contains overlong encoding', - inputs: [Buffer.from('11000000025f0005000000f08282ac0000', 'hex'), 0, 18, true], - expectation({ error }) { - expect(error).to.match(/Invalid UTF-8 string in BSON document/i); - } - }, - { - name: 'throw an error if fatal is set and string contains invalid bytes', - inputs: [Buffer.from('abcff', 'hex'), 0, 2, true], - expectation({ error }) { - expect(error).to.match(/Invalid UTF-8 string in BSON document/i); - } - }, - { - name: 'throw an error if fatal is set and string contains an unexpected continuation byte', - inputs: [Buffer.from('7F80', 'hex'), 0, 2, true], - expectation({ error }) { - expect(error).to.match(/Invalid UTF-8 string in BSON document/i); - } - }, - { - name: 'throw an error if fatal is set and string contains a non-continuation byte before the end of the character', - inputs: [Buffer.from('c000', 'hex'), 0, 2, 
true], - expectation({ error }) { - expect(error).to.match(/Invalid UTF-8 string in BSON document/i); - } - }, - { - name: 'throw an error if fatal is set and string ends before the end of the character', - inputs: [Buffer.from('c0', 'hex'), 0, 1, true], + ...utf8WebPlatformSpecTests.map(t => ({ + name: t.name, + inputs: [Uint8Array.from(t.input), 0, t.input.length, true] as [ + buffer: Uint8Array, + start: number, + end: number, + fatal: boolean + ], expectation({ error }) { expect(error).to.match(/Invalid UTF-8 string in BSON document/i); } - } + })) ]; const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [ @@ -531,73 +501,6 @@ const randomBytesTests: ByteUtilTest<'randomBytes'>[] = [ } ]; -// extra error cases copied from wpt/encoding/textdecoder-fatal.any.js -// commit sha: 7c9f867 -// link: https://github.com/web-platform-tests/wpt/commit/7c9f8674d9809731e8919073d957d6233f6e0544 -const toUTF8WebPlatformSpecTests = [ - { encoding: 'utf-8', input: [0xff], name: 'invalid code' }, - { encoding: 'utf-8', input: [0xc0], name: 'ends early' }, - { encoding: 'utf-8', input: [0xe0], name: 'ends early 2' }, - { encoding: 'utf-8', input: [0xc0, 0x00], name: 'invalid trail' }, - { encoding: 'utf-8', input: [0xc0, 0xc0], name: 'invalid trail 2' }, - { encoding: 'utf-8', input: [0xe0, 0x00], name: 'invalid trail 3' }, - { encoding: 'utf-8', input: [0xe0, 0xc0], name: 'invalid trail 4' }, - { encoding: 'utf-8', input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' }, - { encoding: 'utf-8', input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' }, - { encoding: 'utf-8', input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' }, - { encoding: 'utf-8', input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' }, - - // Overlong encodings - { encoding: 'utf-8', input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' }, - { encoding: 'utf-8', input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' }, - { encoding: 'utf-8', input: [0xf0, 0x80, 0x80, 
0x80], name: 'overlong U+0000 - 4 bytes' }, - { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' }, - { - encoding: 'utf-8', - input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], - name: 'overlong U+0000 - 6 bytes' - }, - - { encoding: 'utf-8', input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' }, - { encoding: 'utf-8', input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' }, - { encoding: 'utf-8', input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' }, - { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' }, - { - encoding: 'utf-8', - input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf], - name: 'overlong U+007f - 6 bytes' - }, - - { encoding: 'utf-8', input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' }, - { encoding: 'utf-8', input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' }, - { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' }, - { - encoding: 'utf-8', - input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf], - name: 'overlong U+07ff - 6 bytes' - }, - - { encoding: 'utf-8', input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' }, - { encoding: 'utf-8', input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' }, - { - encoding: 'utf-8', - input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf], - name: 'overlong U+ffff - 6 bytes' - }, - - { encoding: 'utf-8', input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' }, - { - encoding: 'utf-8', - input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf], - name: 'overlong U+10ffff - 6 bytes' - }, - - // UTf-16 surrogates encoded as code points in UTf-8 - { encoding: 'utf-8', input: [0xed, 0xa0, 0x80], name: 'lead surrogate' }, - { encoding: 'utf-8', input: [0xed, 0xb0, 0x80], name: 'trail surrogate' }, - { encoding: 'utf-8', input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' } -]; - const utils = new Map([ 
['nodeJsByteUtils', nodeJsByteUtils], ['webByteUtils', webByteUtils] @@ -903,14 +806,6 @@ describe('ByteUtils', () => { test.expectation({ web: byteUtilsName === 'webByteUtils', output, error }); }); } - if (utility === 'toUTF8') - for (const test of toUTF8WebPlatformSpecTests) { - it(`throws error when fatal is set and provided ${test.name} as input`, () => { - expect(() => - byteUtils[utility](Uint8Array.from(test.input), 0, test.input.length, true) - ).to.throw(BSONError, /Invalid UTF-8 string in BSON document/i); - }); - } }); } } diff --git a/test/node/data/utf8_wpt_error_cases.ts b/test/node/data/utf8_wpt_error_cases.ts new file mode 100644 index 000000000..6d3a98135 --- /dev/null +++ b/test/node/data/utf8_wpt_error_cases.ts @@ -0,0 +1,67 @@ +// extra error cases copied from wpt/encoding/textdecoder-fatal.any.js +// commit sha: 7c9f867 +// link: https://github.com/web-platform-tests/wpt/commit/7c9f8674d9809731e8919073d957d6233f6e0544 + +export const utf8WebPlatformSpecTests = [ + { encoding: 'utf-8', input: [0xff], name: 'invalid code' }, + { encoding: 'utf-8', input: [0xc0], name: 'ends early' }, + { encoding: 'utf-8', input: [0xe0], name: 'ends early 2' }, + { encoding: 'utf-8', input: [0xc0, 0x00], name: 'invalid trail' }, + { encoding: 'utf-8', input: [0xc0, 0xc0], name: 'invalid trail 2' }, + { encoding: 'utf-8', input: [0xe0, 0x00], name: 'invalid trail 3' }, + { encoding: 'utf-8', input: [0xe0, 0xc0], name: 'invalid trail 4' }, + { encoding: 'utf-8', input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' }, + { encoding: 'utf-8', input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' }, + { encoding: 'utf-8', input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' }, + { encoding: 'utf-8', input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' }, + + // Overlong encodings + { encoding: 'utf-8', input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' }, + { encoding: 'utf-8', input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' 
}, + { encoding: 'utf-8', input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' }, + { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], + name: 'overlong U+0000 - 6 bytes' + }, + + { encoding: 'utf-8', input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' }, + { encoding: 'utf-8', input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' }, + { encoding: 'utf-8', input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' }, + { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf], + name: 'overlong U+007f - 6 bytes' + }, + + { encoding: 'utf-8', input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' }, + { encoding: 'utf-8', input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' }, + { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf], + name: 'overlong U+07ff - 6 bytes' + }, + + { encoding: 'utf-8', input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' }, + { encoding: 'utf-8', input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf], + name: 'overlong U+ffff - 6 bytes' + }, + + { encoding: 'utf-8', input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf], + name: 'overlong U+10ffff - 6 bytes' + }, + + // UTF-16 surrogates encoded as code points in UTF-8 + { encoding: 'utf-8', input: [0xed, 0xa0, 0x80], name: 'lead surrogate' }, + { encoding: 'utf-8', input: [0xed, 0xb0, 0x80], name: 'trail surrogate' }, + { encoding: 'utf-8', input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate
pair' } +]; diff --git a/test/node/parser/deserializer.test.ts b/test/node/parser/deserializer.test.ts index 005ccefa4..30c684be5 100644 --- a/test/node/parser/deserializer.test.ts +++ b/test/node/parser/deserializer.test.ts @@ -1,6 +1,7 @@ import * as BSON from '../../register-bson'; import { expect } from 'chai'; -import { bufferFromHexArray } from '../tools/utils'; +import { bufferFromHexArray, int32LEToHex } from '../tools/utils'; +import { utf8WebPlatformSpecTests } from '../data/utf8_wpt_error_cases'; describe('deserializer()', () => { describe('when the fieldsAsRaw options is present and has a value that corresponds to a key in the object', () => { @@ -58,4 +59,47 @@ describe('deserializer()', () => { expect(resultCodeWithScope).to.have.deep.nested.property('a.scope', { b: true }); }); }); + + describe('utf8 validation', () => { + for (const test of utf8WebPlatformSpecTests) { + const inputStringSize = int32LEToHex(test.input.length + 1); // int32 size of string + const inputHexString = Buffer.from(test.input).toString('hex'); + const buffer = bufferFromHexArray([ + '02', // string + '6100', // 'a' key with null terminator + inputStringSize, + inputHexString, + '00' + ]); + context(`when utf8 validation is on and input is ${test.name}`, () => { + it(`throws error containing 'Invalid UTF-8'`, () => { + // global case + expect(() => BSON.deserialize(buffer, { validation: { utf8: true } })).to.throw( + BSON.BSONError, + /Invalid UTF-8 string in BSON document/i + ); + + // specific case + expect(() => BSON.deserialize(buffer, { validation: { utf8: { a: true } } })).to.throw( + BSON.BSONError, + /Invalid UTF-8 string in BSON document/i + ); + }); + }); + + context(`when utf8 validation is off and input is ${test.name}`, () => { + it('returns a string containing at least 1 replacement character', () => { + // global case + expect(BSON.deserialize(buffer, { validation: { utf8: false } })) + .to.have.property('a') + .that.includes('\uFFFD'); + + // specific case + 
expect(BSON.deserialize(buffer, { validation: { utf8: { a: false } } })) + .to.have.property('a') + .that.includes('\uFFFD'); + }); + }); + } + }); });