Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(NODE-6123): utf8 validation is insufficiently strict #676

Merged
merged 8 commits into from
Apr 30, 2024
34 changes: 22 additions & 12 deletions etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import MagicString from 'magic-string';

const REQUIRE_POLYFILLS =
`const { TextEncoder, TextDecoder } = require('../vendor/text-encoding');
const REQUIRE_WEB_UTILS_POLYFILLS =
`const { TextEncoder } = require('../vendor/text-encoding');
const { encode: btoa, decode: atob } = require('../vendor/base64');\n`

const REQUIRE_VALIDATE_UTF8_POLYFILLS =
`const { TextDecoder } = require('../vendor/text-encoding');`;
nbbeeken marked this conversation as resolved.
Show resolved Hide resolved

export class RequireVendor {
/**
* Take the compiled source code input; types are expected to already have been removed.
Expand All @@ -14,17 +17,24 @@ export class RequireVendor {
* @returns {{ code: string; map: import('magic-string').SourceMap }}
*/
transform(code, id) {
if (!id.includes('web_byte_utils')) {
return;
}
if (id.includes('validate_utf8')) {
// MagicString lets us edit the source code and still generate an accurate source map
const magicString = new MagicString(code);
magicString.prepend(REQUIRE_VALIDATE_UTF8_POLYFILLS);

// MagicString lets us edit the source code and still generate an accurate source map
const magicString = new MagicString(code);
magicString.prepend(REQUIRE_POLYFILLS);
return {
code: magicString.toString(),
map: magicString.generateMap({ hires: true })
};
} else if (id.includes('web_byte_utils')) {
// MagicString lets us edit the source code and still generate an accurate source map
const magicString = new MagicString(code);
magicString.prepend(REQUIRE_WEB_UTILS_POLYFILLS);

return {
code: magicString.toString(),
map: magicString.generateMap({ hires: true })
};
return {
code: magicString.toString(),
map: magicString.generateMap({ hires: true })
};
}
}
}
13 changes: 6 additions & 7 deletions src/parser/deserializer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ import { BSONSymbol } from '../symbol';
import { Timestamp } from '../timestamp';
import { ByteUtils } from '../utils/byte_utils';
import { NumberUtils } from '../utils/number_utils';
import { validateUtf8 } from '../validate_utf8';

/** @public */
export interface DeserializeOptions {
Expand Down Expand Up @@ -604,12 +603,12 @@ function deserializeObject(
)
throw new BSONError('bad string length in bson');
// Namespace
if (validation != null && validation.utf8) {
if (!validateUtf8(buffer, index, index + stringSize - 1)) {
throw new BSONError('Invalid UTF-8 string in BSON document');
}
}
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false);
const namespace = ByteUtils.toUTF8(
buffer,
index,
index + stringSize - 1,
validation != null && (validation.utf8 as boolean)
nbbeeken marked this conversation as resolved.
Show resolved Hide resolved
);
// Update parse index position
index = index + stringSize;

Expand Down
2 changes: 1 addition & 1 deletion src/utils/node_byte_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ export const nodeJsByteUtils = {
// TODO(NODE-4930): Insufficiently strict BSON UTF8 validation
nbbeeken marked this conversation as resolved.
Show resolved Hide resolved
for (let i = 0; i < string.length; i++) {
if (string.charCodeAt(i) === 0xfffd) {
if (!validateUtf8(buffer, start, end)) {
if (!validateUtf8(buffer, start, end, fatal)) {
nbbeeken marked this conversation as resolved.
Show resolved Hide resolved
throw new BSONError('Invalid UTF-8 string in BSON document');
}
break;
Expand Down
10 changes: 2 additions & 8 deletions src/utils/web_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { BSONError } from '../error';
import { tryReadBasicLatin } from './latin';
import { validateUtf8 } from '../validate_utf8';

type TextDecoder = {
readonly encoding: string;
Expand Down Expand Up @@ -179,14 +180,7 @@ export const webByteUtils = {
return basicLatin;
}

if (fatal) {
try {
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
} catch (cause) {
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
}
}
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
return validateUtf8(uint8array, start, end, fatal);
},

utf8ByteLength(input: string): number {
Expand Down
71 changes: 37 additions & 34 deletions src/validate_utf8.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,29 @@
const FIRST_BIT = 0x80;
const FIRST_TWO_BITS = 0xc0;
const FIRST_THREE_BITS = 0xe0;
const FIRST_FOUR_BITS = 0xf0;
const FIRST_FIVE_BITS = 0xf8;
import { BSONError } from './error';

const TWO_BIT_CHAR = 0xc0;
const THREE_BIT_CHAR = 0xe0;
const FOUR_BIT_CHAR = 0xf0;
const CONTINUING_CHAR = 0x80;
type TextDecoder = {
readonly encoding: string;
readonly fatal: boolean;
readonly ignoreBOM: boolean;
decode(input?: Uint8Array): string;
};
type TextDecoderConstructor = {
new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder;
};

type TextEncoder = {
readonly encoding: string;
encode(input?: string): Uint8Array;
};
type TextEncoderConstructor = {
new (): TextEncoder;
};
nbbeeken marked this conversation as resolved.
Show resolved Hide resolved

// validate utf8 globals
declare const TextDecoder: TextDecoderConstructor;
declare const TextEncoder: TextEncoderConstructor;

let TextDecoderFatal: TextDecoder;
let TextDecoderNonFatal: TextDecoder;

/**
* Determines if the passed in bytes are valid utf8
Expand All @@ -16,32 +32,19 @@ const CONTINUING_CHAR = 0x80;
* @param end - The index to end validating
*/
export function validateUtf8(
bytes: { [index: number]: number },
buffer: Uint8Array,
start: number,
end: number
): boolean {
let continuation = 0;

for (let i = start; i < end; i += 1) {
const byte = bytes[i];

if (continuation) {
if ((byte & FIRST_TWO_BITS) !== CONTINUING_CHAR) {
return false;
}
continuation -= 1;
} else if (byte & FIRST_BIT) {
if ((byte & FIRST_THREE_BITS) === TWO_BIT_CHAR) {
continuation = 1;
} else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR) {
continuation = 2;
} else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR) {
continuation = 3;
} else {
return false;
}
end: number,
fatal: boolean
): string {
if (fatal) {
try {
TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true });
return TextDecoderFatal.decode(buffer.slice(start, end));
nbbeeken marked this conversation as resolved.
Show resolved Hide resolved
} catch (cause) {
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
}
}

return !continuation;
TextDecoderNonFatal ??= new TextDecoder('utf8', { fatal: false });
return TextDecoderNonFatal.decode(buffer.slice(start, end));
nbbeeken marked this conversation as resolved.
Show resolved Hide resolved
}
103 changes: 97 additions & 6 deletions test/node/byte_utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils';
import * as sinon from 'sinon';
import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson';
import * as crypto from 'node:crypto';
import { BSONError } from '../../src/error';

type ByteUtilTest<K extends keyof ByteUtils> = {
name: string;
Expand Down Expand Up @@ -399,6 +400,7 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [
}
}
];

const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
{
name: 'should create utf8 string from buffer input',
Expand All @@ -417,21 +419,57 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
}
},
{
name: 'should throw an error if fatal is set and string is invalid',
name: 'should insert replacement character fatal is false and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
expectation({ error, output }) {
expect(error).to.not.exist;
expect(output).to.equal('abc\uFFFD');
}
},
{
name: 'should throw an error if fatal is set and string is a sequence that decodes to an invalid code point',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'should insert replacement character fatal is false and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
expectation({ error, output }) {
expect(error).to.not.exist;
expect(output).to.equal('abc\uFFFD');
name: 'throw an error if fatal is set and string contains overlong encoding',
inputs: [Buffer.from('11000000025f0005000000f08282ac0000', 'hex'), 0, 18, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string contains invalid bytes',
nbbeeken marked this conversation as resolved.
Show resolved Hide resolved
inputs: [Buffer.from('abcff', 'hex'), 0, 2, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string contains an unexpected continuation byte',
inputs: [Buffer.from('7F80', 'hex'), 0, 2, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string contains a non-continuation byte before the end of the character',
inputs: [Buffer.from('c000', 'hex'), 0, 2, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string ends before the end of the character',
inputs: [Buffer.from('c0', 'hex'), 0, 1, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
}
];

const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
{
name: 'should return zero for empty string',
Expand Down Expand Up @@ -493,6 +531,51 @@ const randomBytesTests: ByteUtilTest<'randomBytes'>[] = [
}
];

// extra error cases copied from Web platform specs
const toUTF8ErrorCaseTests = [
{ input: [0xff], name: 'invalid code' },
nbbeeken marked this conversation as resolved.
Show resolved Hide resolved
{ input: [0xc0], name: 'ends early' },
{ input: [0xe0], name: 'ends early 2' },
{ input: [0xc0, 0x00], name: 'invalid trail' },
{ input: [0xc0, 0xc0], name: 'invalid trail 2' },
{ input: [0xe0, 0x00], name: 'invalid trail 3' },
{ input: [0xe0, 0xc0], name: 'invalid trail 4' },
{ input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' },
{ input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' },
{ input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' },
{ input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },

// Overlong encodings
{ input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' },
{ input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
{ input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
{ input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
{ input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 6 bytes' },

{ input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' },
{ input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' },
{ input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' },
{ input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' },
{ input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 6 bytes' },

{ input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' },
{ input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' },
{ input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' },
{ input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 6 bytes' },

{ input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' },
{ input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' },
{ input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 6 bytes' },

{ input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' },
{ input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 6 bytes' },

// UTf-16 surrogates encoded as code points in UTf-8
{ input: [0xed, 0xa0, 0x80], name: 'lead surrogate' },
{ input: [0xed, 0xb0, 0x80], name: 'trail surrogate' },
{ input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' }
];

const utils = new Map([
['nodeJsByteUtils', nodeJsByteUtils],
['webByteUtils', webByteUtils]
Expand Down Expand Up @@ -798,6 +881,14 @@ describe('ByteUtils', () => {
test.expectation({ web: byteUtilsName === 'webByteUtils', output, error });
});
}
if (utility === 'toUTF8')
for (const test of toUTF8ErrorCaseTests) {
it(`throws error when fatal is set and provided ${test.name} as input`, () => {
expect(() =>
byteUtils[utility](Uint8Array.from(test.input), 0, test.input.length, true)
).to.throw(BSONError, /Invalid UTF-8 string in BSON document/i);
});
}
});
}
}
Expand Down