Skip to content

Commit

Permalink
fix(NODE-6123): toUtf8 validation insufficiently strict
Browse files Browse the repository at this point in the history
  • Loading branch information
aditi-khare-mongoDB committed Apr 24, 2024
1 parent c58d1e2 commit 8d2ed1e
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 57 deletions.
2 changes: 1 addition & 1 deletion etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ export class RequireVendor {
* @returns {{ code: string; map: import('magic-string').SourceMap }}
*/
transform(code, id) {
if (!id.includes('web_byte_utils')) {
if (!id.includes('validate_utf8')) {
return;
}

Expand Down
13 changes: 6 additions & 7 deletions src/parser/deserializer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ import { BSONSymbol } from '../symbol';
import { Timestamp } from '../timestamp';
import { ByteUtils } from '../utils/byte_utils';
import { NumberUtils } from '../utils/number_utils';
import { validateUtf8 } from '../validate_utf8';

/** @public */
export interface DeserializeOptions {
Expand Down Expand Up @@ -604,12 +603,12 @@ function deserializeObject(
)
throw new BSONError('bad string length in bson');
// Namespace
if (validation != null && validation.utf8) {
if (!validateUtf8(buffer, index, index + stringSize - 1)) {
throw new BSONError('Invalid UTF-8 string in BSON document');
}
}
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false);
const namespace = ByteUtils.toUTF8(
buffer,
index,
index + stringSize - 1,
validation != null && (validation.utf8 as boolean)
);
// Update parse index position
index = index + stringSize;

Expand Down
2 changes: 1 addition & 1 deletion src/utils/node_byte_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ export const nodeJsByteUtils = {
// TODO(NODE-4930): Insufficiently strict BSON UTF8 validation
for (let i = 0; i < string.length; i++) {
if (string.charCodeAt(i) === 0xfffd) {
if (!validateUtf8(buffer, start, end)) {
if (!validateUtf8(buffer, start, end, fatal)) {
throw new BSONError('Invalid UTF-8 string in BSON document');
}
break;
Expand Down
10 changes: 2 additions & 8 deletions src/utils/web_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { BSONError } from '../error';
import { tryReadBasicLatin } from './latin';
import { validateUtf8 } from '../validate_utf8';

type TextDecoder = {
readonly encoding: string;
Expand Down Expand Up @@ -179,14 +180,7 @@ export const webByteUtils = {
return basicLatin;
}

if (fatal) {
try {
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
} catch (cause) {
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
}
}
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
return validateUtf8(uint8array, start, end, fatal);
},

utf8ByteLength(input: string): number {
Expand Down
66 changes: 32 additions & 34 deletions src/validate_utf8.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,26 @@
const FIRST_BIT = 0x80;
const FIRST_TWO_BITS = 0xc0;
const FIRST_THREE_BITS = 0xe0;
const FIRST_FOUR_BITS = 0xf0;
const FIRST_FIVE_BITS = 0xf8;
import { BSONError } from './error';

const TWO_BIT_CHAR = 0xc0;
const THREE_BIT_CHAR = 0xe0;
const FOUR_BIT_CHAR = 0xf0;
const CONTINUING_CHAR = 0x80;
type TextDecoder = {
readonly encoding: string;
readonly fatal: boolean;
readonly ignoreBOM: boolean;
decode(input?: Uint8Array): string;
};
type TextDecoderConstructor = {
new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder;
};

type TextEncoder = {
readonly encoding: string;
encode(input?: string): Uint8Array;
};
type TextEncoderConstructor = {
new (): TextEncoder;
};

// Node byte utils global
declare const TextDecoder: TextDecoderConstructor;
declare const TextEncoder: TextEncoderConstructor;

/**
* Determines if the passed in bytes are valid utf8
Expand All @@ -16,32 +29,17 @@ const CONTINUING_CHAR = 0x80;
* @param end - The index to end validating
*/
export function validateUtf8(
bytes: { [index: number]: number },
buffer: Uint8Array,
start: number,
end: number
): boolean {
let continuation = 0;

for (let i = start; i < end; i += 1) {
const byte = bytes[i];

if (continuation) {
if ((byte & FIRST_TWO_BITS) !== CONTINUING_CHAR) {
return false;
}
continuation -= 1;
} else if (byte & FIRST_BIT) {
if ((byte & FIRST_THREE_BITS) === TWO_BIT_CHAR) {
continuation = 1;
} else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR) {
continuation = 2;
} else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR) {
continuation = 3;
} else {
return false;
}
end: number,
fatal: boolean
): string {
if (fatal) {
try {
return new TextDecoder('utf8', { fatal }).decode(buffer.slice(start, end));
} catch (cause) {
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
}
}

return !continuation;
return new TextDecoder('utf8', { fatal }).decode(buffer.slice(start, end));
}
103 changes: 97 additions & 6 deletions test/node/byte_utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils';
import * as sinon from 'sinon';
import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson';
import * as crypto from 'node:crypto';
import { BSONError } from '../../src/error';

type ByteUtilTest<K extends keyof ByteUtils> = {
name: string;
Expand Down Expand Up @@ -399,6 +400,7 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [
}
}
];

const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
{
name: 'should create utf8 string from buffer input',
Expand All @@ -417,21 +419,57 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
}
},
{
name: 'should throw an error if fatal is set and string is invalid',
name: 'should insert replacement character fatal is false and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
expectation({ error, output }) {
expect(error).to.not.exist;
expect(output).to.equal('abc\uFFFD');
}
},
{
name: 'should throw an error if fatal is set and string is a sequence that decodes to an invalid code point',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'should insert replacement character fatal is false and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
expectation({ error, output }) {
expect(error).to.not.exist;
expect(output).to.equal('abc\uFFFD');
name: 'throw an error if fatal is set and string contains overlong encoding',
inputs: [Buffer.from('11000000025f0005000000f08282ac0000', 'hex'), 0, 18, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string contains invalid bytes',
inputs: [Buffer.from('abcff', 'hex'), 0, 2, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string contains an unexpected continuation byte',
inputs: [Buffer.from('7F80', 'hex'), 0, 2, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string contains a non-continuation byte before the end of the character',
inputs: [Buffer.from('c000', 'hex'), 0, 2, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string ends before the end of the character',
inputs: [Buffer.from('c0', 'hex'), 0, 1, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
}
];

const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
{
name: 'should return zero for empty string',
Expand Down Expand Up @@ -493,6 +531,51 @@ const randomBytesTests: ByteUtilTest<'randomBytes'>[] = [
}
];

// extra error cases copied from Web platform specs
const toUTF8ErrorCaseTests = [
{ input: [0xff], name: 'invalid code' },
{ input: [0xc0], name: 'ends early' },
{ input: [0xe0], name: 'ends early 2' },
{ input: [0xc0, 0x00], name: 'invalid trail' },
{ input: [0xc0, 0xc0], name: 'invalid trail 2' },
{ input: [0xe0, 0x00], name: 'invalid trail 3' },
{ input: [0xe0, 0xc0], name: 'invalid trail 4' },
{ input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' },
{ input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' },
{ input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' },
{ input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },

// Overlong encodings
{ input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' },
{ input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
{ input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
{ input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
{ input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 6 bytes' },

{ input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' },
{ input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' },
{ input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' },
{ input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' },
{ input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 6 bytes' },

{ input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' },
{ input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' },
{ input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' },
{ input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 6 bytes' },

{ input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' },
{ input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' },
{ input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 6 bytes' },

{ input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' },
{ input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 6 bytes' },

// UTf-16 surrogates encoded as code points in UTf-8
{ input: [0xed, 0xa0, 0x80], name: 'lead surrogate' },
{ input: [0xed, 0xb0, 0x80], name: 'trail surrogate' },
{ input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' }
];

const utils = new Map([
['nodeJsByteUtils', nodeJsByteUtils],
['webByteUtils', webByteUtils]
Expand Down Expand Up @@ -798,6 +881,14 @@ describe('ByteUtils', () => {
test.expectation({ web: byteUtilsName === 'webByteUtils', output, error });
});
}
if (utility === 'toUTF8')
for (const test of toUTF8ErrorCaseTests) {
it(`throws error when fatal is set and provided ${test.name} as input`, () => {
expect(() =>
byteUtils[utility](Uint8Array.from(test.input), 0, test.input.length, true)
).to.throw(BSONError, /Invalid UTF-8 string in BSON document/i);
});
}
});
}
}
Expand Down

0 comments on commit 8d2ed1e

Please sign in to comment.