From 8d2ed1e5b6fc352c4720b3d8cb2445186fb48194 Mon Sep 17 00:00:00 2001
From: Aditi Khare <aditi.khare@mongodb.com>
Date: Wed, 24 Apr 2024 15:49:16 -0400
Subject: [PATCH 1/8] fix(NODE-6123): toUtf8 validation insufficiently strict

---
 .../require_vendor.mjs                        |   2 +-
 src/parser/deserializer.ts                    |  13 +--
 src/utils/node_byte_utils.ts                  |   2 +-
 src/utils/web_byte_utils.ts                   |  10 +-
 src/validate_utf8.ts                          |  66 ++++++-----
 test/node/byte_utils.test.ts                  | 103 +++++++++++++++++-
 6 files changed, 139 insertions(+), 57 deletions(-)

diff --git a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
index 7d4fa4e91..659afe8d9 100644
--- a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
+++ b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
@@ -14,7 +14,7 @@ export class RequireVendor {
    * @returns {{ code: string; map: import('magic-string').SourceMap }}
    */
   transform(code, id) {
-    if (!id.includes('web_byte_utils')) {
+    if (!id.includes('validate_utf8')) {
       return;
     }
 
diff --git a/src/parser/deserializer.ts b/src/parser/deserializer.ts
index ac2781903..a01a167d0 100644
--- a/src/parser/deserializer.ts
+++ b/src/parser/deserializer.ts
@@ -16,7 +16,6 @@ import { BSONSymbol } from '../symbol';
 import { Timestamp } from '../timestamp';
 import { ByteUtils } from '../utils/byte_utils';
 import { NumberUtils } from '../utils/number_utils';
-import { validateUtf8 } from '../validate_utf8';
 
 /** @public */
 export interface DeserializeOptions {
@@ -604,12 +603,12 @@ function deserializeObject(
       )
         throw new BSONError('bad string length in bson');
       // Namespace
-      if (validation != null && validation.utf8) {
-        if (!validateUtf8(buffer, index, index + stringSize - 1)) {
-          throw new BSONError('Invalid UTF-8 string in BSON document');
-        }
-      }
-      const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false);
+      const namespace = ByteUtils.toUTF8(
+        buffer,
+        index,
+        index + stringSize - 1,
+        validation != null && (validation.utf8 as boolean)
+      );
       // Update parse index position
       index = index + stringSize;
 
diff --git a/src/utils/node_byte_utils.ts b/src/utils/node_byte_utils.ts
index d6a641a47..d9487046a 100644
--- a/src/utils/node_byte_utils.ts
+++ b/src/utils/node_byte_utils.ts
@@ -139,7 +139,7 @@ export const nodeJsByteUtils = {
       // TODO(NODE-4930): Insufficiently strict BSON UTF8 validation
       for (let i = 0; i < string.length; i++) {
         if (string.charCodeAt(i) === 0xfffd) {
-          if (!validateUtf8(buffer, start, end)) {
+          if (!validateUtf8(buffer, start, end, fatal)) {
             throw new BSONError('Invalid UTF-8 string in BSON document');
           }
           break;
diff --git a/src/utils/web_byte_utils.ts b/src/utils/web_byte_utils.ts
index 77a1f0f74..f2d1b09b1 100644
--- a/src/utils/web_byte_utils.ts
+++ b/src/utils/web_byte_utils.ts
@@ -1,5 +1,6 @@
 import { BSONError } from '../error';
 import { tryReadBasicLatin } from './latin';
+import { validateUtf8 } from '../validate_utf8';
 
 type TextDecoder = {
   readonly encoding: string;
@@ -179,14 +180,7 @@ export const webByteUtils = {
       return basicLatin;
     }
 
-    if (fatal) {
-      try {
-        return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
-      } catch (cause) {
-        throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
-      }
-    }
-    return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
+    return validateUtf8(uint8array, start, end, fatal);
   },
 
   utf8ByteLength(input: string): number {
diff --git a/src/validate_utf8.ts b/src/validate_utf8.ts
index e1da934c6..1d2a81565 100644
--- a/src/validate_utf8.ts
+++ b/src/validate_utf8.ts
@@ -1,13 +1,26 @@
-const FIRST_BIT = 0x80;
-const FIRST_TWO_BITS = 0xc0;
-const FIRST_THREE_BITS = 0xe0;
-const FIRST_FOUR_BITS = 0xf0;
-const FIRST_FIVE_BITS = 0xf8;
+import { BSONError } from './error';
 
-const TWO_BIT_CHAR = 0xc0;
-const THREE_BIT_CHAR = 0xe0;
-const FOUR_BIT_CHAR = 0xf0;
-const CONTINUING_CHAR = 0x80;
+type TextDecoder = {
+  readonly encoding: string;
+  readonly fatal: boolean;
+  readonly ignoreBOM: boolean;
+  decode(input?: Uint8Array): string;
+};
+type TextDecoderConstructor = {
+  new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder;
+};
+
+type TextEncoder = {
+  readonly encoding: string;
+  encode(input?: string): Uint8Array;
+};
+type TextEncoderConstructor = {
+  new (): TextEncoder;
+};
+
+// Node byte utils global
+declare const TextDecoder: TextDecoderConstructor;
+declare const TextEncoder: TextEncoderConstructor;
 
 /**
  * Determines if the passed in bytes are valid utf8
@@ -16,32 +29,17 @@ const CONTINUING_CHAR = 0x80;
  * @param end - The index to end validating
  */
 export function validateUtf8(
-  bytes: { [index: number]: number },
+  buffer: Uint8Array,
   start: number,
-  end: number
-): boolean {
-  let continuation = 0;
-
-  for (let i = start; i < end; i += 1) {
-    const byte = bytes[i];
-
-    if (continuation) {
-      if ((byte & FIRST_TWO_BITS) !== CONTINUING_CHAR) {
-        return false;
-      }
-      continuation -= 1;
-    } else if (byte & FIRST_BIT) {
-      if ((byte & FIRST_THREE_BITS) === TWO_BIT_CHAR) {
-        continuation = 1;
-      } else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR) {
-        continuation = 2;
-      } else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR) {
-        continuation = 3;
-      } else {
-        return false;
-      }
+  end: number,
+  fatal: boolean
+): string {
+  if (fatal) {
+    try {
+      return new TextDecoder('utf8', { fatal }).decode(buffer.slice(start, end));
+    } catch (cause) {
+      throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
     }
   }
-
-  return !continuation;
+  return new TextDecoder('utf8', { fatal }).decode(buffer.slice(start, end));
 }
diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts
index fa6d7f893..a41b19d3a 100644
--- a/test/node/byte_utils.test.ts
+++ b/test/node/byte_utils.test.ts
@@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils';
 import * as sinon from 'sinon';
 import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson';
 import * as crypto from 'node:crypto';
+import { BSONError } from '../../src/error';
 
 type ByteUtilTest<K extends keyof ByteUtils> = {
   name: string;
@@ -399,6 +400,7 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [
     }
   }
 ];
+
 const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
   {
     name: 'should create utf8 string from buffer input',
@@ -417,21 +419,57 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
     }
   },
   {
-    name: 'should throw an error if fatal is set and string is invalid',
+    name: 'should insert replacement character fatal is false and string is invalid',
+    inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
+    expectation({ error, output }) {
+      expect(error).to.not.exist;
+      expect(output).to.equal('abc\uFFFD');
+    }
+  },
+  {
+    name: 'should throw an error if fatal is set and string is a sequence that decodes to an invalid code point',
     inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
     expectation({ error }) {
       expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
     }
   },
   {
-    name: 'should insert replacement character fatal is false and string is invalid',
-    inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
-    expectation({ error, output }) {
-      expect(error).to.not.exist;
-      expect(output).to.equal('abc\uFFFD');
+    name: 'throw an error if fatal is set and string contains overlong encoding',
+    inputs: [Buffer.from('11000000025f0005000000f08282ac0000', 'hex'), 0, 18, true],
+    expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  {
+    name: 'throw an error if fatal is set and string contains invalid bytes',
+    inputs: [Buffer.from('abcff', 'hex'), 0, 2, true],
+    expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  {
+    name: 'throw an error if fatal is set and string contains an unexpected continuation byte',
+    inputs: [Buffer.from('7F80', 'hex'), 0, 2, true],
+    expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  {
+    name: 'throw an error if fatal is set and string contains a non-continuation byte before the end of the character',
+    inputs: [Buffer.from('c000', 'hex'), 0, 2, true],
+    expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  {
+    name: 'throw an error if fatal is set and string ends before the end of the character',
+    inputs: [Buffer.from('c0', 'hex'), 0, 1, true],
+    expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
     }
   }
 ];
+
 const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
   {
     name: 'should return zero for empty string',
@@ -493,6 +531,51 @@ const randomBytesTests: ByteUtilTest<'randomBytes'>[] = [
   }
 ];
 
+// extra error cases copied from Web platform specs
+const toUTF8ErrorCaseTests = [
+  { input: [0xff], name: 'invalid code' },
+  { input: [0xc0], name: 'ends early' },
+  { input: [0xe0], name: 'ends early 2' },
+  { input: [0xc0, 0x00], name: 'invalid trail' },
+  { input: [0xc0, 0xc0], name: 'invalid trail 2' },
+  { input: [0xe0, 0x00], name: 'invalid trail 3' },
+  { input: [0xe0, 0xc0], name: 'invalid trail 4' },
+  { input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' },
+  { input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' },
+  { input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' },
+  { input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },
+
+  // Overlong encodings
+  { input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' },
+  { input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
+  { input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
+  { input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
+  { input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 6 bytes' },
+
+  { input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' },
+  { input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' },
+  { input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' },
+  { input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' },
+  { input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 6 bytes' },
+
+  { input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' },
+  { input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' },
+  { input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' },
+  { input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 6 bytes' },
+
+  { input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' },
+  { input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' },
+  { input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 6 bytes' },
+
+  { input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' },
+  { input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 6 bytes' },
+
+  // UTf-16 surrogates encoded as code points in UTf-8
+  { input: [0xed, 0xa0, 0x80], name: 'lead surrogate' },
+  { input: [0xed, 0xb0, 0x80], name: 'trail surrogate' },
+  { input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' }
+];
+
 const utils = new Map([
   ['nodeJsByteUtils', nodeJsByteUtils],
   ['webByteUtils', webByteUtils]
@@ -798,6 +881,14 @@ describe('ByteUtils', () => {
             test.expectation({ web: byteUtilsName === 'webByteUtils', output, error });
           });
         }
+        if (utility === 'toUTF8')
+          for (const test of toUTF8ErrorCaseTests) {
+            it(`throws error when fatal is set and provided ${test.name} as input`, () => {
+              expect(() =>
+                byteUtils[utility](Uint8Array.from(test.input), 0, test.input.length, true)
+              ).to.throw(BSONError, /Invalid UTF-8 string in BSON document/i);
+            });
+          }
       });
     }
   }

From 9d3033af3b739d6f720bd5f3ef09e6ec6840beef Mon Sep 17 00:00:00 2001
From: Aditi Khare <aditi.khare@mongodb.com>
Date: Wed, 24 Apr 2024 16:55:56 -0400
Subject: [PATCH 2/8] store decoder instances so they are not re-created on each call

---
 src/validate_utf8.ts | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/validate_utf8.ts b/src/validate_utf8.ts
index 1d2a81565..42782e75c 100644
--- a/src/validate_utf8.ts
+++ b/src/validate_utf8.ts
@@ -18,10 +18,13 @@ type TextEncoderConstructor = {
   new (): TextEncoder;
 };
 
-// Node byte utils global
+// validate utf8 globals
 declare const TextDecoder: TextDecoderConstructor;
 declare const TextEncoder: TextEncoderConstructor;
 
+const TextDecoderFatal: TextDecoder = new TextDecoder('utf8', { fatal: true });
+const TextDecoderNonFatal: TextDecoder = new TextDecoder('utf8', { fatal: false });
+
 /**
  * Determines if the passed in bytes are valid utf8
  * @param bytes - An array of 8-bit bytes. Must be indexable and have length property
@@ -36,10 +39,10 @@ export function validateUtf8(
 ): string {
   if (fatal) {
     try {
-      return new TextDecoder('utf8', { fatal }).decode(buffer.slice(start, end));
+      return TextDecoderFatal.decode(buffer.slice(start, end));
     } catch (cause) {
       throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
     }
   }
-  return new TextDecoder('utf8', { fatal }).decode(buffer.slice(start, end));
+  return TextDecoderNonFatal.decode(buffer.slice(start, end));
 }

From 8a07891c057a600d8fe3c0c2cb1dbbb5f5ed182d Mon Sep 17 00:00:00 2001
From: Aditi Khare <aditi.khare@mongodb.com>
Date: Thu, 25 Apr 2024 11:08:18 -0400
Subject: [PATCH 3/8] lazy load text decoder

---
 src/validate_utf8.ts | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/validate_utf8.ts b/src/validate_utf8.ts
index 42782e75c..dc9dfe353 100644
--- a/src/validate_utf8.ts
+++ b/src/validate_utf8.ts
@@ -22,8 +22,8 @@ type TextEncoderConstructor = {
 declare const TextDecoder: TextDecoderConstructor;
 declare const TextEncoder: TextEncoderConstructor;
 
-const TextDecoderFatal: TextDecoder = new TextDecoder('utf8', { fatal: true });
-const TextDecoderNonFatal: TextDecoder = new TextDecoder('utf8', { fatal: false });
+let TextDecoderFatal: TextDecoder;
+let TextDecoderNonFatal: TextDecoder;
 
 /**
  * Determines if the passed in bytes are valid utf8
@@ -39,10 +39,12 @@ export function validateUtf8(
 ): string {
   if (fatal) {
     try {
+      TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true });
       return TextDecoderFatal.decode(buffer.slice(start, end));
     } catch (cause) {
       throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
     }
   }
+  TextDecoderNonFatal ??= new TextDecoder('utf8', { fatal: false });
   return TextDecoderNonFatal.decode(buffer.slice(start, end));
 }

From d6b87141f301131c6cfa9c18f8ef385290c423f2 Mon Sep 17 00:00:00 2001
From: Aditi Khare <aditi.khare@mongodb.com>
Date: Thu, 25 Apr 2024 16:41:13 -0400
Subject: [PATCH 4/8] fix require polyfill logic

---
 .../require_vendor.mjs                        | 34 ++++++++++++-------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
index 659afe8d9..abff43b84 100644
--- a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
+++ b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
@@ -1,9 +1,12 @@
 import MagicString from 'magic-string';
 
-const REQUIRE_POLYFILLS =
-  `const { TextEncoder, TextDecoder } = require('../vendor/text-encoding');
+const REQUIRE_WEB_UTILS_POLYFILLS =
+  `const { TextEncoder } = require('../vendor/text-encoding');
 const { encode: btoa, decode: atob } = require('../vendor/base64');\n`
 
+const REQUIRE_VALIDATE_UTF8_POLYFILLS = 
+  `const { TextEncoder } = require('../vendor/text-encoding');`;
+
 export class RequireVendor {
   /**
    * Take the compiled source code input; types are expected to already have been removed.
@@ -14,17 +17,24 @@ export class RequireVendor {
    * @returns {{ code: string; map: import('magic-string').SourceMap }}
    */
   transform(code, id) {
-    if (!id.includes('validate_utf8')) {
-      return;
-    }
+    if (id.includes('validate_utf8')) {
+      // MagicString lets us edit the source code and still generate an accurate source map
+      const magicString = new MagicString(code);
+      magicString.prepend(REQUIRE_VALIDATE_UTF8_POLYFILLS);
 
-    // MagicString lets us edit the source code and still generate an accurate source map
-    const magicString = new MagicString(code);
-    magicString.prepend(REQUIRE_POLYFILLS);
+      return {
+        code: magicString.toString(),
+        map: magicString.generateMap({ hires: true })
+      };
+    } else if (id.includes('web_byte_utils')) {
+      // MagicString lets us edit the source code and still generate an accurate source map
+      const magicString = new MagicString(code);
+      magicString.prepend(REQUIRE_WEB_UTILS_POLYFILLS);
 
-    return {
-      code: magicString.toString(),
-      map: magicString.generateMap({ hires: true })
-    };
+      return {
+        code: magicString.toString(),
+        map: magicString.generateMap({ hires: true })
+      };
+    }
   }
 }

From 75b9485663f6424edec017f99a90711eece4ffb8 Mon Sep 17 00:00:00 2001
From: Aditi Khare <aditi.khare@mongodb.com>
Date: Thu, 25 Apr 2024 16:41:57 -0400
Subject: [PATCH 5/8] fix require polyfill logic: require TextDecoder, not TextEncoder

---
 etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
index abff43b84..9a6d3930b 100644
--- a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
+++ b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
@@ -5,7 +5,7 @@ const REQUIRE_WEB_UTILS_POLYFILLS =
 const { encode: btoa, decode: atob } = require('../vendor/base64');\n`
 
 const REQUIRE_VALIDATE_UTF8_POLYFILLS = 
-  `const { TextEncoder } = require('../vendor/text-encoding');`;
+  `const { TextDecoder } = require('../vendor/text-encoding');`;
 
 export class RequireVendor {
   /**

From 92c7770ecadb0b629e2c179be8eabe43fd739ea5 Mon Sep 17 00:00:00 2001
From: Aditi Khare <aditi.khare@mongodb.com>
Date: Thu, 25 Apr 2024 18:39:55 -0400
Subject: [PATCH 6/8] requested changes: rename validateUtf8 to parseUtf8, use subarray

---
 .../require_vendor.mjs                        |   8 +-
 src/{validate_utf8.ts => parse_utf8.ts}       |  23 +---
 src/parser/deserializer.ts                    |   7 +-
 src/utils/node_byte_utils.ts                  |   7 +-
 src/utils/web_byte_utils.ts                   |   4 +-
 test/node/byte_utils.test.ts                  | 101 +++++++++++-------
 test/node/release.test.ts                     |   2 +-
 7 files changed, 75 insertions(+), 77 deletions(-)
 rename src/{validate_utf8.ts => parse_utf8.ts} (66%)

diff --git a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
index 9a6d3930b..4819023dd 100644
--- a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
+++ b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
@@ -4,8 +4,8 @@ const REQUIRE_WEB_UTILS_POLYFILLS =
   `const { TextEncoder } = require('../vendor/text-encoding');
 const { encode: btoa, decode: atob } = require('../vendor/base64');\n`
 
-const REQUIRE_VALIDATE_UTF8_POLYFILLS = 
-  `const { TextDecoder } = require('../vendor/text-encoding');`;
+const REQUIRE_PARSE_UTF8_POLYFILLS = 
+  `const { TextDecoder } = require('../vendor/text-encoding');\n`;
 
 export class RequireVendor {
   /**
@@ -17,10 +17,10 @@ export class RequireVendor {
    * @returns {{ code: string; map: import('magic-string').SourceMap }}
    */
   transform(code, id) {
-    if (id.includes('validate_utf8')) {
+    if (id.includes('parse_utf8')) {
       // MagicString lets us edit the source code and still generate an accurate source map
       const magicString = new MagicString(code);
-      magicString.prepend(REQUIRE_VALIDATE_UTF8_POLYFILLS);
+      magicString.prepend(REQUIRE_PARSE_UTF8_POLYFILLS);
 
       return {
         code: magicString.toString(),
diff --git a/src/validate_utf8.ts b/src/parse_utf8.ts
similarity index 66%
rename from src/validate_utf8.ts
rename to src/parse_utf8.ts
index dc9dfe353..0e12793bf 100644
--- a/src/validate_utf8.ts
+++ b/src/parse_utf8.ts
@@ -10,18 +10,8 @@ type TextDecoderConstructor = {
   new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder;
 };
 
-type TextEncoder = {
-  readonly encoding: string;
-  encode(input?: string): Uint8Array;
-};
-type TextEncoderConstructor = {
-  new (): TextEncoder;
-};
-
-// validate utf8 globals
+// parse utf8 globals
 declare const TextDecoder: TextDecoderConstructor;
-declare const TextEncoder: TextEncoderConstructor;
-
 let TextDecoderFatal: TextDecoder;
 let TextDecoderNonFatal: TextDecoder;
 
@@ -31,20 +21,15 @@ let TextDecoderNonFatal: TextDecoder;
  * @param start - The index to start validating
  * @param end - The index to end validating
  */
-export function validateUtf8(
-  buffer: Uint8Array,
-  start: number,
-  end: number,
-  fatal: boolean
-): string {
+export function parseUtf8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string {
   if (fatal) {
     try {
       TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true });
-      return TextDecoderFatal.decode(buffer.slice(start, end));
+      return TextDecoderFatal.decode(buffer.subarray(start, end));
     } catch (cause) {
       throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
     }
   }
   TextDecoderNonFatal ??= new TextDecoder('utf8', { fatal: false });
-  return TextDecoderNonFatal.decode(buffer.slice(start, end));
+  return TextDecoderNonFatal.decode(buffer.subarray(start, end));
 }
diff --git a/src/parser/deserializer.ts b/src/parser/deserializer.ts
index a01a167d0..165a529cf 100644
--- a/src/parser/deserializer.ts
+++ b/src/parser/deserializer.ts
@@ -603,12 +603,7 @@ function deserializeObject(
       )
         throw new BSONError('bad string length in bson');
       // Namespace
-      const namespace = ByteUtils.toUTF8(
-        buffer,
-        index,
-        index + stringSize - 1,
-        validation != null && (validation.utf8 as boolean)
-      );
+      const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey);
       // Update parse index position
       index = index + stringSize;
 
diff --git a/src/utils/node_byte_utils.ts b/src/utils/node_byte_utils.ts
index d9487046a..ca1482ca0 100644
--- a/src/utils/node_byte_utils.ts
+++ b/src/utils/node_byte_utils.ts
@@ -1,5 +1,5 @@
 import { BSONError } from '../error';
-import { validateUtf8 } from '../validate_utf8';
+import { parseUtf8 } from '../parse_utf8';
 import { tryReadBasicLatin, tryWriteBasicLatin } from './latin';
 
 type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary';
@@ -136,12 +136,9 @@ export const nodeJsByteUtils = {
 
     const string = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);
     if (fatal) {
-      // TODO(NODE-4930): Insufficiently strict BSON UTF8 validation
       for (let i = 0; i < string.length; i++) {
         if (string.charCodeAt(i) === 0xfffd) {
-          if (!validateUtf8(buffer, start, end, fatal)) {
-            throw new BSONError('Invalid UTF-8 string in BSON document');
-          }
+          parseUtf8(buffer, start, end, true);
           break;
         }
       }
diff --git a/src/utils/web_byte_utils.ts b/src/utils/web_byte_utils.ts
index f2d1b09b1..0f79f0df3 100644
--- a/src/utils/web_byte_utils.ts
+++ b/src/utils/web_byte_utils.ts
@@ -1,6 +1,6 @@
 import { BSONError } from '../error';
 import { tryReadBasicLatin } from './latin';
-import { validateUtf8 } from '../validate_utf8';
+import { parseUtf8 } from '../parse_utf8';
 
 type TextDecoder = {
   readonly encoding: string;
@@ -180,7 +180,7 @@ export const webByteUtils = {
       return basicLatin;
     }
 
-    return validateUtf8(uint8array, start, end, fatal);
+    return parseUtf8(uint8array, start, end, fatal);
   },
 
   utf8ByteLength(input: string): number {
diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts
index a41b19d3a..7141cfd93 100644
--- a/test/node/byte_utils.test.ts
+++ b/test/node/byte_utils.test.ts
@@ -531,49 +531,70 @@ const randomBytesTests: ByteUtilTest<'randomBytes'>[] = [
   }
 ];
 
-// extra error cases copied from Web platform specs
-const toUTF8ErrorCaseTests = [
-  { input: [0xff], name: 'invalid code' },
-  { input: [0xc0], name: 'ends early' },
-  { input: [0xe0], name: 'ends early 2' },
-  { input: [0xc0, 0x00], name: 'invalid trail' },
-  { input: [0xc0, 0xc0], name: 'invalid trail 2' },
-  { input: [0xe0, 0x00], name: 'invalid trail 3' },
-  { input: [0xe0, 0xc0], name: 'invalid trail 4' },
-  { input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' },
-  { input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' },
-  { input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' },
-  { input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },
+// extra error cases copied from wpt/encoding/textdecoder-fatal.any.js
+// commit sha: 7c9f867
+const toUTF8WebPlatformSpecTests = [
+  { encoding: 'utf-8', input: [0xff], name: 'invalid code' },
+  { encoding: 'utf-8', input: [0xc0], name: 'ends early' },
+  { encoding: 'utf-8', input: [0xe0], name: 'ends early 2' },
+  { encoding: 'utf-8', input: [0xc0, 0x00], name: 'invalid trail' },
+  { encoding: 'utf-8', input: [0xc0, 0xc0], name: 'invalid trail 2' },
+  { encoding: 'utf-8', input: [0xe0, 0x00], name: 'invalid trail 3' },
+  { encoding: 'utf-8', input: [0xe0, 0xc0], name: 'invalid trail 4' },
+  { encoding: 'utf-8', input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' },
+  { encoding: 'utf-8', input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' },
+  { encoding: 'utf-8', input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' },
+  { encoding: 'utf-8', input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },
 
   // Overlong encodings
-  { input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' },
-  { input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
-  { input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
-  { input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
-  { input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 6 bytes' },
-
-  { input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' },
-  { input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' },
-  { input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' },
-  { input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' },
-  { input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 6 bytes' },
-
-  { input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' },
-  { input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' },
-  { input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' },
-  { input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 6 bytes' },
-
-  { input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' },
-  { input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' },
-  { input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 6 bytes' },
-
-  { input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' },
-  { input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 6 bytes' },
+  { encoding: 'utf-8', input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' },
+  { encoding: 'utf-8', input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
+  { encoding: 'utf-8', input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
+  { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
+  {
+    encoding: 'utf-8',
+    input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80],
+    name: 'overlong U+0000 - 6 bytes'
+  },
+
+  { encoding: 'utf-8', input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' },
+  { encoding: 'utf-8', input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' },
+  { encoding: 'utf-8', input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' },
+  { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' },
+  {
+    encoding: 'utf-8',
+    input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf],
+    name: 'overlong U+007f - 6 bytes'
+  },
+
+  { encoding: 'utf-8', input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' },
+  { encoding: 'utf-8', input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' },
+  { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' },
+  {
+    encoding: 'utf-8',
+    input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf],
+    name: 'overlong U+07ff - 6 bytes'
+  },
+
+  { encoding: 'utf-8', input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' },
+  { encoding: 'utf-8', input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' },
+  {
+    encoding: 'utf-8',
+    input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf],
+    name: 'overlong U+ffff - 6 bytes'
+  },
+
+  { encoding: 'utf-8', input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' },
+  {
+    encoding: 'utf-8',
+    input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf],
+    name: 'overlong U+10ffff - 6 bytes'
+  },
 
   // UTf-16 surrogates encoded as code points in UTf-8
-  { input: [0xed, 0xa0, 0x80], name: 'lead surrogate' },
-  { input: [0xed, 0xb0, 0x80], name: 'trail surrogate' },
-  { input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' }
+  { encoding: 'utf-8', input: [0xed, 0xa0, 0x80], name: 'lead surrogate' },
+  { encoding: 'utf-8', input: [0xed, 0xb0, 0x80], name: 'trail surrogate' },
+  { encoding: 'utf-8', input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' }
 ];
 
 const utils = new Map([
@@ -882,7 +903,7 @@ describe('ByteUtils', () => {
           });
         }
         if (utility === 'toUTF8')
-          for (const test of toUTF8ErrorCaseTests) {
+          for (const test of toUTF8WebPlatformSpecTests) {
             it(`throws error when fatal is set and provided ${test.name} as input`, () => {
               expect(() =>
                 byteUtils[utility](Uint8Array.from(test.input), 0, test.input.length, true)
diff --git a/test/node/release.test.ts b/test/node/release.test.ts
index da69230df..756305b38 100644
--- a/test/node/release.test.ts
+++ b/test/node/release.test.ts
@@ -50,7 +50,7 @@ const REQUIRED_FILES = [
   'src/utils/number_utils.ts',
   'src/utils/web_byte_utils.ts',
   'src/utils/latin.ts',
-  'src/validate_utf8.ts',
+  'src/parse_utf8.ts',
   'vendor/base64/base64.js',
   'vendor/base64/package.json',
   'vendor/base64/LICENSE-MIT.txt',

From 4cb2c9b2a775ea868c0c9078668951cfe6cc9db6 Mon Sep 17 00:00:00 2001
From: Aditi Khare <aditi.khare@mongodb.com>
Date: Mon, 29 Apr 2024 16:52:55 -0400
Subject: [PATCH 7/8] requested changes: create fatal decoder outside try, link WPT commit

---
 src/parse_utf8.ts            | 2 +-
 test/node/byte_utils.test.ts | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/parse_utf8.ts b/src/parse_utf8.ts
index 0e12793bf..045a9080b 100644
--- a/src/parse_utf8.ts
+++ b/src/parse_utf8.ts
@@ -23,8 +23,8 @@ let TextDecoderNonFatal: TextDecoder;
  */
 export function parseUtf8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string {
   if (fatal) {
+    TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true });
     try {
-      TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true });
       return TextDecoderFatal.decode(buffer.subarray(start, end));
     } catch (cause) {
       throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts
index 7141cfd93..aff6b9a6d 100644
--- a/test/node/byte_utils.test.ts
+++ b/test/node/byte_utils.test.ts
@@ -533,6 +533,7 @@ const randomBytesTests: ByteUtilTest<'randomBytes'>[] = [
 
 // extra error cases copied from wpt/encoding/textdecoder-fatal.any.js
 // commit sha: 7c9f867
+// link: https://github.com/web-platform-tests/wpt/commit/7c9f8674d9809731e8919073d957d6233f6e0544
 const toUTF8WebPlatformSpecTests = [
   { encoding: 'utf-8', input: [0xff], name: 'invalid code' },
   { encoding: 'utf-8', input: [0xc0], name: 'ends early' },

From 521dd768891296e7cc001b60d9e766d2771068c6 Mon Sep 17 00:00:00 2001
From: Aditi Khare <aditi.khare@mongodb.com>
Date: Tue, 30 Apr 2024 14:36:18 -0400
Subject: [PATCH 8/8] updated tests and added to deserialize

---
 test/node/byte_utils.test.ts           | 125 ++-----------------------
 test/node/data/utf8_wpt_error_cases.ts |  67 +++++++++++++
 test/node/parser/deserializer.test.ts  |  46 ++++++++-
 3 files changed, 122 insertions(+), 116 deletions(-)
 create mode 100644 test/node/data/utf8_wpt_error_cases.ts

diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts
index aff6b9a6d..67a4721fe 100644
--- a/test/node/byte_utils.test.ts
+++ b/test/node/byte_utils.test.ts
@@ -8,7 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils';
 import * as sinon from 'sinon';
 import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson';
 import * as crypto from 'node:crypto';
-import { BSONError } from '../../src/error';
+import { utf8WebPlatformSpecTests } from './data/utf8_wpt_error_cases';
 
 type ByteUtilTest<K extends keyof ByteUtils> = {
   name: string;
@@ -426,48 +426,18 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
       expect(output).to.equal('abc\uFFFD');
     }
   },
-  {
-    name: 'should throw an error if fatal is set and string is a sequence that decodes to an invalid code point',
-    inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
-    expectation({ error }) {
-      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
-    }
-  },
-  {
-    name: 'throw an error if fatal is set and string contains overlong encoding',
-    inputs: [Buffer.from('11000000025f0005000000f08282ac0000', 'hex'), 0, 18, true],
-    expectation({ error }) {
-      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
-    }
-  },
-  {
-    name: 'throw an error if fatal is set and string contains invalid bytes',
-    inputs: [Buffer.from('abcff', 'hex'), 0, 2, true],
-    expectation({ error }) {
-      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
-    }
-  },
-  {
-    name: 'throw an error if fatal is set and string contains an unexpected continuation byte',
-    inputs: [Buffer.from('7F80', 'hex'), 0, 2, true],
-    expectation({ error }) {
-      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
-    }
-  },
-  {
-    name: 'throw an error if fatal is set and string contains a non-continuation byte before the end of the character',
-    inputs: [Buffer.from('c000', 'hex'), 0, 2, true],
-    expectation({ error }) {
-      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
-    }
-  },
-  {
-    name: 'throw an error if fatal is set and string ends before the end of the character',
-    inputs: [Buffer.from('c0', 'hex'), 0, 1, true],
+  ...utf8WebPlatformSpecTests.map(t => ({
+    name: t.name,
+    inputs: [Uint8Array.from(t.input), 0, t.input.length, true] as [
+      buffer: Uint8Array,
+      start: number,
+      end: number,
+      fatal: boolean
+    ],
     expectation({ error }) {
       expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
     }
-  }
+  }))
 ];
 
 const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
@@ -531,73 +501,6 @@ const randomBytesTests: ByteUtilTest<'randomBytes'>[] = [
   }
 ];
 
-// extra error cases copied from wpt/encoding/textdecoder-fatal.any.js
-// commit sha: 7c9f867
-// link: https://github.com/web-platform-tests/wpt/commit/7c9f8674d9809731e8919073d957d6233f6e0544
-const toUTF8WebPlatformSpecTests = [
-  { encoding: 'utf-8', input: [0xff], name: 'invalid code' },
-  { encoding: 'utf-8', input: [0xc0], name: 'ends early' },
-  { encoding: 'utf-8', input: [0xe0], name: 'ends early 2' },
-  { encoding: 'utf-8', input: [0xc0, 0x00], name: 'invalid trail' },
-  { encoding: 'utf-8', input: [0xc0, 0xc0], name: 'invalid trail 2' },
-  { encoding: 'utf-8', input: [0xe0, 0x00], name: 'invalid trail 3' },
-  { encoding: 'utf-8', input: [0xe0, 0xc0], name: 'invalid trail 4' },
-  { encoding: 'utf-8', input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' },
-  { encoding: 'utf-8', input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' },
-  { encoding: 'utf-8', input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' },
-  { encoding: 'utf-8', input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },
-
-  // Overlong encodings
-  { encoding: 'utf-8', input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' },
-  { encoding: 'utf-8', input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
-  { encoding: 'utf-8', input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
-  { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
-  {
-    encoding: 'utf-8',
-    input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80],
-    name: 'overlong U+0000 - 6 bytes'
-  },
-
-  { encoding: 'utf-8', input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' },
-  { encoding: 'utf-8', input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' },
-  { encoding: 'utf-8', input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' },
-  { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' },
-  {
-    encoding: 'utf-8',
-    input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf],
-    name: 'overlong U+007f - 6 bytes'
-  },
-
-  { encoding: 'utf-8', input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' },
-  { encoding: 'utf-8', input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' },
-  { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' },
-  {
-    encoding: 'utf-8',
-    input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf],
-    name: 'overlong U+07ff - 6 bytes'
-  },
-
-  { encoding: 'utf-8', input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' },
-  { encoding: 'utf-8', input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' },
-  {
-    encoding: 'utf-8',
-    input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf],
-    name: 'overlong U+ffff - 6 bytes'
-  },
-
-  { encoding: 'utf-8', input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' },
-  {
-    encoding: 'utf-8',
-    input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf],
-    name: 'overlong U+10ffff - 6 bytes'
-  },
-
-  // UTf-16 surrogates encoded as code points in UTf-8
-  { encoding: 'utf-8', input: [0xed, 0xa0, 0x80], name: 'lead surrogate' },
-  { encoding: 'utf-8', input: [0xed, 0xb0, 0x80], name: 'trail surrogate' },
-  { encoding: 'utf-8', input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' }
-];
-
 const utils = new Map([
   ['nodeJsByteUtils', nodeJsByteUtils],
   ['webByteUtils', webByteUtils]
@@ -903,14 +806,6 @@ describe('ByteUtils', () => {
             test.expectation({ web: byteUtilsName === 'webByteUtils', output, error });
           });
         }
-        if (utility === 'toUTF8')
-          for (const test of toUTF8WebPlatformSpecTests) {
-            it(`throws error when fatal is set and provided ${test.name} as input`, () => {
-              expect(() =>
-                byteUtils[utility](Uint8Array.from(test.input), 0, test.input.length, true)
-              ).to.throw(BSONError, /Invalid UTF-8 string in BSON document/i);
-            });
-          }
       });
     }
   }
diff --git a/test/node/data/utf8_wpt_error_cases.ts b/test/node/data/utf8_wpt_error_cases.ts
new file mode 100644
index 000000000..6d3a98135
--- /dev/null
+++ b/test/node/data/utf8_wpt_error_cases.ts
@@ -0,0 +1,67 @@
+// extra error cases copied from wpt/encoding/textdecoder-fatal.any.js
+// commit sha: 7c9f867
+// link: https://github.com/web-platform-tests/wpt/commit/7c9f8674d9809731e8919073d957d6233f6e0544
+
+export const utf8WebPlatformSpecTests = [
+  { encoding: 'utf-8', input: [0xff], name: 'invalid code' },
+  { encoding: 'utf-8', input: [0xc0], name: 'ends early' },
+  { encoding: 'utf-8', input: [0xe0], name: 'ends early 2' },
+  { encoding: 'utf-8', input: [0xc0, 0x00], name: 'invalid trail' },
+  { encoding: 'utf-8', input: [0xc0, 0xc0], name: 'invalid trail 2' },
+  { encoding: 'utf-8', input: [0xe0, 0x00], name: 'invalid trail 3' },
+  { encoding: 'utf-8', input: [0xe0, 0xc0], name: 'invalid trail 4' },
+  { encoding: 'utf-8', input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' },
+  { encoding: 'utf-8', input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' },
+  { encoding: 'utf-8', input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' },
+  { encoding: 'utf-8', input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },
+
+  // Overlong encodings
+  { encoding: 'utf-8', input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' },
+  { encoding: 'utf-8', input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
+  { encoding: 'utf-8', input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
+  { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
+  {
+    encoding: 'utf-8',
+    input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80],
+    name: 'overlong U+0000 - 6 bytes'
+  },
+
+  { encoding: 'utf-8', input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' },
+  { encoding: 'utf-8', input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' },
+  { encoding: 'utf-8', input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' },
+  { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' },
+  {
+    encoding: 'utf-8',
+    input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf],
+    name: 'overlong U+007f - 6 bytes'
+  },
+
+  { encoding: 'utf-8', input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' },
+  { encoding: 'utf-8', input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' },
+  { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' },
+  {
+    encoding: 'utf-8',
+    input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf],
+    name: 'overlong U+07ff - 6 bytes'
+  },
+
+  { encoding: 'utf-8', input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' },
+  { encoding: 'utf-8', input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' },
+  {
+    encoding: 'utf-8',
+    input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf],
+    name: 'overlong U+ffff - 6 bytes'
+  },
+
+  { encoding: 'utf-8', input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' },
+  {
+    encoding: 'utf-8',
+    input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf],
+    name: 'overlong U+10ffff - 6 bytes'
+  },
+
+  // UTF-16 surrogates encoded as code points in UTF-8
+  { encoding: 'utf-8', input: [0xed, 0xa0, 0x80], name: 'lead surrogate' },
+  { encoding: 'utf-8', input: [0xed, 0xb0, 0x80], name: 'trail surrogate' },
+  { encoding: 'utf-8', input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' }
+];
diff --git a/test/node/parser/deserializer.test.ts b/test/node/parser/deserializer.test.ts
index 005ccefa4..30c684be5 100644
--- a/test/node/parser/deserializer.test.ts
+++ b/test/node/parser/deserializer.test.ts
@@ -1,6 +1,7 @@
 import * as BSON from '../../register-bson';
 import { expect } from 'chai';
-import { bufferFromHexArray } from '../tools/utils';
+import { bufferFromHexArray, int32LEToHex } from '../tools/utils';
+import { utf8WebPlatformSpecTests } from '../data/utf8_wpt_error_cases';
 
 describe('deserializer()', () => {
   describe('when the fieldsAsRaw options is present and has a value that corresponds to a key in the object', () => {
@@ -58,4 +59,47 @@ describe('deserializer()', () => {
       expect(resultCodeWithScope).to.have.deep.nested.property('a.scope', { b: true });
     });
   });
+
+  describe('utf8 validation', () => {
+    for (const test of utf8WebPlatformSpecTests) {
+      const inputStringSize = int32LEToHex(test.input.length + 1); // int32 size of string
+      const inputHexString = Buffer.from(test.input).toString('hex');
+      const buffer = bufferFromHexArray([
+        '02', // string
+        '6100', // 'a' key with null terminator
+        inputStringSize,
+        inputHexString,
+        '00'
+      ]);
+      context(`when utf8 validation is on and input is ${test.name}`, () => {
+        it(`throws error containing 'Invalid UTF-8'`, () => {
+          // global case
+          expect(() => BSON.deserialize(buffer, { validation: { utf8: true } })).to.throw(
+            BSON.BSONError,
+            /Invalid UTF-8 string in BSON document/i
+          );
+
+          // specific case
+          expect(() => BSON.deserialize(buffer, { validation: { utf8: { a: true } } })).to.throw(
+            BSON.BSONError,
+            /Invalid UTF-8 string in BSON document/i
+          );
+        });
+      });
+
+      context(`when utf8 validation is off and input is ${test.name}`, () => {
+        it('returns a string containing at least 1 replacement character', () => {
+          // global case
+          expect(BSON.deserialize(buffer, { validation: { utf8: false } }))
+            .to.have.property('a')
+            .that.includes('\uFFFD');
+
+          // specific case
+          expect(BSON.deserialize(buffer, { validation: { utf8: { a: false } } }))
+            .to.have.property('a')
+            .that.includes('\uFFFD');
+        });
+      });
+    }
+  });
 });