shaka-project · joeyparrish · Aug 12, 2022 · Aug 11, 2022 · avelad · Aug 12, 2022
diff --git a/karma.conf.js b/karma.conf.js
@@ -160,10 +160,6 @@ module.exports = (config) => {
       'node_modules/es6-promise-polyfill/promise.js',
       //   Babel polyfill, required for async/await
       'node_modules/@babel/polyfill/dist/polyfill.js',
-      //   TextDecoder polyfill, required for TextDecoder/TextEncoder on IE and
-      //   legacy Edge
-      //   eslint-disable-next-line max-len
-      'node_modules/fastestsmallesttextencoderdecoder/EncoderDecoderTogether.min.js',
 
       // muxjs module next
       'node_modules/mux.js/dist/mux.min.js',

diff --git a/lib/player.js b/lib/player.js
@@ -857,14 +857,9 @@ shaka.Player = class extends shaka.util.FakeEventTarget {
     if (!window.Promise) {
       shaka.log.alwaysWarn('A Promise implementation or polyfill is required');
     }
-    if (!window.TextDecoder || !window.TextEncoder) {
-      shaka.log.alwaysWarn(
-          'A TextDecoder/TextEncoder implementation or polyfill is required');
-    }
 
     // Basic features needed for the library to be usable.
     const basicSupport = !!window.Promise && !!window.Uint8Array &&
-                         !!window.TextDecoder && !!window.TextEncoder &&
                          // eslint-disable-next-line no-restricted-syntax
                          !!Array.prototype.forEach;
     if (!basicSupport) {

diff --git a/lib/util/string_utils.js b/lib/util/string_utils.js
@@ -37,6 +37,7 @@ shaka.util.StringUtils = class {
     if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) {
       uint8 = uint8.subarray(3);
     }
+
     if (window.TextDecoder && !shaka.util.Platform.isPS4()) {
       // Use the TextDecoder interface to decode the text.  This has the
       // advantage compared to the previously-standard decodeUriComponent that
@@ -51,23 +52,69 @@ shaka.util.StringUtils = class {
       }
       return decoded;
     } else {
-      // http://stackoverflow.com/a/13691499
-      const utf8 = shaka.util.StringUtils.fromCharCode(uint8);
-      // This converts each character in the string to an escape sequence.  If
-      // the character is in the ASCII range, it is not converted; otherwise it
-      // is converted to a URI escape sequence.
-      // Example: '\x67\x35\xe3\x82\xac' -> 'g#%E3%82%AC'
-      const escaped = escape(utf8);
-      // Decode the escaped sequence.  This will interpret UTF-8 sequences into
-      // the correct character.
-      // Example: 'g#%E3%82%AC' -> 'g#€'
-      try {
-        return decodeURIComponent(escaped);
-      } catch (e) {
-        throw new shaka.util.Error(
-            shaka.util.Error.Severity.CRITICAL, shaka.util.Error.Category.TEXT,
-            shaka.util.Error.Code.BAD_ENCODING);
+      // Homebrewed UTF-8 decoder based on
+      // https://en.wikipedia.org/wiki/UTF-8#Encoding
+      // Unlike decodeURIComponent, won't throw on bad encoding.
+      // In this way, it is similar to TextDecoder.
+
+      let decoded = '';
+      for (let i = 0; i < uint8.length; ++i) {
+        // By default, the "replacement character" codepoint.
+        let codePoint = 0xFFFD;
+
+        // Top bit is 0, 1-byte encoding.
+        if ((uint8[i] & 0x80) == 0) {
+          codePoint = uint8[i];
+
+        // Top 3 bits of byte 0 are 110, top 2 bits of byte 1 are 10,
+        // 2-byte encoding.
+        } else if (uint8.length >= i + 2 &&
+            (uint8[i] & 0xe0) == 0xc0 &&
+            (uint8[i + 1] & 0xc0) == 0x80) {
+          codePoint = ((uint8[i] & 0x1f) << 6) |
+                      ((uint8[i + 1] & 0x3f));
+          i += 1;  // Consume one extra byte.
+
+        // Top 4 bits of byte 0 are 1110, top 2 bits of byte 1 and 2 are 10,
+        // 3-byte encoding.
+        } else if (uint8.length >= i + 3 &&
+            (uint8[i] & 0xf0) == 0xe0 &&
+            (uint8[i + 1] & 0xc0) == 0x80 &&
+            (uint8[i + 2] & 0xc0) == 0x80) {
+          codePoint = ((uint8[i] & 0x0f) << 12) |
+                      ((uint8[i + 1] & 0x3f) << 6) |
+                      ((uint8[i + 2] & 0x3f));
+          i += 2;  // Consume two extra bytes.
+
+        // Top 5 bits of byte 0 are 11110, top 2 bits of byte 1, 2 and 3 are 10,
+        // 4-byte encoding.
+        } else if (uint8.length >= i + 4 &&
+            (uint8[i] & 0xf1) == 0xf0 &&
+            (uint8[i + 1] & 0xc0) == 0x80 &&
+            (uint8[i + 2] & 0xc0) == 0x80 &&
+            (uint8[i + 3] & 0xc0) == 0x80) {
+          codePoint = ((uint8[i] & 0x07) << 18) |
+                      ((uint8[i + 1] & 0x3f) << 12) |
+                      ((uint8[i + 2] & 0x3f) << 6) |
+                      ((uint8[i + 3] & 0x3f));
+          i += 3;  // Consume three extra bytes.
+        }
+
+        // JavaScript strings are a series of UTF-16 characters.
+        if (codePoint <= 0xffff) {
+          decoded += String.fromCharCode(codePoint);
+        } else {
+          // UTF-16 surrogate-pair encoding, based on
+          // https://en.wikipedia.org/wiki/UTF-16#Description
+          const baseCodePoint = codePoint - 0x10000;
+          const highPart = baseCodePoint >> 10;
+          const lowPart = baseCodePoint & 0x3ff;
+          decoded += String.fromCharCode(0xd800 + highPart);
+          decoded += String.fromCharCode(0xdc00 + lowPart);
+        }
       }
+
+      return decoded;
     }
   }
 

diff --git a/test/util/string_utils_unit.js b/test/util/string_utils_unit.js
@@ -16,19 +16,34 @@ describe('StringUtils', () => {
   });
 
   it('won\'t break if given cut-off UTF8 character', () => {
-    // This array contains the first half of a 2-byte UTF8 character, stranded
-    // at the very end of the string.
-    const arr1 = [0x53, 0x61, 0x6e, 0x20, 0x4a, 0x6f, 0x73, 0x81];
+    const arr1 = [0x53, 0x61, 0x6e, 0x20, 0x4a, 0x6f, 0x73, 0xc3, 0xa9];
     expect(StringUtils.fromUTF8(new Uint8Array(arr1)))
+        .toBe('San Jos\u00E9');
+
+    // This array contains the first half of a 2-byte UTF8 character
+    // (0xc3 0xa9 = é).  The half-character is stranded at the very end of the
+    // string.
+    const arr = [0x53, 0x61, 0x6e, 0x20, 0x4a, 0x6f, 0x73, 0xc3];
+    expect(StringUtils.fromUTF8(new Uint8Array(arr)))
         .toBe('San Jos\uFFFD');
+  });
 
-    // For reasons I don't know, it seems like 0xE9 cannot be the start of a
-    // UTF8 character.  Perhaps it is a reserved number?
-    const arr2 = [0x4a, 0x6f, 0x73, 0xE9, 0x33, 0x33, 0x20, 0x53, 0x61, 0x6e];
-    expect(StringUtils.fromUTF8(new Uint8Array(arr2)))
+  it('won\'t break if given an invalid UTF-8 sequence', () => {
+    // 0xe9 0x33 0x33 is an invalid UTF-8 sequence.
+    const arr = [0x4a, 0x6f, 0x73, 0xE9, 0x33, 0x33, 0x20, 0x53, 0x61, 0x6e];
+    expect(StringUtils.fromUTF8(new Uint8Array(arr)))
         .toBe('Jos\uFFFD33 San');
   });
 
+  it('can handle an 8-byte character', () => {
+    // This is the UTF-8 encoding of the US flag emoji.
+    // It decodes into two Unicode codepoints, which becomes 4 JavaScript
+    // UTF-16 characters.
+    const arr = [0xf0, 0x9f, 0x87, 0xba, 0xf0, 0x9f, 0x87, 0xb8];
+    expect(StringUtils.fromUTF8(new Uint8Array(arr)))
+        .toBe('\uD83C\uDDFA\uD83C\uDDF8');
+  });
+
   it('strips the BOM in fromUTF8', () => {
     // This is 4 Unicode characters, the last will be split into a surrogate
     // pair.