fix: Fix TextDecoder fallback and browser support check (shaka-project#4403)

In PR shaka-project#4324, we lifted the requirement to have a native or polyfilled
TextDecoder implementation.  However, we forgot to remove the check
for it in isBrowserSupported().  This led to tests being skipped
entirely on Xbox, because Player determined Xbox to be an unsupported
platform.

To fix this, the check for TextDecoder/TextEncoder in
isBrowserSupported() has been removed.
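
For reference, isBrowserSupported() is the static check that applications
typically call before constructing a Player; after this change it passes on
platforms that lack TextDecoder/TextEncoder.  A minimal usage sketch (the
element id is only an example, not part of this change):

  // Install polyfills first, then gate on the support check.
  shaka.polyfill.installAll();

  if (shaka.Player.isBrowserSupported()) {
    const video = document.getElementById('video');
    const player = new shaka.Player(video);
    // ... configure the player and load a manifest here.
  } else {
    console.error('Shaka Player is not supported on this browser.');
  }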

When the TextDecoder polyfill was removed, we left a reference to it
in karma.conf.js.  This didn't hurt anything per se, but this has now
been cleaned up.

Finally, TextDecoder was originally introduced to give us a way to
recover from errors instead of throwing.  The fallback that was
reintroduced in shaka-project#4324 was the original code that throws on error.
This led to a test failure on Xbox; in real content with an encoding
issue, it would mean a complete subtitle failure.

To fix this, we replace the utf-8 decoding fallback based on
decodeURIComponent with a plain JS implementation.  This adds only 477
bytes to Shaka Player, which is pretty good compared to the 2315 byte
polyfill we used to recommend for this.
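
To illustrate the behavioral difference (a rough sketch; the bytes below are
only an example): the old escape()/decodeURIComponent() path throws a
URIError on a truncated multi-byte sequence, while TextDecoder in its default
non-fatal mode, and now the plain JS fallback as well, substitutes the U+FFFD
replacement character:

  // "José" with the second byte of the é (0xc3 0xa9) cut off.
  const bytes = new Uint8Array([0x4a, 0x6f, 0x73, 0xc3]);

  // Old fallback: escape() + decodeURIComponent() throws on the bad byte.
  try {
    decodeURIComponent(escape(String.fromCharCode(...bytes)));
  } catch (error) {
    console.log(error.name);  // 'URIError'
  }

  // TextDecoder (and now the plain JS fallback) recovers instead:
  console.log(new TextDecoder('utf-8').decode(bytes));  // 'Jos\uFFFD'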

To better verify these text decoding features, a test that checked two
things has been split into two, comments around the tests have been
improved, and an additional test case has been added.
joeyparrish authored Aug 12, 2022
1 parent 4293a14 commit 04fc0d4
Showing 4 changed files with 85 additions and 32 deletions.
4 changes: 0 additions & 4 deletions karma.conf.js
@@ -160,10 +160,6 @@ module.exports = (config) => {
'node_modules/es6-promise-polyfill/promise.js',
// Babel polyfill, required for async/await
'node_modules/@babel/polyfill/dist/polyfill.js',
// TextDecoder polyfill, required for TextDecoder/TextEncoder on IE and
// legacy Edge
// eslint-disable-next-line max-len
'node_modules/fastestsmallesttextencoderdecoder/EncoderDecoderTogether.min.js',

// muxjs module next
'node_modules/mux.js/dist/mux.min.js',
5 changes: 0 additions & 5 deletions lib/player.js
@@ -857,14 +857,9 @@ shaka.Player = class extends shaka.util.FakeEventTarget {
if (!window.Promise) {
shaka.log.alwaysWarn('A Promise implementation or polyfill is required');
}
if (!window.TextDecoder || !window.TextEncoder) {
shaka.log.alwaysWarn(
'A TextDecoder/TextEncoder implementation or polyfill is required');
}

// Basic features needed for the library to be usable.
const basicSupport = !!window.Promise && !!window.Uint8Array &&
!!window.TextDecoder && !!window.TextEncoder &&
// eslint-disable-next-line no-restricted-syntax
!!Array.prototype.forEach;
if (!basicSupport) {
79 changes: 63 additions & 16 deletions lib/util/string_utils.js
@@ -37,6 +37,7 @@ shaka.util.StringUtils = class {
if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) {
uint8 = uint8.subarray(3);
}

if (window.TextDecoder && !shaka.util.Platform.isPS4()) {
// Use the TextDecoder interface to decode the text. This has the
// advantage compared to the previously-standard decodeUriComponent that
@@ -51,23 +52,69 @@
}
return decoded;
} else {
// http://stackoverflow.com/a/13691499
const utf8 = shaka.util.StringUtils.fromCharCode(uint8);
// This converts each character in the string to an escape sequence. If
// the character is in the ASCII range, it is not converted; otherwise it
// is converted to a URI escape sequence.
// Example: '\x67\x35\xe3\x82\xac' -> 'g5%E3%82%AC'
const escaped = escape(utf8);
// Decode the escaped sequence. This will interpret UTF-8 sequences into
// the correct character.
// Example: 'g5%E3%82%AC' -> 'g5ガ'
try {
return decodeURIComponent(escaped);
} catch (e) {
throw new shaka.util.Error(
shaka.util.Error.Severity.CRITICAL, shaka.util.Error.Category.TEXT,
shaka.util.Error.Code.BAD_ENCODING);
// Homebrewed UTF-8 decoder based on
// https://en.wikipedia.org/wiki/UTF-8#Encoding
// Unlike decodeURIComponent, won't throw on bad encoding.
// In this way, it is similar to TextDecoder.

let decoded = '';
for (let i = 0; i < uint8.length; ++i) {
// By default, the "replacement character" codepoint.
let codePoint = 0xFFFD;

// Top bit is 0, 1-byte encoding.
if ((uint8[i] & 0x80) == 0) {
codePoint = uint8[i];

// Top 3 bits of byte 0 are 110, top 2 bits of byte 1 are 10,
// 2-byte encoding.
} else if (uint8.length >= i + 2 &&
(uint8[i] & 0xe0) == 0xc0 &&
(uint8[i + 1] & 0xc0) == 0x80) {
codePoint = ((uint8[i] & 0x1f) << 6) |
((uint8[i + 1] & 0x3f));
i += 1; // Consume one extra byte.

// Top 4 bits of byte 0 are 1110, top 2 bits of byte 1 and 2 are 10,
// 3-byte encoding.
} else if (uint8.length >= i + 3 &&
(uint8[i] & 0xf0) == 0xe0 &&
(uint8[i + 1] & 0xc0) == 0x80 &&
(uint8[i + 2] & 0xc0) == 0x80) {
codePoint = ((uint8[i] & 0x0f) << 12) |
((uint8[i + 1] & 0x3f) << 6) |
((uint8[i + 2] & 0x3f));
i += 2; // Consume two extra bytes.

// Top 5 bits of byte 0 are 11110, top 2 bits of byte 1, 2 and 3 are 10,
// 4-byte encoding.
} else if (uint8.length >= i + 4 &&
(uint8[i] & 0xf8) == 0xf0 &&
(uint8[i + 1] & 0xc0) == 0x80 &&
(uint8[i + 2] & 0xc0) == 0x80 &&
(uint8[i + 3] & 0xc0) == 0x80) {
codePoint = ((uint8[i] & 0x07) << 18) |
((uint8[i + 1] & 0x3f) << 12) |
((uint8[i + 2] & 0x3f) << 6) |
((uint8[i + 3] & 0x3f));
i += 3; // Consume three extra bytes.
}

// JavaScript strings are a series of UTF-16 characters.
if (codePoint <= 0xffff) {
decoded += String.fromCharCode(codePoint);
} else {
// UTF-16 surrogate-pair encoding, based on
// https://en.wikipedia.org/wiki/UTF-16#Description
const baseCodePoint = codePoint - 0x10000;
const highPart = baseCodePoint >> 10;
const lowPart = baseCodePoint & 0x3ff;
decoded += String.fromCharCode(0xd800 + highPart);
decoded += String.fromCharCode(0xdc00 + lowPart);
}
}

return decoded;
}
}

29 changes: 22 additions & 7 deletions test/util/string_utils_unit.js
@@ -16,19 +16,34 @@ describe('StringUtils', () => {
});

it('won\'t break if given cut-off UTF8 character', () => {
// This array contains the first half of a 2-byte UTF8 character, stranded
// at the very end of the string.
const arr1 = [0x53, 0x61, 0x6e, 0x20, 0x4a, 0x6f, 0x73, 0x81];
const arr1 = [0x53, 0x61, 0x6e, 0x20, 0x4a, 0x6f, 0x73, 0xc3, 0xa9];
expect(StringUtils.fromUTF8(new Uint8Array(arr1)))
.toBe('San Jos\u00E9');

// This array contains the first half of a 2-byte UTF8 character
// (0xc3 0xa9 = é). The half-character is stranded at the very end of the
// string.
const arr = [0x53, 0x61, 0x6e, 0x20, 0x4a, 0x6f, 0x73, 0xc3];
expect(StringUtils.fromUTF8(new Uint8Array(arr)))
.toBe('San Jos\uFFFD');
});

// For reasons I don't know, it seems like 0xE9 cannot be the start of a
// UTF8 character. Perhaps it is a reserved number?
const arr2 = [0x4a, 0x6f, 0x73, 0xE9, 0x33, 0x33, 0x20, 0x53, 0x61, 0x6e];
expect(StringUtils.fromUTF8(new Uint8Array(arr2)))
it('won\'t break if given an invalid UTF-8 sequence', () => {
// 0xe9 0x33 0x33 is an invalid UTF-8 sequence.
const arr = [0x4a, 0x6f, 0x73, 0xE9, 0x33, 0x33, 0x20, 0x53, 0x61, 0x6e];
expect(StringUtils.fromUTF8(new Uint8Array(arr)))
.toBe('Jos\uFFFD33 San');
});

it('can handle an 8-byte character', () => {
// This is the UTF-8 encoding of the US flag emoji.
// It decodes into two Unicode codepoints, which become 4 JavaScript
// UTF-16 characters.
const arr = [0xf0, 0x9f, 0x87, 0xba, 0xf0, 0x9f, 0x87, 0xb8];
expect(StringUtils.fromUTF8(new Uint8Array(arr)))
.toBe('\uD83C\uDDFA\uD83C\uDDF8');
});

it('strips the BOM in fromUTF8', () => {
// This is 4 Unicode characters, the last will be split into a surrogate
// pair.
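
As a sanity check on the flag-emoji test above, here is the surrogate-pair
arithmetic that the new fallback decoder applies to the two codepoints
U+1F1FA and U+1F1F8 (the regional indicator symbols U and S); a worked
example only, mirroring the math in string_utils.js:

  // U+1F1FA: 0x1F1FA - 0x10000 = 0xF1FA
  //   high surrogate: 0xD800 + (0xF1FA >> 10)   = 0xD800 + 0x3C  = 0xD83C
  //   low surrogate:  0xDC00 + (0xF1FA & 0x3FF) = 0xDC00 + 0x1FA = 0xDDFA
  // U+1F1F8: 0x1F1F8 - 0x10000 = 0xF1F8
  //   high surrogate: 0xD800 + (0xF1F8 >> 10)   = 0xD83C
  //   low surrogate:  0xDC00 + (0xF1F8 & 0x3FF) = 0xDDF8
  // So the expected string is '\uD83C\uDDFA\uD83C\uDDF8', as asserted above.
  console.log(String.fromCodePoint(0x1F1FA, 0x1F1F8) === '\uD83C\uDDFA\uD83C\uDDF8');  // true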
