Parse CEA-708 Packets from Fragmented MP4 streams. (#2660)

This is an MP4 Parser which extracts CEA-708 packets from Fragmented MP4 streams. The Closed Caption Parser (shaka.media.ClosedCaptionParser) will own this MP4 Parser, and will initialize it and call it as shown. As data comes in, the parser will parse this data, and the caption packet data will then be returned in a callback (on708Data), as shown. Here, a theoretical decoder (future pull request, mentioned as a Todo comment) will decode and extract the parsed captions from these packets. Issue #2648
shaka-project · Jul 5, 2020 · 480d4a8 · 480d4a8
1 parent fb43b1f
commit 480d4a8
Show file tree

Hide file tree

Showing 11 changed files with 546 additions and 18 deletions.
diff --git a/build/types/cea b/build/types/cea
@@ -0,0 +1,5 @@
+# Inband closed caption support.
+
++../../lib/cea/mp4_cea_parser.js
++../../lib/cea/i_cea_parser.js
++../../lib/cea/sei_processor.js
diff --git a/build/types/core b/build/types/core
@@ -100,3 +100,5 @@
 
 +../../third_party/closure-uri/uri.js
 +../../third_party/closure-uri/utils.js
+
++@cea
diff --git a/lib/cea/i_cea_parser.js b/lib/cea/i_cea_parser.js
@@ -0,0 +1,56 @@
+/*! @license
+ * Shaka Player
+ * Copyright 2016 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+goog.provide('shaka.cea.ICeaParser');
+
+/**
+ * Interface for parsing inband closed caption data from MP4 streams.
+ *
+ * An implementation receives the init segment through init() and media
+ * segments through parse(); parse() hands back the caption packets it found.
+ * @interface
+ */
+shaka.cea.ICeaParser = class {
+  /**
+   * Initializes the parser with init segment data.
+   * @param {!BufferSource} initSegment init segment to parse.
+   */
+  init(initSegment) {}
+
+  /**
+   * Parses the stream and extracts closed captions packets.
+   * @param {!BufferSource} mediaSegment media segment to parse.
+   * @return {!Array<!shaka.cea.ICeaParser.CaptionPacket>} The caption packets
+   *   extracted from the segment, each paired with its presentation
+   *   timestamp.
+   */
+  parse(mediaSegment) {}
+};
+
+/**
+ * NALU type for Supplemental Enhancement Information (SEI).
+ * shaka.cea.Mp4CeaParser compares this against the low five bits of the
+ * first byte of each NAL unit.
+ * @const {number}
+ */
+shaka.cea.ICeaParser.NALU_TYPE_SEI = 0x06;
+
+/**
+ * Default timescale value for a track, in time units per second.
+ * shaka.cea.Mp4CeaParser falls back to this value when a fragment's track id
+ * has no timescale recorded from the init segment.
+ * @const {number}
+ */
+shaka.cea.ICeaParser.DEFAULT_TIMESCALE_VALUE = 90000;
+
+/**
+ * @typedef {{
+ *   packet: !Uint8Array,
+ *   pts: !number
+ * }}
+ *
+ * @description Parsed Caption Packet.
+ * @property {!Uint8Array} packet
+ * Caption packet. More specifically, it contains a "User data
+ * registered by Recommendation ITU-T T.35 SEI message", from section D.1.6
+ * and section D.2.6 of Rec. ITU-T H.264 (06/2019).
+ * @property {!number} pts
+ * The presentation timestamp (pts) at which the ITU-T T.35 data shows up,
+ * in seconds.
+ * @exportDoc
+ */
+shaka.cea.ICeaParser.CaptionPacket;
+
diff --git a/lib/cea/mp4_cea_parser.js b/lib/cea/mp4_cea_parser.js
@@ -0,0 +1,258 @@
+/*! @license
+ * Shaka Player
+ * Copyright 2016 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+goog.provide('shaka.cea.Mp4CeaParser');
+
+goog.require('goog.asserts');
+goog.require('shaka.cea.ICeaParser');
+goog.require('shaka.cea.SeiProcessor');
+goog.require('shaka.util.DataViewReader');
+goog.require('shaka.util.Error');
+goog.require('shaka.util.Mp4BoxParsers');
+goog.require('shaka.util.Mp4Parser');
+
+/**
+ * MPEG4 stream parser used for extracting 708 closed captions data.
+ * @implements {shaka.cea.ICeaParser}
+ */
+shaka.cea.Mp4CeaParser = class {
+  constructor() {
+    /**
+     * SEI data processor.
+     * @private
+     * @const {!shaka.cea.SeiProcessor}
+     */
+    this.seiProcessor_ = new shaka.cea.SeiProcessor();
+
+    /**
+     * Map of track id to corresponding timescale.
+     * @private {!Map<number, number>}
+     */
+    this.trackIdToTimescale_ = new Map();
+
+    /**
+     * Default sample duration, as specified by the TREX box.
+     * @private {!number}
+     */
+    this.defaultSampleDuration_ = 0;
+
+    /**
+     * Default sample size, as specified by the TREX box.
+     * @private {!number}
+     */
+    this.defaultSampleSize_ = 0;
+  }
+
+  /**
+   * Parses the init segment. Gets Default Sample Duration and Size from the
+   * TREX box, and constructs a map of Track IDs to timescales. Each TRAK box
+   * contains a track header (TKHD) containing track ID, and a media header box
+   * (MDHD) containing the timescale for the track
+   * @override
+   */
+  init(initSegment) {
+    const Mp4Parser = shaka.util.Mp4Parser;
+    const trackIds = [];
+    const timescales = [];
+
+    new Mp4Parser()
+        .box('moov', Mp4Parser.children)
+        .box('mvex', Mp4Parser.children)
+        .fullBox('trex', (box) => {
+          const parsedTREXBox = shaka.util.Mp4BoxParsers.parseTREX(
+              box.reader);
+
+          this.defaultSampleDuration_ = parsedTREXBox.defaultSampleDuration;
+          this.defaultSampleSize_ = parsedTREXBox.defaultSampleSize;
+        })
+        .box('trak', Mp4Parser.children)
+        .fullBox('tkhd', (box) => {
+          goog.asserts.assert(
+              box.version != null,
+              'TKHD is a full box and should have a valid version.');
+          const parsedTKHDBox = shaka.util.Mp4BoxParsers.parseTKHD(
+              box.reader, box.version);
+          trackIds.push(parsedTKHDBox.trackId);
+        })
+        .box('mdia', Mp4Parser.children)
+        .fullBox('mdhd', (box) => {
+          goog.asserts.assert(
+              box.version != null,
+              'MDHD is a full box and should have a valid version.');
+          const parsedMDHDBox = shaka.util.Mp4BoxParsers.parseMDHD(
+              box.reader, box.version);
+          timescales.push(parsedMDHDBox.timescale);
+        })
+        .parse(initSegment, /* partialOkay= */ true);
+
+    // At least one track should exist, and each track should have a
+    // corresponding Id in TKHD box, and timescale in its MDHD box
+    if (!trackIds.length|| !timescales.length ||
+      trackIds.length != timescales.length) {
+      throw new shaka.util.Error(
+          shaka.util.Error.Severity.CRITICAL,
+          shaka.util.Error.Category.TEXT,
+          shaka.util.Error.Code.INVALID_MP4_CEA);
+    }
+
+    // Populate the map from track Id to timescale
+    trackIds.forEach((trackId, idx) => {
+      this.trackIdToTimescale_.set(trackId, timescales[idx]);
+    });
+  }
+
+  /**
+   * Parses each video segment. In fragmented MP4s, MOOF and MDAT come in
+   * pairs. The following logic gets the necessary info from MOOFs to parse
+   * MDATs (base media decode time, sample sizes/offsets/durations, etc),
+   * and then parses the MDAT boxes for CEA-708 packets using this information.
+   * CEA-708 packets are returned in the callback.
+   * @override
+   */
+  parse(mediaSegment) {
+    const Mp4Parser = shaka.util.Mp4Parser;
+
+    /** @type {!Array<!shaka.cea.ICeaParser.CaptionPacket>} **/
+    const captionPackets = [];
+
+    // Fields that are found in MOOF boxes
+    let defaultSampleDuration = this.defaultSampleDuration_;
+    let defaultSampleSize = this.defaultSampleSize_;
+    let sampleData = [];
+    let baseMediaDecodeTime = null;
+    let timescale = shaka.cea.ICeaParser.DEFAULT_TIMESCALE_VALUE;
+
+    new Mp4Parser()
+        .box('moof', Mp4Parser.children)
+        .box('traf', Mp4Parser.children)
+        .fullBox('trun', (box) => {
+          goog.asserts.assert(
+              box.version != null && box.flags!=null,
+              'TRUN is a full box and should have a valid version & flags.');
+
+          const parsedTRUN = shaka.util.Mp4BoxParsers.parseTRUN(
+              box.reader, box.version, box.flags);
+
+          sampleData = parsedTRUN.sampleData;
+        })
+
+        .fullBox('tfhd', (box) => {
+          goog.asserts.assert(
+              box.flags != null,
+              'TFHD is a full box and should have valid flags.');
+
+          const parsedTFHD = shaka.util.Mp4BoxParsers.parseTFHD(
+              box.reader, box.flags);
+
+          // If specified, defaultSampleDuration and defaultSampleSize
+          // override the ones specified in the TREX box
+          defaultSampleDuration = parsedTFHD.defaultSampleDuration
+            || this.defaultSampleDuration_;
+
+          defaultSampleSize = parsedTFHD.defaultSampleSize
+            || this.defaultSampleSize_;
+
+          const trackId = parsedTFHD.trackId;
+
+          // Get the timescale from the track Id
+          if (this.trackIdToTimescale_.has(trackId)) {
+            timescale = this.trackIdToTimescale_.get(trackId);
+          }
+        })
+
+        .fullBox('tfdt', (box) => {
+          goog.asserts.assert(
+              box.version != null,
+              'TFDT is a full box and should have a valid version.');
+
+          const parsedTFDT = shaka.util.Mp4BoxParsers.parseTFDT(
+              box.reader, box.version);
+
+          baseMediaDecodeTime = parsedTFDT.baseMediaDecodeTime;
+        })
+        .box('mdat', (box) => {
+          if (baseMediaDecodeTime === null) {
+            // This field should have been populated by
+            // the Base Media Decode time in the TFDT box
+            throw new shaka.util.Error(
+                shaka.util.Error.Severity.CRITICAL,
+                shaka.util.Error.Category.TEXT,
+                shaka.util.Error.Code.INVALID_MP4_CEA);
+          }
+          this.parseMdat_(box.reader, baseMediaDecodeTime, timescale,
+              defaultSampleDuration, defaultSampleSize, sampleData,
+              captionPackets);
+        })
+        .parse(mediaSegment, /* partialOkay= */ false);
+
+    return captionPackets;
+  }
+
+  /**
+   * Parse MDAT box.
+   * @param {!shaka.util.DataViewReader} reader
+   * @param {!number} time
+   * @param {!number} timescale
+   * @param {!number} defaultSampleDuration
+   * @param {!number} defaultSampleSize
+   * @param {!Array<shaka.util.ParsedTRUNSample>} sampleData
+   * @param {!Array<!shaka.cea.ICeaParser.CaptionPacket>} captionPackets
+   * @private
+   */
+  parseMdat_(reader, time, timescale, defaultSampleDuration,
+      defaultSampleSize, sampleData, captionPackets) {
+    let sampleIndex = 0;
+
+    // The fields in each ParsedTRUNSample contained in the sampleData
+    // array are nullable. In the case of sample data and sample duration,
+    // we use the defaults provided by the TREX/TFHD boxes. For sample
+    // composition time offset, we default to 0.
+    let sampleSize = defaultSampleSize;
+
+    if (sampleData.length) {
+      sampleSize = sampleData[0].sampleSize || defaultSampleSize;
+    }
+
+    while (reader.hasMoreData()) {
+      const naluSize = reader.readUint32();
+      const naluType = reader.readUint8() & 0x1F;
+      if (naluType == shaka.cea.ICeaParser.NALU_TYPE_SEI) {
+        let timeOffset = 0;
+
+        if (sampleData.length > sampleIndex) {
+          timeOffset = sampleData[sampleIndex].sampleCompositionTimeOffset || 0;
+        }
+
+        const pts = (time + timeOffset)/timescale;
+        for (const packet of this.seiProcessor_
+            .process(reader.readBytes(naluSize - 1))) {
+          captionPackets.push({
+            packet,
+            pts,
+          });
+        }
+      } else {
+        reader.skip(naluSize - 1);
+      }
+      sampleSize -= (naluSize + 4);
+      if (sampleSize == 0) {
+        if (sampleData.length > sampleIndex) {
+          time += sampleData[sampleIndex].sampleDuration ||
+              defaultSampleDuration;
+        } else {
+          time += defaultSampleDuration;
+        }
+
+        sampleIndex++;
+
+        if (sampleData.length > sampleIndex) {
+          sampleSize = sampleData[sampleIndex].sampleSize || defaultSampleSize;
+        } else {
+          sampleSize = defaultSampleSize;
+        }
+      }
+    }
+  }
+};