Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Fix VTT cue timing in HLS #4217

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 91 additions & 37 deletions lib/media/media_source_engine.js
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,12 @@ shaka.media.MediaSourceEngine = class {

/** @private {string} */
this.url_ = '';

/** @private {boolean} */
this.sequenceMode_ = false;

/** @private {!shaka.util.PublicPromise.<number>} */
this.textSequenceModeOffset_ = new shaka.util.PublicPromise();
}

/**
Expand Down Expand Up @@ -331,6 +337,8 @@ shaka.media.MediaSourceEngine = class {

await this.mediaSourceOpen_;

this.sequenceMode_ = sequenceMode;

for (const contentType of streamsByType.keys()) {
const stream = streamsByType.get(contentType);
goog.asserts.assert(
Expand All @@ -348,11 +356,9 @@ shaka.media.MediaSourceEngine = class {
mimeType =
shaka.media.Transmuxer.convertTsCodecs(contentType, mimeType);
}

const sourceBuffer = this.mediaSource_.addSourceBuffer(mimeType);
if (sequenceMode) {
sourceBuffer.mode =
shaka.media.MediaSourceEngine.SourceBufferMode_.SEQUENCE;
}

this.eventManager_.listen(
sourceBuffer, 'error',
() => this.onError_(contentType));
Expand Down Expand Up @@ -515,35 +521,29 @@ shaka.media.MediaSourceEngine = class {
* @param {?boolean} hasClosedCaptions True if the buffer contains CEA closed
* captions
* @param {boolean=} seeked True if we just seeked
* @param {boolean=} sequenceMode True if sequence mode
* @return {!Promise}
*/
async appendBuffer(contentType, data, startTime, endTime, hasClosedCaptions,
seeked, sequenceMode) {
async appendBuffer(
contentType, data, startTime, endTime, hasClosedCaptions, seeked) {
const ContentType = shaka.util.ManifestParserUtils.ContentType;

if (startTime != null && sequenceMode && contentType != ContentType.TEXT) {
// If we just cleared buffer and is on an unbuffered seek, we need to set
// the new timestampOffset of the sourceBuffer.
// Don't do this for text streams, though, since they don't use
// MediaSource anyway.
if (seeked) {
const timestampOffset = /** @type {number} */ (startTime);
this.enqueueOperation_(
contentType,
() => this.setTimestampOffset_(contentType, timestampOffset));
if (contentType == ContentType.TEXT) {
if (this.sequenceMode_) {
// This won't be known until the first video segment is appended.
const offset = await this.textSequenceModeOffset_;
this.textEngine_.setTimestampOffset(offset);
}
await this.textEngine_.appendBuffer(data, startTime, endTime);
return;
}

if (contentType == ContentType.TEXT) {
await this.textEngine_.appendBuffer(data, startTime, endTime);
} else if (this.transmuxers_[contentType]) {
if (this.transmuxers_[contentType]) {
const transmuxedData =
await this.transmuxers_[contentType].transmux(data);
// For HLS CEA-608/708 CLOSED-CAPTIONS, text data is embedded in
// the video stream, so textEngine may not have been initialized.
if (!this.textEngine_) {
this.reinitText('text/vtt', sequenceMode || false);
this.reinitText('text/vtt', this.sequenceMode_);
}

if (transmuxedData.metadata) {
Expand All @@ -562,15 +562,10 @@ shaka.media.MediaSourceEngine = class {
closedCaptions, startTime, endTime, videoOffset);
}

let transmuxedSegment = transmuxedData.data;
transmuxedSegment = this.workAroundBrokenPlatforms_(
transmuxedSegment, startTime, contentType);

await this.enqueueOperation_(
contentType, () => this.append_(contentType, transmuxedSegment));
data = transmuxedData.data;
} else if (hasClosedCaptions) {
if (!this.textEngine_) {
this.reinitText('text/vtt', sequenceMode || false);
this.reinitText('text/vtt', this.sequenceMode_);
}
// If it is the init segment for closed captions, initialize the closed
// caption parser.
Expand All @@ -585,19 +580,78 @@ shaka.media.MediaSourceEngine = class {
closedCaptions, startTime, endTime, videoOffset);
}
}
}

data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
data = this.workAroundBrokenPlatforms_(data, startTime, contentType);

const sourceBuffer = this.sourceBuffers_[contentType];
const SEQUENCE = shaka.media.MediaSourceEngine.SourceBufferMode_.SEQUENCE;

if (this.sequenceMode_ && sourceBuffer.mode != SEQUENCE &&
startTime != null) {
// This is the first media segment to be appended to a SourceBuffer in
// sequence mode. We set the mode late so that we can trick MediaSource
// into extracting a timestamp for us to align text segments in sequence
// mode.

// Timestamps can only be reliably extracted from video, not audio.
// Packed audio formats do not have internal timestamps at all.
// Prefer video for this when available.
const isBestSourceBufferForTimestamps =
contentType == ContentType.VIDEO ||
!(ContentType.VIDEO in this.sourceBuffers_);
if (isBestSourceBufferForTimestamps) {
// Append the segment in segments mode first, with offset of 0 and an
// open append window.
const originalRange =
[sourceBuffer.appendWindowStart, sourceBuffer.appendWindowEnd];
sourceBuffer.appendWindowStart = 0;
sourceBuffer.appendWindowEnd = Infinity;

const originalOffset = sourceBuffer.timestampOffset;
sourceBuffer.timestampOffset = 0;

await this.enqueueOperation_(
contentType, () => this.append_(contentType, data));

// Reset the offset and append window.
sourceBuffer.timestampOffset = originalOffset;
sourceBuffer.appendWindowStart = originalRange[0];
sourceBuffer.appendWindowEnd = originalRange[1];

// Now get the timestamp of the segment and compute the offset for text
// segments.
const mediaStartTime = shaka.media.TimeRangesUtils.bufferStart(
this.getBuffered_(contentType));
const textOffset = (startTime || 0) - (mediaStartTime || 0);
this.textSequenceModeOffset_.resolve(textOffset);

// Finally, clear the buffer.
await this.enqueueOperation_(
contentType,
() => this.remove_(contentType, 0, this.mediaSource_.duration));
}

await this.enqueueOperation_(
contentType,
() => this.append_(contentType, data));
} else {
data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
// Now switch to sequence mode and fall through to our normal operations.
sourceBuffer.mode = SEQUENCE;
}

await this.enqueueOperation_(
contentType,
() => this.append_(contentType, data));
if (startTime != null && this.sequenceMode_ &&
contentType != ContentType.TEXT) {
// In sequence mode, for non-text streams, if we just cleared the buffer
// and are performing an unbuffered seek, we need to set a new
// timestampOffset on the sourceBuffer.
if (seeked) {
const timestampOffset = /** @type {number} */ (startTime);
this.enqueueOperation_(
contentType,
() => this.setTimestampOffset_(contentType, timestampOffset));
}
}

await this.enqueueOperation_(
contentType,
() => this.append_(contentType, data));
}

/**
Expand Down
3 changes: 1 addition & 2 deletions lib/media/streaming_engine.js
Original file line number Diff line number Diff line change
Expand Up @@ -1612,8 +1612,7 @@ shaka.media.StreamingEngine = class {
reference.syncTime == null ? reference.startTime : reference.syncTime,
reference.endTime,
hasClosedCaptions,
seeked,
this.manifest_.sequenceMode);
seeked);
this.destroyer_.ensureNotDestroyed();
shaka.log.v2(logPrefix, 'appended media segment');
}
Expand Down
14 changes: 7 additions & 7 deletions lib/text/vtt_text_parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,15 @@ shaka.text.VttTextParser = class {
// to the beginning of each segment.
// NOTE: "periodStart" is the timestamp offset applied via TextEngine.
// It is no longer closely tied to periods, but the name stuck around.
// NOTE: This offset and the flag choosing its meaning have no effect on
// HLS content, which should use X-TIMESTAMP-MAP and periodStart instead.
let offset = time.vttOffset;

// Do not honor the 'X-TIMESTAMP-MAP' value when in sequence mode.
// That is because it is used mainly (solely?) to account for the timestamp
// offset of the video/audio; when in sequence mode, we normalize that
// timestamp offset to 0, so we should not account for it.
if (blocks[0].includes('X-TIMESTAMP-MAP') && !this.sequenceMode_) {
// Only use 'X-TIMESTAMP-MAP' in sequence mode, as that is currently
// shorthand for HLS. Note that an offset based on the first video
// timestamp has already been extracted, and appears in periodStart.
// The relative offset from X-TIMESTAMP-MAP will be added to that for HLS.
if (blocks[0].includes('X-TIMESTAMP-MAP') && this.sequenceMode_) {
// https://bit.ly/2K92l7y
// The 'X-TIMESTAMP-MAP' header is used in HLS to align text with
// the rest of the media.
Expand Down Expand Up @@ -109,8 +111,6 @@ shaka.text.VttTextParser = class {
mpegTime += shaka.text.VttTextParser.TS_ROLLOVER_;
}

// Apple-encoded HLS content uses absolute timestamps, so assume the
// presence of the map tag means the content uses absolute timestamps.
offset = time.periodStart + mpegTime / mpegTimescale - cueTime;
}
}
Expand Down
69 changes: 64 additions & 5 deletions test/text/vtt_text_parser_unit.js
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,61 @@ describe('VttTextParser', () => {
'Test\n\n' +
'00:00:40.000 --> 00:00:50.000 line:-1\n' +
'Test2',
{periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0});
{periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
/* sequenceMode= */ true);
});

it('ignores X-TIMESTAMP-MAP header if not in sequence mode', () => {
verifyHelper(
[
{startTime: 20, endTime: 40, payload: 'Test'},
{startTime: 40, endTime: 50, payload: 'Test2'},
],
'WEBVTT\n' +
'X-TIMESTAMP-MAP=MPEGTS:900000,LOCAL:01:00:00.000\n\n' +
'00:00:20.000 --> 00:00:40.000 line:0\n' +
'Test\n\n' +
'00:00:40.000 --> 00:00:50.000 line:-1\n' +
'Test2',
{periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
/* sequenceMode= */ false);
});

it('parses X-TIMESTAMP-MAP header with non-zero local base', () => {
verifyHelper(
[
{startTime: 1800, endTime: 1810, payload: 'Test'},
{startTime: 1820, endTime: 1830, payload: 'Test2'},
],
// 162000000 = 30 * 60 * 90k = 30 minutes for the TS part of the map.
// The local (VTT) part of the map is 1 hour.
// So text times of 1 hour map to media times of 30 minutes.
'WEBVTT\n' +
'X-TIMESTAMP-MAP=MPEGTS:162000000,LOCAL:01:00:00.000\n\n' +
'01:00:00.000 --> 01:00:10.000 line:0\n' +
'Test\n\n' +
'01:00:20.000 --> 01:00:30.000 line:-1\n' +
'Test2',
{periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
/* sequenceMode= */ true);
});

it('combines X-TIMESTAMP-MAP header with periodStart', () => {
verifyHelper(
[
{startTime: 130, endTime: 150, payload: 'Test'},
{startTime: 150, endTime: 160, payload: 'Test2'},
],
// 900000 = 10 sec, so expect every timestamp to be 10
// seconds ahead of what is specified.
'WEBVTT\n' +
'X-TIMESTAMP-MAP=MPEGTS:900000,LOCAL:00:00:00.000\n\n' +
'00:00:20.000 --> 00:00:40.000 line:0\n' +
'Test\n\n' +
'00:00:40.000 --> 00:00:50.000 line:-1\n' +
'Test2',
{periodStart: 100, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
/* sequenceMode= */ true);
});

it('handles timestamp rollover with X-TIMESTAMP-MAP header', () => {
Expand All @@ -551,7 +605,8 @@ describe('VttTextParser', () => {
'Test',
// Non-null segmentStart takes precedence over X-TIMESTAMP-MAP.
// This protects us from rollover in the MPEGTS field.
{periodStart: 0, segmentStart: 95440, segmentEnd: 95550, vttOffset: 0});
{periodStart: 0, segmentStart: 95440, segmentEnd: 95550, vttOffset: 0},
/* sequenceMode= */ true);

verifyHelper(
[
Expand All @@ -564,7 +619,8 @@ describe('VttTextParser', () => {
'X-TIMESTAMP-MAP=MPEGTS:9745408,LOCAL:00:00:00.000\n\n' +
'00:00:00.000 --> 00:00:02.000 line:0\n' +
'Test2',
{periodStart: 0, segmentStart: 95550, segmentEnd: 95560, vttOffset: 0});
{periodStart: 0, segmentStart: 95550, segmentEnd: 95560, vttOffset: 0},
/* sequenceMode= */ true);
});

it('supports global style blocks', () => {
Expand Down Expand Up @@ -978,11 +1034,14 @@ describe('VttTextParser', () => {
* @param {!Array} cues
* @param {string} text
* @param {shaka.extern.TextParser.TimeContext} time
* @param {boolean=} sequenceMode
*/
function verifyHelper(cues, text, time) {
function verifyHelper(cues, text, time, sequenceMode = false) {
const data =
shaka.util.BufferUtils.toUint8(shaka.util.StringUtils.toUTF8(text));
const result = new shaka.text.VttTextParser().parseMedia(data, time);
const parser = new shaka.text.VttTextParser();
parser.setSequenceMode(sequenceMode);
const result = parser.parseMedia(data, time);

const expected = cues.map((cue) => {
if (cue.nestedCues) {
Expand Down