DLSpeech: Fix various issues (#2671)
* Bump tarball

* Set output format

* Fix iOS on sample rate

* Use Array.fill for better performance

* Interpolate when upsampling

* Adding multi-buffering player

* Fix Safari missing copyToChannel function

* Add entry

* Update SHA for tarball

* Fix test

* Fix tests

* Apply suggestions from code review

Co-Authored-By: Corina <[email protected]>

* Link to issue

* Apply PR comments
compulim authored and corinagum committed Dec 6, 2019
1 parent 1f100e7 commit cb7657e
Showing 8 changed files with 265 additions and 56 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -66,6 +66,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
- `component`: Updated timer to use functional component, by [@spyip](https://github.com/spyip) in PR [#2546](https://github.com/microsoft/BotFramework-WebChat/pull/2546)
- Fixes [#2651](https://github.com/microsoft/BotFramework-WebChat/issues/2651). Add `ends-with` string module to es5 bundle, by [@corinagum](https://github.com/corinagum) in PR [#2654](https://github.com/microsoft/BotFramework-WebChat/pull/2654)
- Fixes [#2658](https://github.com/microsoft/BotFramework-WebChat/issues/2658). Fix rendering of markdown images in IE11, by [@corinagum](https://github.com/corinagum) in PR [#2659](https://github.com/microsoft/BotFramework-WebChat/pull/2659)
- Fixes [#2662](https://github.com/microsoft/BotFramework-WebChat/issues/2662) and [#2666](https://github.com/microsoft/BotFramework-WebChat/issues/2666). Fix various issues related to Direct Line Speech, by [@compulim](https://github.com/compulim) in PR [#2671](https://github.com/microsoft/BotFramework-WebChat/pull/2671)
- Added triple-buffering to reduce pops/cracks.
- Enabled Safari by upsampling to 48000 Hz.
- Added support for detailed output format on the Web Chat side.

### Changed

@@ -1,11 +1,16 @@
class MockAudioBuffer {
constructor(channels, frames, samplesPerSec) {
this._channelData = new Array(channels).fill(new Array(frames * samplesPerSec));
this._channels = channels;
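// fill().map() gives each channel its own backing array; the previous fill(new Array(...)) shared a single array across all channels.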
this._channelData = new Array(channels).fill().map(() => new Array(frames * samplesPerSec));
}

getChannelData(channel) {
return this._channelData[channel];
}

get numberOfChannels() {
return this._channels;
}
}

class MockAudioBufferSource {
Expand Down
Binary file not shown.
2 changes: 1 addition & 1 deletion packages/directlinespeech/package-lock.json

Some generated files are not rendered by default.

21 changes: 10 additions & 11 deletions packages/directlinespeech/src/createAdapters.js
@@ -92,20 +92,18 @@ export default async function create({

// Supported options can be found in DialogConnectorFactory.js.

// Set the language used for recognition.
config.setProperty(PropertyId.SpeechServiceConnection_RecoLanguage, speechRecognitionLanguage);

// The following code sets the output format. But currently, none of the following works for setting the detailed output format.
// We will leave this code commented out until the Speech SDK supports it, possibly in one of the ways mentioned below.

// The following code sets the output format.
// As advised by the Speech team, this API may be subject to future changes.
// We are not enabling the output format option because it does not send the detailed output format to the bot, rendering the option useless.
// config.setProperty(PropertyId.SpeechServiceResponse_OutputFormatOption, OutputFormat[OutputFormat.Detailed]);
// config.setProperty(PropertyId.SpeechServiceResponse_RequestDetailedResultTrueFalse, true);
// config.setProperty(OutputFormatPropertyName, OutputFormat[OutputFormat.Detailed]);
// config.setServiceProperty(PropertyId.SpeechServiceResponse_RequestDetailedResultTrueFalse, "true", ServicePropertyChannel.UriQueryParameter);

// The following code is copied from C#; it should set from.id, but it does not.
// https://github.com/Azure-Samples/Cognitive-Services-Direct-Line-Speech-Client/blob/master/DLSpeechClient/MainWindow.xaml.cs#L236
// Set the user ID for starting the conversation.
userID && config.setProperty(PropertyId.Conversation_From_Id, userID);

// Set Custom Speech and Custom Voice.
// The following code is copied from C#, and it is not working yet.
// https://github.com/Azure-Samples/Cognitive-Services-Direct-Line-Speech-Client/blob/master/DLSpeechClient/MainWindow.xaml.cs
// speechRecognitionEndpointId && config.setServiceProperty('cid', speechRecognitionEndpointId, ServicePropertyChannel.UriQueryParameter);
@@ -115,11 +113,12 @@ export default async function create({

dialogServiceConnector.connect();

// Renew token
// Renew token per interval.
if (authorizationToken) {
const interval = setInterval(async () => {
// If the connector has been disposed, we should stop renewing the token.
// TODO: We should use a public implementation if Speech SDK has one.
// #2660 If the connector has been disposed, we should stop renewing the token.

// TODO: We should use a public implementation if the Speech SDK offers one related to "privIsDisposed".
if (dialogServiceConnector.privIsDisposed) {
clearInterval(interval);
}
108 changes: 108 additions & 0 deletions packages/directlinespeech/src/createMultiBufferingPlayer.js
@@ -0,0 +1,108 @@
// Currently, Web Chat uses a triple-buffer approach.
const NUM_BUFFER = 3;

function zeroBuffer(buffer) {
const channels = buffer.numberOfChannels;

for (let channel = 0; channel < channels; channel++) {
const audioData = buffer.getChannelData(channel);

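// [].fill.call is presumably used because TypedArray.prototype.fill may be missing in some target browsers (an assumption; this PR only documents the Safari copyToChannel gap below).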
[].fill.call(audioData, 0);
}
}

function copyBuffer(buffer, multiChannelArrayBuffer) {
const channels = buffer.numberOfChannels;

for (let channel = 0; channel < channels; channel++) {
const arrayBuffer = multiChannelArrayBuffer[channel];

// Note that Safari does not support AudioBuffer.copyToChannel yet.
if (buffer.copyToChannel) {
buffer.copyToChannel(arrayBuffer, channel);
} else {
const { length: arrayBufferLength } = arrayBuffer;
const perChannelBuffer = buffer.getChannelData(channel);

for (let offset = 0; offset < arrayBufferLength; offset++) {
perChannelBuffer[offset] = arrayBuffer[offset];
}
}
}
}

// This is a multi-buffering player. Callers can keep pushing buffers to it.
// Each buffer, realized as a BufferSource, is queued to the AudioContext.
// Data will be queued as quickly and frequently as possible.
// Web Chat does not support progressive buffering (pushing a partial buffer), and there are currently no plans to implement it.
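// For example, with three 2048-sample buffers, three pushes are scheduled back-to-back;
// a fourth push waits in the queue until one of the three AudioBuffers is freed by its "ended" event.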

export default function createMultiBufferingPlayer(audioContext, { channels, samplesPerSec }, numSamplePerBuffer) {
const freeBuffers = new Array(NUM_BUFFER)
.fill()
.map(() => audioContext.createBuffer(channels, numSamplePerBuffer, samplesPerSec));
let queuedBufferSources = [];
let nextSchedule;

const queue = [];

const playNext = () => {
if (typeof nextSchedule !== 'number') {
nextSchedule = audioContext.currentTime;
}

const bufferSource = audioContext.createBufferSource();
const multiChannelArrayBuffer = queue.shift();

if (typeof multiChannelArrayBuffer === 'function') {
// If the queued item is a function, it is because the caller called "flush".
// The "flush" promise resolves when all buffers queued before the "flush" call have finished playing.
multiChannelArrayBuffer();
} else if (multiChannelArrayBuffer) {
const nextBuffer = freeBuffers.shift();

// If all buffers are currently occupied, prepend the data back to the queue.
// When one of the buffers finishes, it will call playNext() again to pick up items from the queue.
if (!nextBuffer) {
queue.unshift(multiChannelArrayBuffer);

return;
}

zeroBuffer(nextBuffer);
copyBuffer(nextBuffer, multiChannelArrayBuffer);

bufferSource.buffer = nextBuffer;
bufferSource.connect(audioContext.destination);
bufferSource.start(nextSchedule);

// Track every BufferSource scheduled at the AudioContext via bufferSource.start(),
// so that cancelAll() can effectively stop all BufferSources queued at the AudioContext.
queuedBufferSources.push(bufferSource);

nextSchedule += nextBuffer.duration;

bufferSource.addEventListener('ended', () => {
queuedBufferSources = queuedBufferSources.filter(target => target !== bufferSource);

// Mark the buffer as free to be picked up on the next iteration.
freeBuffers.push(nextBuffer);
playNext();
});
}
};

return {
cancelAll: () => {
queue.splice(0);

// Although the queue is cleared, some BufferSources may still be scheduled at the AudioContext and need to be stopped.
queuedBufferSources.forEach(bufferSource => bufferSource.stop());
},
flush: () => new Promise(resolve => queue.push(resolve)),
push: multiChannelArrayBuffer => {
queue.push(multiChannelArrayBuffer);

playNext();
}
};
}
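
A minimal usage sketch of the player above (not part of this PR; the AudioContext construction and the mono 48000 Hz format are illustrative assumptions):

import createMultiBufferingPlayer from './createMultiBufferingPlayer';

async function playOneBufferOfSilence() {
  const audioContext = new (window.AudioContext || window.webkitAudioContext)();

  // Mono audio at 48000 Hz, 2048 samples per buffer.
  const player = createMultiBufferingPlayer(audioContext, { channels: 1, samplesPerSec: 48000 }, 2048);

  // Each pushed item is an array of per-channel Float32Arrays.
  player.push([new Float32Array(2048)]);

  // Resolves after everything queued before this call has finished playing.
  await player.flush();
}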
134 changes: 99 additions & 35 deletions packages/directlinespeech/src/playCognitiveServicesStream.js
@@ -1,27 +1,29 @@
/* eslint no-magic-numbers: ["error", { "ignore": [8, 16, 32, 128, 1000, 32768, 2147483648] }] */
/* eslint no-magic-numbers: ["error", { "ignore": [0, 1, 8, 16, 32, 128, 1000, 32768, 96000, 2147483648] }] */
/* eslint no-await-in-loop: "off" */
/* eslint prefer-destructuring: "off" */

import cognitiveServicesPromiseToESPromise from './cognitiveServicesPromiseToESPromise';
import createDeferred from 'p-defer';
import createMultiBufferingPlayer from './createMultiBufferingPlayer';

function createBufferSource(audioContext, { channels, samplesPerSec }, channelInterleavedAudioData) {
const bufferSource = audioContext.createBufferSource();
const frames = channelInterleavedAudioData.length / channels;
const audioBuffer = audioContext.createBuffer(channels, frames, samplesPerSec);
// Safari requires an audio buffer with a sample rate of at least 22050 Hz.
// With a minimum sample rate of 44100 Hz, the Speech SDK's default 16000 Hz output will be upsampled 3x to 48000 Hz.
const MIN_SAMPLE_RATE = 44100;

for (let channel = 0; channel < channels; channel++) {
const perChannelAudioData = audioBuffer.getChannelData(channel);
// The Speech SDK is hardcoded to chop packets at 4096 bytes.
// Web Chat's multi-buffering player is set up with 3 buffers; each is 4096 bytes (2048 16-bit samples).
// For simplicity, the multi-buffering player currently does not support progressive buffering.

// We are copying channel-interleaved audio data, into per-channel audio data
for (let perChannelIndex = 0; perChannelIndex < channelInterleavedAudioData.length; perChannelIndex++) {
perChannelAudioData[perChannelIndex] = channelInterleavedAudioData[perChannelIndex * channels + channel];
}
}
// Progressive buffering would allow queuing at any sample size; successive partial buffers would be concatenated.
// For example, if 1000 samples are queued, then 1048 samples, they would be concatenated into a single 2048-sample buffer.

// Instead, for simplicity, the data is queued into two buffers:
// the first is the 1000 samples followed by 1048 zeroes, and the second is the 1048 samples followed by 1000 zeroes.

bufferSource.buffer = audioBuffer;
// There is no plan to support progressive buffering until the Speech SDK chops data at dynamic sizes.
const DEFAULT_BUFFER_SIZE = 4096;

return bufferSource;
function average(array) {
return array.reduce((sum, value) => sum + value, 0) / array.length;
}

function formatTypedBitArrayToFloatArray(audioData, maxValue) {
@@ -56,6 +58,49 @@ function abortToReject(signal) {
});
}

// In 2-channel audio (e.g. channels A and B), the data arrives interleaved, like "ABABABABAB".
// This function will take "ABABABABAB" and return an array ["AAAAA", "BBBBB"].
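// For example (illustrative values): deinterleave(Float32Array.of(1, 2, 1, 2), { channels: 2 })
// returns [Float32Array.of(1, 1), Float32Array.of(2, 2)].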
function deinterleave(channelInterleavedAudioData, { channels }) {
const multiChannelArrayBuffer = new Array(channels);
const frameSize = channelInterleavedAudioData.length / channels;

for (let channel = 0; channel < channels; channel++) {
const audioData = new Float32Array(frameSize);

multiChannelArrayBuffer[channel] = audioData;

for (let offset = 0; offset < frameSize; offset++) {
audioData[offset] = channelInterleavedAudioData[offset * channels + channel];
}
}

return multiChannelArrayBuffer;
}

// This function upsamples the audio data by an integer multiplier.
// Web Chat uses simple anti-aliasing (a moving average over the last few values). For simplicity, the anti-aliasing does not roll over to the next buffer.
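// For example (values traced from the code below):
// multiplySampleRate(Float32Array.of(1, 3), 2) returns [1, 1, 2, 3];
// each output sample is the average of the last 2 source values seen.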
function multiplySampleRate(source, sampleRateMultiplier) {
if (sampleRateMultiplier === 1) {
return source;
}

const lastValues = new Array(sampleRateMultiplier).fill(source[0]);
const target = new Float32Array(source.length * sampleRateMultiplier);

for (let sourceOffset = 0; sourceOffset < source.length; sourceOffset++) {
const value = source[sourceOffset];
const targetOffset = sourceOffset * sampleRateMultiplier;

for (let multiplierIndex = 0; multiplierIndex < sampleRateMultiplier; multiplierIndex++) {
lastValues.shift();
lastValues.push(value);
target[targetOffset + multiplierIndex] = average(lastValues);
}
}

return target;
}

export default async function playCognitiveServicesStream(
audioContext,
audioFormat,
@@ -66,7 +111,6 @@ export default async function playCognitiveServicesStream(

try {
const abortPromise = abortToReject(signal);
let lastBufferSource;

const read = () =>
Promise.race([
@@ -79,43 +123,63 @@
throw new Error('aborted');
}

let newSamplesPerSec = audioFormat.samplesPerSec;
let sampleRateMultiplier = 1;

// Safari requires a minimum sample rate of 22050 Hz.
// A multiplier is calculated so that the data meets the minimum sample rate.
// The multiplier is an integer to keep the upsampler simple.
// As a safeguard, data will only be upsampled up to 96000 Hz.
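// For example, the Speech SDK's default 16000 Hz gets a 3x multiplier and is played at 48000 Hz.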
while (newSamplesPerSec < MIN_SAMPLE_RATE && newSamplesPerSec < 96000) {
sampleRateMultiplier++;
newSamplesPerSec = audioFormat.samplesPerSec * sampleRateMultiplier;
}

// The third parameter is the number of samples per buffer.
// For example, if the Speech SDK sends Web Chat 4096 bytes of 16-bit samples, there will be 2048 samples per channel.
// The multi-buffering player is then set up to handle 2048 samples per buffer.
// If the multiplier is 3x, it will handle 6144 samples per buffer.
const player = createMultiBufferingPlayer(
audioContext,
{ ...audioFormat, samplesPerSec: newSamplesPerSec },
(DEFAULT_BUFFER_SIZE / (audioFormat.bitsPerSample / 8)) * sampleRateMultiplier
);

// As a safeguard, the maximum number of chunks handled will be 1000.
for (
let chunk = await read(), currentTime, maxChunks = 0;
let chunk = await read(), maxChunks = 0;
!chunk.isEnd && maxChunks < 1000 && !signal.aborted;
chunk = await read(), maxChunks++
) {
if (signal.aborted) {
break;
}

const audioData = formatAudioDataArrayBufferToFloatArray(audioFormat, chunk.buffer);
const bufferSource = createBufferSource(audioContext, audioFormat, audioData);
const { duration } = bufferSource.buffer;
// Data received from the Speech SDK is interleaved; 2 channels (e.g. A and B) will be sent as "ABABABABAB",
// and each sample (A/B) will be an 8- to 32-bit number.

if (!currentTime) {
currentTime = audioContext.currentTime;
}
// Convert each 8- to 32-bit integer into a floating-point number, as required by the Web Audio API.
const interleavedArrayBuffer = formatAudioDataArrayBufferToFloatArray(audioFormat, chunk.buffer);

bufferSource.connect(audioContext.destination);
bufferSource.start(currentTime);
// Deinterleave the data back into per-channel array buffers, e.g. "AAAAA" and "BBBBB".
const multiChannelArrayBuffer = deinterleave(interleavedArrayBuffer, audioFormat);

queuedBufferSourceNodes.push(bufferSource);
// Upsample the data if necessary. If the multiplier is 2x, "AAAAA" will be upsampled to "AAAAAAAAAA" (with anti-aliasing).
const upsampledMultiChannelArrayBuffer = multiChannelArrayBuffer.map(arrayBuffer =>
multiplySampleRate(arrayBuffer, sampleRateMultiplier)
);

lastBufferSource = bufferSource;
currentTime += duration;
// Queue the upsampled data to the multi-buffering player.
player.push(upsampledMultiChannelArrayBuffer);
}

abortPromise.catch(() => player.cancelAll());

if (signal.aborted) {
throw new Error('aborted');
}

if (lastBufferSource) {
const { promise, resolve } = createDeferred();

lastBufferSource.onended = resolve;

await Promise.race([abortPromise, promise]);
}
await Promise.race([abortPromise, player.flush()]);
} finally {
queuedBufferSourceNodes.forEach(node => node.stop());
}