DLSpeech: Fix various issues (#2671)
* Bump tarball

* Set output format

* Fix iOS on sample rate

* Use Array.fill for better performance

* Interpolate when upsampling

* Adding multi-buffering player

* Fix Safari missing copyToChannel function

* Add entry

* Update SHA for tarball

* Fix test

* Fix tests

* Apply suggestions from code review

Co-Authored-By: Corina <[email protected]>

* Link to issue

* Apply PR comments
compulim authored and corinagum committed Dec 6, 2019
1 parent 1f100e7 commit cb7657e
Showing 8 changed files with 265 additions and 56 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -66,6 +66,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
- `component`: Updated timer to use functional component, by [@spyip](https://github.com/spyip) in PR [#2546](https://github.com/microsoft/BotFramework-WebChat/pull/2546)
- Fixes [#2651](https://github.com/microsoft/BotFramework-WebChat/issues/2651). Add `ends-with` string module to es5 bundle, by [@corinagum](https://github.com/corinagum) in PR [#2654](https://github.com/microsoft/BotFramework-WebChat/pull/2654)
- Fixes [#2658](https://github.com/microsoft/BotFramework-WebChat/issues/2658). Fix rendering of markdown images in IE11, by [@corinagum](https://github.com/corinagum) in PR [#2659](https://github.com/microsoft/BotFramework-WebChat/pull/2659)
- Fixes [#2662](https://github.com/microsoft/BotFramework-WebChat/issues/2662) and [#2666](https://github.com/microsoft/BotFramework-WebChat/issues/2666). Fix various issues related to Direct Line Speech, by [@compulim](https://github.com/compulim) in PR [#2671](https://github.com/microsoft/BotFramework-WebChat/pull/2671)
- Added triple-buffering to reduce pops/cracks.
- Enabled Safari by upsampling to 48000 Hz.
- Added support for detailed output format on the Web Chat side.

### Changed

@@ -1,11 +1,16 @@
class MockAudioBuffer {
constructor(channels, frames, samplesPerSec) {
this._channelData = new Array(channels).fill(new Array(frames * samplesPerSec));
this._channels = channels;
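// fill().map() gives each channel its own backing array; the previous fill(new Array(...)) shared a single array across all channels.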
this._channelData = new Array(channels).fill().map(() => new Array(frames * samplesPerSec));
}

getChannelData(channel) {
return this._channelData[channel];
}

get numberOfChannels() {
return this._channels;
}
}

class MockAudioBufferSource {
Expand Down
Binary file not shown.
2 changes: 1 addition & 1 deletion packages/directlinespeech/package-lock.json

Some generated files are not rendered by default.

21 changes: 10 additions & 11 deletions packages/directlinespeech/src/createAdapters.js
@@ -92,20 +92,18 @@ export default async function create({

// Supported options can be found in DialogConnectorFactory.js.

// Set the language used for recognition.
config.setProperty(PropertyId.SpeechServiceConnection_RecoLanguage, speechRecognitionLanguage);

// The following code sets the output format. But currently, none of the following works for setting the detailed output format.
// We will leave this code commented out until the Speech SDK supports it, possibly in one of the ways mentioned below.

// The following code sets the output format.
// As advised by the Speech team, this API may be subject to future changes.
// We are not enabling the output format option because it does not send the detailed output format to the bot, rendering the option useless.
// config.setProperty(PropertyId.SpeechServiceResponse_OutputFormatOption, OutputFormat[OutputFormat.Detailed]);
// config.setProperty(PropertyId.SpeechServiceResponse_RequestDetailedResultTrueFalse, true);
// config.setProperty(OutputFormatPropertyName, OutputFormat[OutputFormat.Detailed]);
// config.setServiceProperty(PropertyId.SpeechServiceResponse_RequestDetailedResultTrueFalse, "true", ServicePropertyChannel.UriQueryParameter);

// The following code is copied from C#; it should set from.id, but it does not.
// https://github.com/Azure-Samples/Cognitive-Services-Direct-Line-Speech-Client/blob/master/DLSpeechClient/MainWindow.xaml.cs#L236
// Set the user ID for starting the conversation.
userID && config.setProperty(PropertyId.Conversation_From_Id, userID);

// Set Custom Speech and Custom Voice.
// The following code is copied from C#, and it is not working yet.
// https://github.com/Azure-Samples/Cognitive-Services-Direct-Line-Speech-Client/blob/master/DLSpeechClient/MainWindow.xaml.cs
// speechRecognitionEndpointId && config.setServiceProperty('cid', speechRecognitionEndpointId, ServicePropertyChannel.UriQueryParameter);
@@ -115,11 +113,12 @@ export default async function create({

dialogServiceConnector.connect();

// Renew token
// Renew token per interval.
if (authorizationToken) {
const interval = setInterval(async () => {
// If the connector has been disposed, we should stop renewing the token.
// TODO: We should use a public implementation if Speech SDK has one.
// #2660 If the connector has been disposed, we should stop renewing the token.

// TODO: We should use a public implementation if the Speech SDK offers one related to "privIsDisposed".
if (dialogServiceConnector.privIsDisposed) {
clearInterval(interval);
}
108 changes: 108 additions & 0 deletions packages/directlinespeech/src/createMultiBufferingPlayer.js
@@ -0,0 +1,108 @@
// Currently, Web Chat uses a triple-buffer approach.
const NUM_BUFFER = 3;

function zeroBuffer(buffer) {
const channels = buffer.numberOfChannels;

for (let channel = 0; channel < channels; channel++) {
const audioData = buffer.getChannelData(channel);

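// [].fill.call is presumably used because TypedArray.prototype.fill may be missing in some target browsers (an assumption; this PR only documents the Safari copyToChannel gap below).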
[].fill.call(audioData, 0);
}
}

function copyBuffer(buffer, multiChannelArrayBuffer) {
const channels = buffer.numberOfChannels;

for (let channel = 0; channel < channels; channel++) {
const arrayBuffer = multiChannelArrayBuffer[channel];

// Note that Safari does not support AudioBuffer.copyToChannel yet.
if (buffer.copyToChannel) {
buffer.copyToChannel(arrayBuffer, channel);
} else {
const { length: arrayBufferLength } = arrayBuffer;
const perChannelBuffer = buffer.getChannelData(channel);

for (let offset = 0; offset < arrayBufferLength; offset++) {
perChannelBuffer[offset] = arrayBuffer[offset];
}
}
}
}

// This is a multi-buffering player. Callers can keep pushing buffers to it.
// Each buffer, realized as a BufferSource, is queued to the AudioContext.
// Data will be queued as quickly and frequently as possible.
// Web Chat does not support progressive buffering (pushing a partial buffer), and there are currently no plans to implement it.
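// For example, with three 2048-sample buffers, three pushes are scheduled back-to-back;
// a fourth push waits in the queue until one of the three AudioBuffers is freed by its "ended" event.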

export default function createMultiBufferingPlayer(audioContext, { channels, samplesPerSec }, numSamplePerBuffer) {
const freeBuffers = new Array(NUM_BUFFER)
.fill()
.map(() => audioContext.createBuffer(channels, numSamplePerBuffer, samplesPerSec));
let queuedBufferSources = [];
let nextSchedule;

const queue = [];

const playNext = () => {
if (typeof nextSchedule !== 'number') {
nextSchedule = audioContext.currentTime;
}

const bufferSource = audioContext.createBufferSource();
const multiChannelArrayBuffer = queue.shift();

if (typeof multiChannelArrayBuffer === 'function') {
// If the queued item is a function, it is because the caller called "flush".
// The "flush" promise resolves when all buffers queued before the "flush" call have finished playing.
multiChannelArrayBuffer();
} else if (multiChannelArrayBuffer) {
const nextBuffer = freeBuffers.shift();

// If all buffers are currently occupied, prepend the data back to the queue.
// When one of the buffers finishes, it will call playNext() again to pick up items from the queue.
if (!nextBuffer) {
queue.unshift(multiChannelArrayBuffer);

return;
}

zeroBuffer(nextBuffer);
copyBuffer(nextBuffer, multiChannelArrayBuffer);

bufferSource.buffer = nextBuffer;
bufferSource.connect(audioContext.destination);
bufferSource.start(nextSchedule);

// Track every BufferSource scheduled at the AudioContext via bufferSource.start(),
// so that cancelAll() can effectively stop all BufferSources queued at the AudioContext.
queuedBufferSources.push(bufferSource);

nextSchedule += nextBuffer.duration;

bufferSource.addEventListener('ended', () => {
queuedBufferSources = queuedBufferSources.filter(target => target !== bufferSource);

// Mark the buffer as free to be picked up on the next iteration.
freeBuffers.push(nextBuffer);
playNext();
});
}
};

return {
cancelAll: () => {
queue.splice(0);

// Although the queue is cleared, some BufferSources may still be scheduled at the AudioContext and need to be stopped.
queuedBufferSources.forEach(bufferSource => bufferSource.stop());
},
flush: () => new Promise(resolve => queue.push(resolve)),
push: multiChannelArrayBuffer => {
queue.push(multiChannelArrayBuffer);

playNext();
}
};
}
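
A minimal usage sketch of the player above (not part of this PR; the AudioContext construction and the mono 48000 Hz format are illustrative assumptions):

import createMultiBufferingPlayer from './createMultiBufferingPlayer';

async function playOneBufferOfSilence() {
  const audioContext = new (window.AudioContext || window.webkitAudioContext)();

  // Mono audio at 48000 Hz, 2048 samples per buffer.
  const player = createMultiBufferingPlayer(audioContext, { channels: 1, samplesPerSec: 48000 }, 2048);

  // Each pushed item is an array of per-channel Float32Arrays.
  player.push([new Float32Array(2048)]);

  // Resolves after everything queued before this call has finished playing.
  await player.flush();
}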
134 changes: 99 additions & 35 deletions packages/directlinespeech/src/playCognitiveServicesStream.js
@@ -1,27 +1,29 @@
/* eslint no-magic-numbers: ["error", { "ignore": [8, 16, 32, 128, 1000, 32768, 2147483648] }] */
/* eslint no-magic-numbers: ["error", { "ignore": [0, 1, 8, 16, 32, 128, 1000, 32768, 96000, 2147483648] }] */
/* eslint no-await-in-loop: "off" */
/* eslint prefer-destructuring: "off" */

import cognitiveServicesPromiseToESPromise from './cognitiveServicesPromiseToESPromise';
import createDeferred from 'p-defer';
import createMultiBufferingPlayer from './createMultiBufferingPlayer';

function createBufferSource(audioContext, { channels, samplesPerSec }, channelInterleavedAudioData) {
const bufferSource = audioContext.createBufferSource();
const frames = channelInterleavedAudioData.length / channels;
const audioBuffer = audioContext.createBuffer(channels, frames, samplesPerSec);
// Safari requires an audio buffer with a sample rate of at least 22050 Hz.
// With a minimum sample rate of 44100 Hz, the Speech SDK's default 16000 Hz output will be upsampled 3x to 48000 Hz.
const MIN_SAMPLE_RATE = 44100;

for (let channel = 0; channel < channels; channel++) {
const perChannelAudioData = audioBuffer.getChannelData(channel);
// The Speech SDK is hardcoded to chop packets at 4096 bytes.
// Web Chat's multi-buffering player is set up with 3 buffers; each is 4096 bytes (2048 16-bit samples).
// For simplicity, the multi-buffering player currently does not support progressive buffering.

// We are copying channel-interleaved audio data, into per-channel audio data
for (let perChannelIndex = 0; perChannelIndex < channelInterleavedAudioData.length; perChannelIndex++) {
perChannelAudioData[perChannelIndex] = channelInterleavedAudioData[perChannelIndex * channels + channel];
}
}
// Progressive buffering would allow queuing at any sample size; successive partial buffers would be concatenated.
// For example, if 1000 samples are queued, then 1048 samples, they would be concatenated into a single 2048-sample buffer.

// Instead, for simplicity, the data is queued into two buffers:
// the first is the 1000 samples followed by 1048 zeroes, and the second is the 1048 samples followed by 1000 zeroes.

bufferSource.buffer = audioBuffer;
// There is no plan to support progressive buffering until the Speech SDK chops data at dynamic sizes.
const DEFAULT_BUFFER_SIZE = 4096;

return bufferSource;
function average(array) {
return array.reduce((sum, value) => sum + value, 0) / array.length;
}

function formatTypedBitArrayToFloatArray(audioData, maxValue) {
@@ -56,6 +58,49 @@ function abortToReject(signal) {
});
}

// In 2-channel audio (e.g. channels A and B), the data arrives interleaved, like "ABABABABAB".
// This function will take "ABABABABAB" and return an array ["AAAAA", "BBBBB"].
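// For example (illustrative values): deinterleave(Float32Array.of(1, 2, 1, 2), { channels: 2 })
// returns [Float32Array.of(1, 1), Float32Array.of(2, 2)].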
function deinterleave(channelInterleavedAudioData, { channels }) {
const multiChannelArrayBuffer = new Array(channels);
const frameSize = channelInterleavedAudioData.length / channels;

for (let channel = 0; channel < channels; channel++) {
const audioData = new Float32Array(frameSize);

multiChannelArrayBuffer[channel] = audioData;

for (let offset = 0; offset < frameSize; offset++) {
audioData[offset] = channelInterleavedAudioData[offset * channels + channel];
}
}

return multiChannelArrayBuffer;
}

// This function upsamples the audio data by an integer multiplier.
// Web Chat uses simple anti-aliasing (a moving average over the last few values). For simplicity, the anti-aliasing does not roll over to the next buffer.
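// For example (values traced from the code below):
// multiplySampleRate(Float32Array.of(1, 3), 2) returns [1, 1, 2, 3];
// each output sample is the average of the last 2 source values seen.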
function multiplySampleRate(source, sampleRateMultiplier) {
if (sampleRateMultiplier === 1) {
return source;
}

const lastValues = new Array(sampleRateMultiplier).fill(source[0]);
const target = new Float32Array(source.length * sampleRateMultiplier);

for (let sourceOffset = 0; sourceOffset < source.length; sourceOffset++) {
const value = source[sourceOffset];
const targetOffset = sourceOffset * sampleRateMultiplier;

for (let multiplierIndex = 0; multiplierIndex < sampleRateMultiplier; multiplierIndex++) {
lastValues.shift();
lastValues.push(value);
target[targetOffset + multiplierIndex] = average(lastValues);
}
}

return target;
}

export default async function playCognitiveServicesStream(
audioContext,
audioFormat,
@@ -66,7 +111,6 @@ export default async function playCognitiveServicesStream(

try {
const abortPromise = abortToReject(signal);
let lastBufferSource;

const read = () =>
Promise.race([
@@ -79,43 +123,63 @@
throw new Error('aborted');
}

let newSamplesPerSec = audioFormat.samplesPerSec;
let sampleRateMultiplier = 1;

// Safari requires a minimum sample rate of 22050 Hz.
// A multiplier is calculated so that the data meets the minimum sample rate.
// The multiplier is an integer to keep the upsampler simple.
// As a safeguard, data will only be upsampled up to 96000 Hz.
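// For example, the Speech SDK's default 16000 Hz gets a 3x multiplier and is played at 48000 Hz.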
while (newSamplesPerSec < MIN_SAMPLE_RATE && newSamplesPerSec < 96000) {
sampleRateMultiplier++;
newSamplesPerSec = audioFormat.samplesPerSec * sampleRateMultiplier;
}

// The third parameter is the number of samples per buffer.
// For example, if the Speech SDK sends Web Chat 4096 bytes of 16-bit samples, there will be 2048 samples per channel.
// The multi-buffering player is then set up to handle 2048 samples per buffer.
// If the multiplier is 3x, it will handle 6144 samples per buffer.
const player = createMultiBufferingPlayer(
audioContext,
{ ...audioFormat, samplesPerSec: newSamplesPerSec },
(DEFAULT_BUFFER_SIZE / (audioFormat.bitsPerSample / 8)) * sampleRateMultiplier
);

// As a safeguard, the maximum number of chunks handled will be 1000.
for (
let chunk = await read(), currentTime, maxChunks = 0;
let chunk = await read(), maxChunks = 0;
!chunk.isEnd && maxChunks < 1000 && !signal.aborted;
chunk = await read(), maxChunks++
) {
if (signal.aborted) {
break;
}

const audioData = formatAudioDataArrayBufferToFloatArray(audioFormat, chunk.buffer);
const bufferSource = createBufferSource(audioContext, audioFormat, audioData);
const { duration } = bufferSource.buffer;
// Data received from the Speech SDK is interleaved; 2 channels (e.g. A and B) will be sent as "ABABABABAB",
// and each sample (A/B) will be an 8- to 32-bit number.

if (!currentTime) {
currentTime = audioContext.currentTime;
}
// Convert each 8- to 32-bit integer into a floating-point number, as required by the Web Audio API.
const interleavedArrayBuffer = formatAudioDataArrayBufferToFloatArray(audioFormat, chunk.buffer);

bufferSource.connect(audioContext.destination);
bufferSource.start(currentTime);
// Deinterleave the data back into per-channel array buffers, e.g. "AAAAA" and "BBBBB".
const multiChannelArrayBuffer = deinterleave(interleavedArrayBuffer, audioFormat);

queuedBufferSourceNodes.push(bufferSource);
// Upsample the data if necessary. If the multiplier is 2x, "AAAAA" will be upsampled to "AAAAAAAAAA" (with anti-aliasing).
const upsampledMultiChannelArrayBuffer = multiChannelArrayBuffer.map(arrayBuffer =>
multiplySampleRate(arrayBuffer, sampleRateMultiplier)
);

lastBufferSource = bufferSource;
currentTime += duration;
// Queue the upsampled data to the multi-buffering player.
player.push(upsampledMultiChannelArrayBuffer);
}

abortPromise.catch(() => player.cancelAll());

if (signal.aborted) {
throw new Error('aborted');
}

if (lastBufferSource) {
const { promise, resolve } = createDeferred();

lastBufferSource.onended = resolve;

await Promise.race([abortPromise, promise]);
}
await Promise.race([abortPromise, player.flush()]);
} finally {
queuedBufferSourceNodes.forEach(node => node.stop());
}