DLSpeech: Fix various issues #2671

Merged 14 commits on Dec 6, 2019
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -66,6 +66,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- `component`: Updated timer to use functional component, by [@spyip](https://github.com/spyip) in PR [#2546](https://github.com/microsoft/BotFramework-WebChat/pull/2546)
- Fixes [#2651](https://github.com/microsoft/BotFramework-WebChat/issues/2651). Add `ends-with` string module to es5 bundle, by [@corinagum](https://github.com/corinagum) in PR [#2654](https://github.com/microsoft/BotFramework-WebChat/pull/2654)
- Fixes [#2658](https://github.com/microsoft/BotFramework-WebChat/issues/2658). Fix rendering of markdown images in IE11, by [@corinagum](https://github.com/corinagum) in PR [#2659](https://github.com/microsoft/BotFramework-WebChat/pull/2659)
- Fixes [#2662](https://github.com/microsoft/BotFramework-WebChat/issues/2662) and [#2666](https://github.com/microsoft/BotFramework-WebChat/issues/2666). Fix various issues related to Direct Line Speech, by [@compulim](https://github.com/compulim) in PR [#2671](https://github.com/microsoft/BotFramework-WebChat/pull/2671)
- Added triple-buffering to reduce pops/cracks.
- Enable Safari by upsampling to 48000 Hz.
- Support detailed output format on Web Chat side.

### Changed

Binary file not shown.
2 changes: 1 addition & 1 deletion packages/directlinespeech/package-lock.json


20 changes: 9 additions & 11 deletions packages/directlinespeech/src/createAdapters.js
@@ -92,20 +92,17 @@ export default async function create({

// Supported options can be found in DialogConnectorFactory.js.

// Set the language used for recognition.
config.setProperty(PropertyId.SpeechServiceConnection_RecoLanguage, speechRecognitionLanguage);

// The following code sets the output format.
// As advised by the Speech team, this API may change in the future.
config.setProperty(PropertyId.SpeechServiceResponse_OutputFormatOption, 'detailed');

// Previous attempts, none of which enabled the detailed output format:
// config.setProperty(PropertyId.SpeechServiceResponse_OutputFormatOption, OutputFormat[OutputFormat.Detailed]);
// config.setProperty(PropertyId.SpeechServiceResponse_RequestDetailedResultTrueFalse, true);
// config.setProperty(OutputFormatPropertyName, OutputFormat[OutputFormat.Detailed]);
// config.setServiceProperty(PropertyId.SpeechServiceResponse_RequestDetailedResultTrueFalse, "true", ServicePropertyChannel.UriQueryParameter);

// Set the user ID for starting the conversation.
// The following code is adapted from the C# sample. It should set from.id, but currently it does not.
// https://github.com/Azure-Samples/Cognitive-Services-Direct-Line-Speech-Client/blob/master/DLSpeechClient/MainWindow.xaml.cs#L236
userID && config.setProperty(PropertyId.Conversation_From_Id, userID);

// Set up Custom Speech and Custom Voice.
// The following code is adapted from the C# sample and is not working yet.
// https://github.com/Azure-Samples/Cognitive-Services-Direct-Line-Speech-Client/blob/master/DLSpeechClient/MainWindow.xaml.cs
// speechRecognitionEndpointId && config.setServiceProperty('cid', speechRecognitionEndpointId, ServicePropertyChannel.UriQueryParameter);
@@ -115,11 +112,12 @@ export default async function create({

dialogServiceConnector.connect();

// Renew the token periodically.
if (authorizationToken) {
const interval = setInterval(async () => {
// If the connector has been disposed, we should stop renewing the token.
// TODO: We should use a public implementation if the Speech SDK offers one, instead of "privIsDisposed".
if (dialogServiceConnector.privIsDisposed) {
clearInterval(interval);
}
108 changes: 108 additions & 0 deletions packages/directlinespeech/src/createMultiBufferingPlayer.js
@@ -0,0 +1,108 @@
// Currently, we use a triple-buffering approach.
const NUM_BUFFER = 3;

function zeroBuffer(buffer) {
const channels = buffer.numberOfChannels;

for (let channel = 0; channel < channels; channel++) {
const audioData = buffer.getChannelData(channel);

[].fill.call(audioData, 0);
}
}

function copyBuffer(buffer, multiChannelArrayBuffer) {
const channels = buffer.numberOfChannels;

for (let channel = 0; channel < channels; channel++) {
const arrayBuffer = multiChannelArrayBuffer[channel];

// Safari does not support AudioBuffer.copyToChannel yet.
if (buffer.copyToChannel) {
buffer.copyToChannel(arrayBuffer, channel);
} else {
const { length: arrayBufferLength } = arrayBuffer;
const perChannelBuffer = buffer.getChannelData(channel);

for (let offset = 0; offset < arrayBufferLength; offset++) {
perChannelBuffer[offset] = arrayBuffer[offset];
}
}
}
}
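The Safari fallback path above can be exercised against a plain-object stand-in for AudioBuffer. The mock below is an assumption for illustration only; the real code receives an AudioBuffer from the Web Audio API.

```javascript
// copyBuffer with the Safari fallback, run against a mock AudioBuffer.
// The mock deliberately omits copyToChannel, forcing the sample-by-sample path.
function copyBuffer(buffer, multiChannelArrayBuffer) {
  const channels = buffer.numberOfChannels;

  for (let channel = 0; channel < channels; channel++) {
    const arrayBuffer = multiChannelArrayBuffer[channel];

    if (buffer.copyToChannel) {
      buffer.copyToChannel(arrayBuffer, channel);
    } else {
      // Safari path: copy one sample at a time.
      const perChannelBuffer = buffer.getChannelData(channel);

      for (let offset = 0; offset < arrayBuffer.length; offset++) {
        perChannelBuffer[offset] = arrayBuffer[offset];
      }
    }
  }
}

const channelData = [new Float32Array(4), new Float32Array(4)];
const mockBuffer = { numberOfChannels: 2, getChannelData: channel => channelData[channel] };

copyBuffer(mockBuffer, [new Float32Array([1, 2, 3, 4]), new Float32Array([5, 6, 7, 8])]);
console.log(Array.from(channelData[0])); // [1, 2, 3, 4]
console.log(Array.from(channelData[1])); // [5, 6, 7, 8]
```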

// This is a multi-buffering player. Callers can keep pushing buffers to it.
// Each buffer is realized as a BufferSource and queued to the AudioContext.
// We queue as soon, and as much, as possible.
// We do not support progressive buffering (pushing partial buffers) and have no plan for it.

export default function createMultiBufferingPlayer(audioContext, { channels, samplesPerSec }, numSamplePerBuffer) {
const freeBuffers = new Array(NUM_BUFFER)
.fill()
.map(() => audioContext.createBuffer(channels, numSamplePerBuffer, samplesPerSec));
let queuedBufferSources = [];
let nextSchedule;

const queue = [];

const playNext = () => {
if (typeof nextSchedule !== 'number') {
nextSchedule = audioContext.currentTime;
}

const bufferSource = audioContext.createBufferSource();
const multiChannelArrayBuffer = queue.shift();

if (typeof multiChannelArrayBuffer === 'function') {
// If the queued item is a function, the caller has called "flush".
// The "flush" function resolves when all buffers queued before the "flush" call have played.
multiChannelArrayBuffer();
} else if (multiChannelArrayBuffer) {
const nextBuffer = freeBuffers.shift();

// If all buffers are currently occupied, prepend the data back to the queue.
// When one of the buffers finishes, it will call playNext() again to pick up items from the queue.
if (!nextBuffer) {
queue.unshift(multiChannelArrayBuffer);

return;
}

zeroBuffer(nextBuffer);
copyBuffer(nextBuffer, multiChannelArrayBuffer);

bufferSource.buffer = nextBuffer;
bufferSource.connect(audioContext.destination);
bufferSource.start(nextSchedule);

// Remember all BufferSources currently queued at the AudioContext, via bufferSource.start().
// This allows cancelAll() to effectively cancel all BufferSources queued at the AudioContext.
queuedBufferSources.push(bufferSource);

nextSchedule += nextBuffer.duration;

bufferSource.addEventListener('ended', () => {
queuedBufferSources = queuedBufferSources.filter(target => target !== bufferSource);

// Mark this buffer as free to pick up on the next round.
freeBuffers.push(nextBuffer);
playNext();
});
}
};

return {
cancelAll: () => {
queue.splice(0);

// Although the queue is cleared, there are still BufferSources queued at the AudioContext that need to be stopped.
queuedBufferSources.forEach(bufferSource => bufferSource.stop());
},
flush: () => new Promise(resolve => queue.push(resolve)),
push: multiChannelArrayBuffer => {
queue.push(multiChannelArrayBuffer);

playNext();
}
};
}
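The free-list rotation at the heart of this player can be sketched standalone. The names below are illustrative assumptions; plain objects stand in for AudioBuffers, and an explicit `endOldest()` call stands in for the BufferSource "ended" event.

```javascript
// Minimal, synchronous sketch of the triple-buffer rotation described above.
const NUM_BUFFER = 3;

function createTripleBufferQueue() {
  const freeBuffers = Array.from({ length: NUM_BUFFER }, (_, id) => ({ id }));
  const queue = [];
  const playing = [];

  const playNext = () => {
    const chunk = queue.shift();

    if (chunk === undefined) return;

    const buffer = freeBuffers.shift();

    if (!buffer) {
      // All buffers occupied: put the chunk back until one frees up.
      queue.unshift(chunk);
      return;
    }

    playing.push({ buffer, chunk });
  };

  return {
    push: chunk => {
      queue.push(chunk);
      playNext();
    },
    // Simulates the "ended" event of the oldest playing BufferSource.
    endOldest: () => {
      const { buffer } = playing.shift();
      freeBuffers.push(buffer);
      playNext();
    },
    playing
  };
}

const q = createTripleBufferQueue();

[0, 1, 2, 3, 4].forEach(chunk => q.push(chunk));

// Only 3 chunks play immediately; chunks 3 and 4 wait for a free buffer.
console.log(q.playing.map(({ chunk }) => chunk)); // [0, 1, 2]

q.endOldest();
console.log(q.playing.map(({ chunk }) => chunk)); // [1, 2, 3]
```

Chunks beyond the third are not dropped; they sit in the queue until a buffer returns to the free list, mirroring how playNext() re-enters from the "ended" listener in the real player.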
134 changes: 99 additions & 35 deletions packages/directlinespeech/src/playCognitiveServicesStream.js
@@ -1,27 +1,29 @@
/* eslint no-magic-numbers: ["error", { "ignore": [0, 1, 8, 16, 32, 128, 1000, 32768, 96000, 2147483648] }] */
/* eslint no-await-in-loop: "off" */
/* eslint prefer-destructuring: "off" */

import cognitiveServicesPromiseToESPromise from './cognitiveServicesPromiseToESPromise';
import createDeferred from 'p-defer';
import createMultiBufferingPlayer from './createMultiBufferingPlayer';

// Safari requires an audio buffer with a sample rate of at least 22050 Hz.
// We use a minimum of 44100 Hz; the Speech SDK's default 16000 Hz samples will be upsampled 3x to 48000 Hz.
const MIN_SAMPLE_RATE = 44100;

// We assume the Speech SDK chops packets at 4096 bytes; this size is hardcoded in the Speech SDK.
// We will set up our multi-buffering player with 3 buffers, each of 4096 bytes (2048 16-bit samples).
// For simplicity, our multi-buffering player currently does not support progressive buffering.

// Progressive buffering means buffers of any sample size could be queued and would be concatenated.
// For example, queueing 1000 samples and then 1048 samples would concatenate them into a single buffer of 2048 samples.

// Currently, for simplicity, we queue them as two buffers:
// the first is 1000 samples followed by 1048 zeroes, the second is 1048 samples followed by 1000 zeroes.

// There is no plan to support progressive buffering unless the Speech SDK chops at dynamic sizes.
const DEFAULT_BUFFER_SIZE = 4096;

function average(array) {
return array.reduce((sum, value) => sum + value, 0) / array.length;
}

function formatTypedBitArrayToFloatArray(audioData, maxValue) {
@@ -56,6 +58,49 @@ function abortToReject(signal) {
});
}

// In 2-channel audio (A/B), the data comes interleaved, like "ABABABABAB".
// This function takes "ABABABABAB" and returns an array ["AAAAA", "BBBBB"].
function deinterleave(channelInterleavedAudioData, { channels }) {
const multiChannelArrayBuffer = new Array(channels);
const frameSize = channelInterleavedAudioData.length / channels;

for (let channel = 0; channel < channels; channel++) {
const audioData = new Float32Array(frameSize);

multiChannelArrayBuffer[channel] = audioData;

for (let offset = 0; offset < frameSize; offset++) {
audioData[offset] = channelInterleavedAudioData[offset * channels + channel];
}
}

return multiChannelArrayBuffer;
}
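As a quick check, the deinterleave step can be run standalone; the function body below mirrors the one above.

```javascript
// Deinterleave "ABABAB" channel-interleaved samples into per-channel arrays.
function deinterleave(channelInterleavedAudioData, { channels }) {
  const multiChannelArrayBuffer = new Array(channels);
  const frameSize = channelInterleavedAudioData.length / channels;

  for (let channel = 0; channel < channels; channel++) {
    const audioData = new Float32Array(frameSize);

    multiChannelArrayBuffer[channel] = audioData;

    for (let offset = 0; offset < frameSize; offset++) {
      audioData[offset] = channelInterleavedAudioData[offset * channels + channel];
    }
  }

  return multiChannelArrayBuffer;
}

// 2-channel data with A = 1, 2, 3 and B = 10, 20, 30 arrives as [1, 10, 2, 20, 3, 30].
const [left, right] = deinterleave(new Float32Array([1, 10, 2, 20, 3, 30]), { channels: 2 });

console.log(Array.from(left)); // [1, 2, 3]
console.log(Array.from(right)); // [10, 20, 30]
```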

// This function upsamples the audio data by an integer multiplier.
// We implement simple anti-aliasing; for simplicity, the anti-aliasing does not roll over to the next buffer.
function multiplySampleRate(source, sampleRateMultiplier) {
if (sampleRateMultiplier === 1) {
return source;
}

const lastValues = new Array(sampleRateMultiplier).fill(source[0]);
const target = new Float32Array(source.length * sampleRateMultiplier);

for (let sourceOffset = 0; sourceOffset < source.length; sourceOffset++) {
const value = source[sourceOffset];
const targetOffset = sourceOffset * sampleRateMultiplier;

for (let multiplierIndex = 0; multiplierIndex < sampleRateMultiplier; multiplierIndex++) {
lastValues.shift();
lastValues.push(value);
target[targetOffset + multiplierIndex] = average(lastValues);
}
}

return target;
}
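The upsampler can likewise be exercised standalone; the functions below mirror multiplySampleRate and average above.

```javascript
// Moving-average helper used by the upsampler.
function average(array) {
  return array.reduce((sum, value) => sum + value, 0) / array.length;
}

// Upsample by an integer multiplier with a simple moving-average anti-alias.
function multiplySampleRate(source, sampleRateMultiplier) {
  if (sampleRateMultiplier === 1) {
    return source;
  }

  const lastValues = new Array(sampleRateMultiplier).fill(source[0]);
  const target = new Float32Array(source.length * sampleRateMultiplier);

  for (let sourceOffset = 0; sourceOffset < source.length; sourceOffset++) {
    const value = source[sourceOffset];
    const targetOffset = sourceOffset * sampleRateMultiplier;

    for (let multiplierIndex = 0; multiplierIndex < sampleRateMultiplier; multiplierIndex++) {
      // Slide the window forward and emit its average, smoothing each step edge.
      lastValues.shift();
      lastValues.push(value);
      target[targetOffset + multiplierIndex] = average(lastValues);
    }
  }

  return target;
}

// Doubling [0, 1]: the 0-to-1 transition is smoothed through 0.5 by the moving average.
console.log(Array.from(multiplySampleRate(new Float32Array([0, 1]), 2))); // [0, 0, 0.5, 1]
```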

export default async function playCognitiveServicesStream(
audioContext,
audioFormat,
@@ -66,7 +111,6 @@

try {
const abortPromise = abortToReject(signal);
let lastBufferSource;

const read = () =>
Promise.race([
@@ -79,43 +123,63 @@
throw new Error('aborted');
}

let newSamplesPerSec = audioFormat.samplesPerSec;
let sampleRateMultiplier = 1;

// Safari requires a minimum sample rate of 22050 Hz.
// We calculate a multiplier so the stream meets the minimum sample rate.
// We prefer an integer multiplier to simplify our upsampler.
// For safety, we only upsample up to 96000 Hz.
while (newSamplesPerSec < MIN_SAMPLE_RATE && newSamplesPerSec < 96000) {
sampleRateMultiplier++;
newSamplesPerSec = audioFormat.samplesPerSec * sampleRateMultiplier;
}

// The third parameter is the number of samples per buffer.
// For example, the Speech SDK sends us 4096 bytes of 16-bit samples, that is, 2048 samples per channel.
// The multi-buffering player will be set up to handle 2048 samples per buffer.
// With a 3x multiplier, it will handle 6144 samples per buffer.
const player = createMultiBufferingPlayer(
audioContext,
{ ...audioFormat, samplesPerSec: newSamplesPerSec },
(DEFAULT_BUFFER_SIZE / (audioFormat.bitsPerSample / 8)) * sampleRateMultiplier
);

// For safety, we only handle up to 1000 chunks.
for (
let chunk = await read(), maxChunks = 0;
!chunk.isEnd && maxChunks < 1000 && !signal.aborted;
chunk = await read(), maxChunks++
) {
if (signal.aborted) {
break;
}

// Data received from the Speech SDK is interleaved: 2 channels (A/B) arrive as "ABABABABAB".
// Each sample (A/B) is an 8- to 32-bit number.

// First, we convert each 8- to 32-bit number into a floating-point number, as required by the Web Audio API.
const interleavedArrayBuffer = formatAudioDataArrayBufferToFloatArray(audioFormat, chunk.buffer);

// Then, we deinterleave them into per-channel array buffers, as "AAAAA" and "BBBBB".
const multiChannelArrayBuffer = deinterleave(interleavedArrayBuffer, audioFormat);

// Lastly, if needed, we upsample them. With a 2x multiplier, "AAAAA" becomes "AAAAAAAAAA" (with anti-aliasing).
const upsampledMultiChannelArrayBuffer = multiChannelArrayBuffer.map(arrayBuffer =>
multiplySampleRate(arrayBuffer, sampleRateMultiplier)
);

// Queue it to the buffering player.
player.push(upsampledMultiChannelArrayBuffer);
}

abortPromise.catch(() => player.cancelAll());

if (signal.aborted) {
throw new Error('aborted');
}

await Promise.race([abortPromise, player.flush()]);
} finally {
queuedBufferSourceNodes.forEach(node => node.stop());
}