Added TextToSpeechManager to support dynamic speech (request 2) #38

Closed · wants to merge 3 commits

Changes from 2 commits
191 changes: 191 additions & 0 deletions Assets/HoloToolkit/Speech/Scripts/TextToSpeechManager.cs
@@ -0,0 +1,191 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in the project root for license information.
using System;
using UnityEngine;

#if WINDOWS_UWP
using Windows.Foundation;
using Windows.Media.SpeechSynthesis;
using Windows.Storage.Streams;
using System.Threading.Tasks;
#endif

namespace HoloToolkit.Unity
{
    /// <summary>
    /// Enables text to speech using the Windows 10 <see cref="SpeechSynthesizer"/> class.
    /// </summary>
    /// <remarks>
    /// <see cref="SpeechSynthesizer"/> generates speech as a <see cref="SpeechSynthesisStream"/>.
    /// This class converts that stream into a Unity <see cref="AudioClip"/> and plays the clip using
    /// the <see cref="AudioSource"/> you supply in the inspector. This allows you to position the
    /// voice as desired in 3D space. One recommended approach is to place the AudioSource on an
    /// empty GameObject that is a child of Main Camera, positioned approximately 0.6 units above
    /// the camera. This placement will sound similar to Cortana's speech in the OS.
    /// </remarks>
    public class TextToSpeechManager : MonoBehaviour
    {
        // Inspector Variables
        [Tooltip("The audio source where speech will be played.")]
        public AudioSource audioSource;

        // Member Variables
#if WINDOWS_UWP
        private SpeechSynthesizer synthesizer;
#endif

        // Internal Methods

        /// <summary>
        /// Logs speech text that normally would have been played.
        /// </summary>
        /// <param name="text">
        /// The speech text.
        /// </param>
        private void LogSpeech(string text)
        {
            Debug.LogFormat("Speech not supported in editor. \"{0}\"", text);
        }

#if WINDOWS_UWP
        /// <summary>
        /// Executes a function that generates a speech stream, then converts and plays the stream in Unity.
        /// </summary>
        /// <param name="text">
        /// A raw text version of what's being spoken, for use in debug messages when speech isn't supported.
        /// </param>
        /// <param name="speakFunc">
        /// The function that will be executed to generate speech.
        /// </param>
        private void PlaySpeech(string text, Func<IAsyncOperation<SpeechSynthesisStream>> speakFunc)
        {
            // Make sure there's something to speak
            if (speakFunc == null) throw new ArgumentNullException(nameof(speakFunc));

            if (synthesizer != null)
            {
                try
                {
                    // The awaits below would otherwise block Unity's main thread, so most of this
                    // method runs as a Task on its own thread. That frees Unity to keep running.
                    Task.Run(async () =>
                    {
                        // Speak and get the resulting stream
                        var speechStream = await speakFunc();

                        // Get the size of the original stream
                        var size = speechStream.Size;

                        // Create a buffer large enough to hold the whole stream
                        byte[] buffer = new byte[(int)size];

                        // Get an input stream positioned at the start of the speech stream
                        using (var inputStream = speechStream.GetInputStreamAt(0))
                        {
                            // Close the original speech stream to free up memory
                            speechStream.Dispose();

                            // Create a new data reader off the input stream
                            using (var dataReader = new DataReader(inputStream))
                            {
                                // Load all bytes into the reader
                                await dataReader.LoadAsync((uint)size);

                                // Copy from reader into buffer
                                dataReader.ReadBytes(buffer);
                            }
                        }

                        // Load buffer as a WAV file
                        var wav = new Wav(buffer);

                        // The remainder must be done back on Unity's main thread
                        UnityEngine.WSA.Application.InvokeOnAppThread(() =>
                        {
                            // Convert to an audio clip
                            var clip = wav.ToClip("Speech");

                            // Set the clip on the audio source
                            audioSource.clip = clip;

                            // Play audio
                            audioSource.Play();
                        }, false);
                    });
                }
                catch (Exception ex)
                {
                    Debug.LogErrorFormat("Speech generation problem: \"{0}\"", ex.Message);
                }
            }
            else
            {
                Debug.LogErrorFormat("Speech not initialized. \"{0}\"", text);
            }
        }
#endif

        private void StartSpeech()
        {
            try
            {
                if (audioSource == null) { throw new InvalidOperationException("An AudioSource is required and should be assigned to 'Audio Source' in the inspector."); }
Member: Are you trying to throw this out of the method or just use this as a means to log the error in your catch below?

#if WINDOWS_UWP
                synthesizer = new SpeechSynthesizer();
#endif
            }
            catch (Exception ex)
            {
                Debug.LogError("Could not start Speech Synthesis");
                Debug.LogException(ex);
            }
        }

        // MonoBehaviour Methods
        void Start()
        {
            // Start speech
            StartSpeech();
        }

        // Public Methods

        /// <summary>
        /// Speaks the specified SSML markup using text-to-speech.
        /// </summary>
        /// <param name="ssml">
        /// The SSML markup to speak.
        /// </param>
        public void SpeakSsml(string ssml)
        {
            // Make sure there's something to speak
            if (string.IsNullOrEmpty(ssml)) { return; }

            // Pass to helper method
#if WINDOWS_UWP
            PlaySpeech(ssml, () => synthesizer.SynthesizeSsmlToStreamAsync(ssml));
#else
            LogSpeech(ssml);
#endif
        }

        /// <summary>
        /// Speaks the specified text using text-to-speech.
        /// </summary>
        /// <param name="text">
        /// The text to speak.
        /// </param>
        public void SpeakText(string text)
        {
            // Make sure there's something to speak
            if (string.IsNullOrEmpty(text)) { return; }

            // Pass to helper method
#if WINDOWS_UWP
            PlaySpeech(text, () => synthesizer.SynthesizeTextToStreamAsync(text));
#else
            LogSpeech(text);
#endif
        }
    }
}
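
For context, here is a minimal sketch of how a caller might drive this component. The SpeechTester class, the field wiring, and the spoken strings are illustrative only and not part of this PR; the placement noted in the comment mirrors the class remarks above.

using UnityEngine;
using HoloToolkit.Unity;

// Hypothetical caller, for illustration only: exercises TextToSpeechManager from another component.
public class SpeechTester : MonoBehaviour
{
    // Assign in the inspector. Per the class remarks, the manager's AudioSource works well on an
    // empty GameObject that is a child of Main Camera, placed about 0.6 units above it, e.g.
    //   audioObject.transform.localPosition = new Vector3(0f, 0.6f, 0f);
    public TextToSpeechManager textToSpeech;

    void Start()
    {
        // Plain text: runs through SynthesizeTextToStreamAsync on device, or logs in the editor.
        textToSpeech.SpeakText("The text to speech manager is running.");
    }

    void Update()
    {
        // SSML gives finer control over the voice; the markup below is illustrative.
        if (Input.GetKeyDown(KeyCode.S))
        {
            textToSpeech.SpeakSsml(
                "<speak version=\"1.0\" xmlns=\"http://www.w3.org/2001/10/synthesis\" xml:lang=\"en-US\">" +
                "Hello <break time=\"300ms\"/> world." +
                "</speak>");
        }
    }
}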
12 changes: 12 additions & 0 deletions Assets/HoloToolkit/Speech/Scripts/TextToSpeechManager.cs.meta

Some generated files are not rendered by default.

152 changes: 152 additions & 0 deletions Assets/HoloToolkit/Speech/Scripts/Wav.cs
@@ -0,0 +1,152 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in the project root for license information.

using UnityEngine;

namespace HoloToolkit.Unity
{
    /// <summary>
    /// Supports dynamic loading of WAV data in memory.
    /// </summary>
    /// <remarks>
    /// This class is partially based on the excellent
    /// <see href="http://answers.unity3d.com/questions/737002/wav-byte-to-audioclip.html">sample by Jeff Kesselman</see>
    /// on the Unity forums.
    /// </remarks>
    public class Wav
    {
        #region Static Version
Member: Are we using regions in the Toolkit?

        // Convert two bytes to one float in the range -1 to 1
        static private float bytesToFloat(byte firstByte, byte secondByte)
Member: Should this and bytesToInt be Pascal case instead of Camel?

        {
            // Convert two bytes to one short (little endian)
            short s = (short)((secondByte << 8) | firstByte);

            // Convert to range from -1 to (just below) 1
            return s / 32768.0F;
        }

        static private int bytesToInt(byte[] bytes, int offset = 0)
        {
            int value = 0;
            for (int i = 0; i < 4; i++)
            {
                value |= ((int)bytes[offset + i]) << (i * 8);
            }
            return value;
        }
        #endregion // Static Version

        #region Instance Version
        #region Constructors
        /// <summary>
        /// Initializes a new <see cref="Wav"/> instance.
        /// </summary>
        /// <param name="wav">
        /// The raw WAV byte data.
        /// </param>
        public Wav(byte[] wav)
        {
            // Determine if mono or stereo
            ChannelCount = wav[22]; // Ignore byte 23, as 99.999% of WAVs are 1 or 2 channels

            // Get the frequency
            Frequency = bytesToInt(wav, 24);

            // Skip past all the other sub chunks to get to the data subchunk
            int pos = 12; // First subchunk ID runs from byte 12 to 16

            // Keep iterating until we find the 'data' chunk (bytes 0x64 0x61 0x74 0x61, i.e. 100 97 116 97 in decimal)
            while (!(wav[pos] == 100 && wav[pos + 1] == 97 && wav[pos + 2] == 116 && wav[pos + 3] == 97))
            {
                pos += 4;
                int chunkSize = wav[pos] + wav[pos + 1] * 256 + wav[pos + 2] * 65536 + wav[pos + 3] * 16777216;
                pos += 4 + chunkSize;
            }
            pos += 8;

            // Pos is now positioned at the start of the actual sound data.
            SampleCount = (wav.Length - pos) / 2; // 2 bytes per sample (16 bit mono)
            if (ChannelCount == 2) SampleCount /= 2; // 4 bytes per sample (16 bit stereo)

            // Allocate memory (RightChannel will be null for mono sound)
            LeftChannel = new float[SampleCount];
            if (ChannelCount == 2) RightChannel = new float[SampleCount];
            else RightChannel = null;

            // Write to the float array(s)
            int i = 0;
            while (pos < wav.Length)
            {
                LeftChannel[i] = bytesToFloat(wav[pos], wav[pos + 1]);
                pos += 2;
                if (ChannelCount == 2)
                {
                    RightChannel[i] = bytesToFloat(wav[pos], wav[pos + 1]);
                    pos += 2;
                }
                i++;
            }
        }
        #endregion // Constructors

        #region Overrides / Event Handlers
        /// <inheritdoc/>
        public override string ToString()
        {
            return string.Format("[WAV: LeftChannel={0}, RightChannel={1}, ChannelCount={2}, SampleCount={3}, Frequency={4}]", LeftChannel, RightChannel, ChannelCount, SampleCount, Frequency);
        }
        #endregion // Overrides / Event Handlers

        #region Public Methods
        /// <summary>
        /// Dynamically creates an <see cref="AudioClip"/> that represents the WAV file.
        /// </summary>
        /// <param name="name">
        /// The name of the dynamically generated clip.
        /// </param>
        /// <returns>
        /// The <see cref="AudioClip"/>.
        /// </returns>
        public AudioClip ToClip(string name)
        {
            // Create the audio clip
            var clip = AudioClip.Create(name, SampleCount, 1, Frequency, false); // TODO: Support stereo
Member: Out of curiosity, what does supporting stereo look like? Merging the two channel data buffers you're building above somehow?

Contributor Author: Unity requires the audio to be interleaved (I believe every 4 samples, if I remember correctly). You would set the 3rd parameter in the Create method above to 2 and interleave the audio samples L R L R etc.

For TextToSpeech and for spatial audio there's not much reason to do this, because TTS is usually mono and Unity cannot do spatial placement with stereo audio clips. I have the TODO in there in case we wanted to make the Wav class a general purpose class. But that may very well be overkill for the toolkit.

Member: Ah, that makes more sense. Maybe in this case it'd be better, then, to simplify this code, dropping the unnecessary conversion and storage of two channels, and put it directly in the TextToSpeechManager? Something called Wav.cs seems like it should be fleshed out and generally usable.

There's another comment in the code that justifies its assumption of only 1 or 2 channels with a statement about the likelihood it'd have more (line 51). I have issues with that too, as the code looks like it will create the wrong sample list in those cases; better to detect and throw an exception or something.
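
To make the interleaving concrete, here is a minimal sketch of a stereo version of ToClip. It is not part of this PR; the ToStereoClip name is illustrative, and it assumes the constructor has populated both LeftChannel and RightChannel.

        // Hypothetical stereo variant of ToClip, for illustration only (not in this PR).
        public AudioClip ToStereoClip(string name)
        {
            // Unity expects multi-channel data interleaved per sample: L R L R ...
            var interleaved = new float[SampleCount * 2];
            for (int s = 0; s < SampleCount; s++)
            {
                interleaved[s * 2] = LeftChannel[s];
                interleaved[s * 2 + 1] = RightChannel[s];
            }

            // SampleCount is per channel; the third parameter becomes 2 for stereo.
            var clip = AudioClip.Create(name, SampleCount, 2, Frequency, false);
            clip.SetData(interleaved, 0);
            return clip;
        }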


            // Set the data
            clip.SetData(LeftChannel, 0);

            // Done
            return clip;
        }
        #endregion // Public Methods

        #region Public Properties
        /// <summary>
        /// Gets the number of audio channels.
        /// </summary>
        public int ChannelCount { get; internal set; }

        /// <summary>
        /// Gets the frequency of the audio data.
        /// </summary>
        public int Frequency { get; internal set; }

        /// <summary>
        /// Gets the left channel audio data.
        /// </summary>
        public float[] LeftChannel { get; internal set; }

        /// <summary>
        /// Gets the right channel audio data.
        /// </summary>
        public float[] RightChannel { get; internal set; }

        /// <summary>
        /// Gets the number of samples.
        /// </summary>
        public int SampleCount { get; internal set; }
        #endregion // Public Properties
        #endregion // Instance Version
    }
}
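
Picking up the reviewer's last point, here is a minimal sketch of a guard the Wav constructor could use to reject unexpected channel counts rather than mis-parsing them. This illustrates the suggestion and is not code from the PR.

            // Hypothetical guard for the top of the Wav constructor, illustrating the reviewer's
            // suggestion. Bytes 22 and 23 hold the little-endian 16-bit channel count.
            int channelCount = wav[22] | (wav[23] << 8);
            if (channelCount < 1 || channelCount > 2)
            {
                throw new System.NotSupportedException(
                    string.Format("WAV data has {0} channels; only mono and stereo are supported.", channelCount));
            }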
12 changes: 12 additions & 0 deletions Assets/HoloToolkit/Speech/Scripts/Wav.cs.meta

Some generated files are not rendered by default.
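
Finally, in the spirit of the review discussion about keeping Wav generally usable, a short standalone sketch. The WavPlayer class, the file path, and the wiring are illustrative assumptions, not part of this PR.

using System.IO;
using UnityEngine;
using HoloToolkit.Unity;

// Hypothetical standalone use of Wav, for illustration: load raw WAV bytes and play them.
public class WavPlayer : MonoBehaviour
{
    public AudioSource audioSource;

    void Start()
    {
        // Read an entire 16-bit PCM WAV file into memory (illustrative path).
        byte[] bytes = File.ReadAllBytes(Path.Combine(Application.streamingAssetsPath, "test.wav"));

        // Parse the header and sample data, then convert to a clip and play it.
        var wav = new Wav(bytes);
        audioSource.clip = wav.ToClip("TestWav");
        audioSource.Play();
    }
}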