Microsoft Windows Text-to-Speech for Unity

Chad Weisshaar — Thu, 02 Jul 2015 18:42:55 +0000

*EDIT 7/15/2019* I’m updating this post with my latest code and removing the dependency on UniExtensions.

I’ve made a wrapper around the Windows-only Microsoft Speech API for use in Unity. The Microsoft Speech API is a Windows COM capability that first appeared in Windows Vista. It is an easy way to get text-to-speech in a Windows application.

This post will go through the steps of making the C++ DLL and the C# behavior for Unity. If you don’t care about the “how” and just want to use the plugin, skip to the end for the download and usage instructions.

The goal of my wrapper is to start up the text-to-speech engine when the game loads, and to provide a function that speaks a string of text. I’ll need this to run in its own thread so that it doesn’t disrupt the main Unity thread.

Native C++

The first step is to write the C++ code that will initialize COM, manage a speech queue and call the Speech API.

Declare functions

I’m going to export five functions in the C++ DLL. These functions will be available to the C# code.

// DLL_API marks a symbol as exported when DLL_EXPORTS is defined and as
// imported otherwise.
#if defined(DLL_EXPORTS)
#  define DLL_API __declspec(dllexport)
#else
#  define DLL_API __declspec(dllimport)
#endif

#include <atomic>
#include <list>
#include <mutex>
#include <string>
#include <thread>

namespace WindowsVoice {
  // Entry points exported from the DLL with C linkage and the __cdecl
  // calling convention.
  extern "C" {
    /// Starts the speech worker thread if it is not already running.
    DLL_API void __cdecl initSpeech();
    /// Queues a copy of `text` to be spoken; a null pointer is ignored.
    DLL_API void __cdecl addToSpeechQueue(const char* text);
    /// Discards everything still waiting in the speech queue.
    DLL_API void __cdecl clearSpeechQueue();
    /// Signals the worker thread to stop and waits for it to finish.
    DLL_API void __cdecl destroySpeech();
    /// Copies the current status text into `msg`, a buffer of `msgLen` bytes.
    DLL_API void __cdecl statusMessage(char* msg, int msgLen);
  }

  // State shared between the exported functions and the speech thread.
  // NOTE(review): these are definitions, not declarations, so including this
  // header from more than one translation unit would define them twice.

  // Held while theSpeechQueue is modified.
  std::mutex theMutex;
  // Pending utterances; each entry is a buffer allocated with new[].
  std::list<wchar_t*> theSpeechQueue;
  // The worker thread, or nullptr when speech is not running.
  std::thread* theSpeechThread = nullptr;
  // Set by destroySpeech() and polled by the worker thread, hence atomic.
  std::atomic<bool> shouldTerminate{false};

  // Human-readable state reported through statusMessage().
  std::wstring theStatusMessage;
}
Init
To start up speech, I need to create a thread for speech to run in, initialize COM and create the Microsoft Voice resource. When it is time to shut down, I need to send a signal to the thread to free the Voice resource, then shut down COM.
#include "pch.h"
#include "WindowsVoice.h"
#include <sapi.h>

namespace WindowsVoice {

  void speechThreadFunc()
  {
    ISpVoice * pVoice = NULL;

    if (FAILED(::CoInitializeEx(NULL, COINITBASE_MULTITHREADED)))
    {
      theStatusMessage = L"Failed to initialize COM for Voice.";
      return;
    }

    HRESULT hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void **)&amp;amp;amp;amp;amp;amp;amp;amp;amp;pVoice);
    if (!SUCCEEDED(hr))
    {
      LPSTR pText = 0;

      ::FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
        NULL, hr, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), pText, 0, NULL);
      LocalFree(pText);
      theStatusMessage = L"Failed to create Voice instance.";
      return;
    }
    theStatusMessage = L"Speech ready.";

    SPVOICESTATUS voiceStatus;
    wchar_t* priorText = nullptr;
    while (!shouldTerminate)
    {
      pVoice-&amp;amp;amp;amp;amp;amp;amp;amp;gt;GetStatus(&amp;amp;amp;amp;amp;amp;amp;amp;amp;voiceStatus, NULL);
      if (voiceStatus.dwRunningState == SPRS_IS_SPEAKING)
      {
        if (priorText == nullptr)
          theStatusMessage = L"Error: SPRS_IS_SPEAKING but text is NULL";
        else
        {
          theStatusMessage = L"Speaking: ";
          theStatusMessage.append(priorText);
          if (!theSpeechQueue.empty())
          {
            theMutex.lock();
            if (lstrcmpW(theSpeechQueue.front(), priorText) == 0)
            {
              delete[] theSpeechQueue.front();
              theSpeechQueue.pop_front();
            }
            theMutex.unlock();
          }
        }
      }
      else
      {
        theStatusMessage = L"Waiting.";
        if (priorText != NULL)
        {
          delete[] priorText;
          priorText = NULL;
        }
        if (!theSpeechQueue.empty())
        {
          theMutex.lock();
          priorText = theSpeechQueue.front();
          theSpeechQueue.pop_front();
          theMutex.unlock();
          pVoice-&amp;amp;amp;amp;amp;amp;amp;amp;gt;Speak(priorText, SPF_IS_XML | SPF_ASYNC, NULL);
        }
      }
      Sleep(50);
    }
    pVoice-&amp;amp;amp;amp;amp;amp;amp;amp;gt;Pause();
    pVoice-&amp;amp;amp;amp;amp;amp;amp;amp;gt;Release();

    theStatusMessage = L"Speech thread terminated.";
  }

  /// Converts `text` from UTF-8 to UTF-16 and appends the copy to the speech
  /// queue.
  ///
  /// @param text  NUL-terminated string, interpreted as UTF-8. A null pointer
  ///              is ignored.
  ///
  /// The queued buffer is allocated with new[]; whoever removes it from the
  /// queue is responsible for delete[]-ing it. Text that cannot be converted
  /// is dropped.
  void addToSpeechQueue(const char* text)
  {
    if (text == nullptr)
      return;

    // First call: ask how many wide characters are needed, terminator included.
    const int wideLen = ::MultiByteToWideChar(CP_UTF8, 0, text, -1, nullptr, 0);
    if (wideLen <= 0)
      return;

    // Value-initialized so the buffer is fully zeroed.
    wchar_t* wText = new wchar_t[wideLen]();
    if (::MultiByteToWideChar(CP_UTF8, 0, text, -1, wText, wideLen) == 0)
    {
      delete[] wText;
      return;
    }

    std::lock_guard<std::mutex> lock(theMutex);
    theSpeechQueue.push_back(wText);
  }
  /// Discards every utterance still waiting in the speech queue.
  ///
  /// The queue stores raw pointers to new[]-allocated buffers, so each buffer
  /// is released before the list itself is emptied.
  void clearSpeechQueue()
  {
    std::lock_guard<std::mutex> lock(theMutex);
    for (wchar_t* pending : theSpeechQueue)
      delete[] pending;
    theSpeechQueue.clear();
  }
  void initSpeech()
  {
    shouldTerminate = false;
    if (theSpeechThread != nullptr)
    {
      theStatusMessage = L"Windows Voice thread already started.";
      return;
    }
    theStatusMessage = L"Starting Windows Voice.";
    theSpeechThread = new std::thread(WindowsVoice::speechThreadFunc);
  }
  void destroySpeech()
  {
    if (theSpeechThread == nullptr)
    {
      theStatusMessage = L"Speach thread already destroyed or not started.";
      return;
    }
    theStatusMessage = L"Destroying speech.";
    shouldTerminate = true;
    theSpeechThread-&amp;amp;amp;amp;amp;amp;amp;amp;gt;join();
    theSpeechQueue.clear();
    delete theSpeechThread;
    theSpeechThread = nullptr;
    CoUninitialize();
    theStatusMessage = L"Speech destroyed.";
  }
  void statusMessage(char* msg, int msgLen)
  {
    size_t count;
    wcstombs_s(&amp;amp;amp;amp;amp;amp;amp;amp;amp;count, msg, msgLen, theStatusMessage.c_str(), msgLen);
  }
}


// DLL entry point. No work is done for any attach or detach notification;
// the function always reports success.
BOOL APIENTRY DllMain(HMODULE, DWORD ul_reason_for_call, LPVOID)
{
  (void)ul_reason_for_call;  // every notification is handled the same way
  return TRUE;
}

Since addToSpeechQueue will be executing in the main thread and modifies the shared speech queue, I need to use a mutex to lock the shared queue. The memory that I allocate in this function will need to be freed by the speech thread after the text has been spoken.
Speech
The while loop in the speechThreadFunc monitors the speech queue and calls Microsoft Speech to do the real work. I also don’t want to repeat the same text, because I often tie speech to button presses and don’t want to queue up a bunch of the same text if the user presses the button multiple times.
For details on building a C++ DLL you can download the Visual Studio files at the bottom of this post.
C#
Now that the C++ DLL is ready, I need to wrap that native code with managed C# code. I’m going to make my C# wrapper a MonoBehaviour so that it is easy to add to a scene.
using UnityEngine;
using System.Text;
using System.Runtime.InteropServices;

// Extension methods must be declared inside a static class.
public static class WindowsVoiceExtensions
{
  // Runs fn on the given behaviour after a delay, using a coroutine started
  // on that behaviour. Returns the Coroutine handle.
  public static Coroutine ExecuteLater(this MonoBehaviour behaviour, float delay, System.Action fn)
  {
    return behaviour.StartCoroutine(_realExecute(delay, fn));
  }

  // Coroutine body for ExecuteLater: wait, then invoke the callback.
  // NOTE(review): the original helper is not shown in the post; this assumes
  // the delay is in seconds of scaled game time - confirm.
  private static System.Collections.IEnumerator _realExecute(float delay, System.Action fn)
  {
    yield return new WaitForSeconds(delay);
    fn();
  }
}

// Unity behaviour wrapping the native WindowsVoice DLL. The first enabled
// instance starts native speech and becomes theVoice; destroying that
// instance shuts native speech down again.
// NOTE(review): the usage instructions say the game object is marked
// DontDestroyOnLoad, but no such call is made here - confirm which is intended.
public class WindowsVoice : MonoBehaviour {
  private const string NativeLibrary = "WindowsVoice";

  // Size of the buffer handed to the native statusMessage function.
  private const int StatusBufferLength = 256;

  // The native functions are declared __cdecl, so say so explicitly.
  [DllImport(NativeLibrary, CallingConvention = CallingConvention.Cdecl)]
  public static extern void initSpeech();
  [DllImport(NativeLibrary, CallingConvention = CallingConvention.Cdecl)]
  public static extern void destroySpeech();
  [DllImport(NativeLibrary, CallingConvention = CallingConvention.Cdecl)]
  public static extern void addToSpeechQueue(string s);
  [DllImport(NativeLibrary, CallingConvention = CallingConvention.Cdecl)]
  public static extern void clearSpeechQueue();
  [DllImport(NativeLibrary, CallingConvention = CallingConvention.Cdecl)]
  public static extern void statusMessage(StringBuilder str, int length);

  // The instance that owns the native speech engine, or null.
  public static WindowsVoice theVoice = null;

  // Claims ownership and starts native speech if no other instance has.
  void OnEnable () {
    if (theVoice == null)
    {
      theVoice = this;
      initSpeech();
    }
  }

  public void test()
  {
    speak("Testing");
  }

  // Queues msg to be spoken, immediately or after `delay`.
  public static void speak(string msg, float delay = 0f) {
    // NOTE(review): Timeline is not defined in this file; presumably it comes
    // from the author's project.
    if (Timeline.theTimeline.QReprocessingEvents)
      return;

    if (delay == 0f)
    {
      addToSpeechQueue(msg);
    }
    else if (theVoice != null)
    {
      theVoice.ExecuteLater(delay, () => speak(msg));
    }
    else
    {
      // A delayed call needs a live behaviour to host the coroutine.
      Debug.LogWarning("WindowsVoice.speak: no active WindowsVoice instance; delayed speech dropped.");
    }
  }

  // Shuts native speech down when the owning instance is destroyed.
  void OnDestroy()
  {
    if (theVoice == this)
    {
      Debug.Log("Destroying speech");
      destroySpeech();
      Debug.Log("Speech destroyed");
      theVoice = null;
    }
  }

  // Returns the native plugin's current status text.
  public static string GetStatusMessage()
  {
    StringBuilder sb = new StringBuilder(StatusBufferLength);
    statusMessage(sb, StatusBufferLength);
    return sb.ToString();
  }
}

IMPORTANT: You must build your game for 64 bit Windows. This plugin will not work for any other configuration.
I’ve named my C++ DLL “WindowsVoice.dll” and I’ve put that DLL and this .cs file in Assets/Plugins.
Using the Plugin
Here is the Visual Studio project and code needed to build the WindowsVoice.dll. You only need to download this if you want to change how the speech queue works or need a 32 bit DLL. WindowsVoiceProject
Here is the finished plugin with a 64 bit DLL and the WindowsVoice.cs behavior. WindowsVoiceBehavior
Instructions
To use the plugin, unzip the behavior file into /Assets/Plugins/WindowsVoice.
Create a game object in your scene to hold the behavior. This game object will be marked DontDestroyOnLoad, so you only need to put this object in your first scene. It is OK to include an extra copy of this in a second scene if you want. Speech will only be initialized once.
Use the static WindowsVoice.speak(string) function to use the text-to-speech capability anywhere in your scripts. No need to worry about init or cleanup since that is tied to the life cycle of the game object.

WindowsVoice – The Industrious Squirrel

Microsoft Windows Text-to-Speech for Unity

Native C++

Declare functions

Init

Speech

C#

Using the Plugin

Instructions