support azure tts stream (#13)

* support azure stream * delete trash file * wip * wip * wip * wip * wip * wip * wip * fire variable_tts_time_to_first_byte_ms
2025-12-19 08:27:44 +00:00 · 2024-03-24 20:18:38 +07:00
parent 74bfc3152f
commit be6758c3a8
6 changed files with 626 additions and 0 deletions
--- a/mod_azure_tts/LICENSE
+++ b/mod_azure_tts/LICENSE
@@ -0,0 +1,8 @@
+Copyright 2023, Drachtio Communications Services, LLC
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
--- a/mod_azure_tts/Makefile.am
+++ b/mod_azure_tts/Makefile.am
@@ -0,0 +1,10 @@
+include $(top_srcdir)/build/modmake.rulesam
+MODNAME=mod_azure_tts
+
+mod_LTLIBRARIES = mod_azure_tts.la
+mod_azure_tts_la_SOURCES  = mod_azure_tts.c azure_glue.cpp
+mod_azure_tts_la_CFLAGS   = $(AM_CFLAGS)
+mod_azure_tts_la_CXXFLAGS = $(AM_CXXFLAGS) -std=c++14 -I/usr/local/include/MicrosoftSpeechSDK/cxx_api -I/usr/local/include/MicrosoftSpeechSDK/c_api
+
+mod_azure_tts_la_LIBADD   = $(switch_builddir)/libfreeswitch.la
+mod_azure_tts_la_LDFLAGS  = -avoid-version -module -no-undefined -L/usr/local/lib/MicrosoftSpeechSDK/x64 -lMicrosoft.CognitiveServices.Speech.core -shared `pkg-config --libs boost` -lstdc++
--- a/mod_azure_tts/azure_glue.cpp
+++ b/mod_azure_tts/azure_glue.cpp
@@ -0,0 +1,380 @@
+#include "mod_azure_tts.h"
+#include <switch.h>
+#include <speechapi_cxx.h>
+
+#include <boost/circular_buffer.hpp>
+
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <string>
+#include <vector>
+
+/* Initial capacity, in 16-bit samples, of each per-request circular buffer; also the
+ * minimum amount by which the buffer is grown when it fills up.
+ * NOTE(review): 8129 looks like a typo for 8192 -- confirm before changing. */
+#define BUFFER_SIZE 8129
+
+/* Holds synthesized PCM samples between the SDK callback and azure_speech_read_tts(). */
+typedef boost::circular_buffer<uint16_t> CircularBuffer_t;
+
+using namespace Microsoft::CognitiveServices::Speech;
+
+/* Directory for audio cache files; assigned in azure_speech_load() and reset to the
+ * empty string when the directory could not be created. */
+static std::string fullDirPath;
+
+extern "C" {
+  /**
+   * Module-load hook: creates the directory used for TTS audio cache files.
+   *
+   * The parent directory comes from the JAMBONZ_TMP_CACHE_FOLDER environment
+   * variable and falls back to "/var/" when the variable is unset or empty.
+   * Failing to create the directory is not fatal: fullDirPath is cleared, which
+   * disables cache-file writing.
+   *
+   * @return SWITCH_STATUS_FALSE when the parent directory is "/",
+   *         SWITCH_STATUS_SUCCESS otherwise.
+   */
+  switch_status_t azure_speech_load() {
+    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "azure_speech_loading..\n");
+
+    /* create temp folder for cache files */
+    const char* baseDir = std::getenv("JAMBONZ_TMP_CACHE_FOLDER");
+    if (!baseDir || baseDir[0] == '\0') {
+      baseDir = "/var/";
+    }
+    if (strcmp(baseDir, "/") == 0) {
+      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "failed to create folder %s\n", baseDir);
+      return SWITCH_STATUS_FALSE;
+    }
+
+    /* make sure exactly one separator sits between the parent and the folder name,
+       so that e.g. "/tmp" does not become "/tmpjambonz-tts-cache-files" */
+    std::string parentDir(baseDir);
+    if (parentDir.back() != '/') {
+      parentDir += '/';
+    }
+    fullDirPath = parentDir + "jambonz-tts-cache-files";
+
+    // Create the directory with read, write, and execute permissions for everyone
+    mode_t oldMask = umask(0);
+    int result = mkdir(fullDirPath.c_str(), S_IRWXU | S_IRWXG | S_IRWXO);
+    umask(oldMask);
+    if (result == 0) {
+      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "created folder %s\n", fullDirPath.c_str());
+    }
+    else if (errno == EEXIST) {
+      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "folder %s already exists\n", fullDirPath.c_str());
+    }
+    else {
+      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "failed to create folder %s\n", fullDirPath.c_str());
+      fullDirPath = "";
+    }
+
+    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "azure_speech_loaded..\n");
+
+    return SWITCH_STATUS_SUCCESS;
+  }
+
+    /* Per-handle open hook; there is nothing to set up here, so it always succeeds. */
+    switch_status_t azure_speech_open(azure_t* azure) {
+    return SWITCH_STATUS_SUCCESS;
+  }
+
+  /**
+   * Start synthesizing `text` with the Azure Speech SDK.
+   *
+   * Audio is requested as raw 8 kHz 16-bit mono PCM. Every chunk delivered to the
+   * Synthesizing callback is appended to a->circularBuffer (and to the cache file,
+   * when one is open) for azure_speech_read_tts() to consume. The first chunk also
+   * fires a PLAYBACK_START event carrying the time-to-first-byte.
+   *
+   * @param a     per-handle state; api_key and language are required
+   * @param text  plain text, or SSML when it begins with "<speak"
+   * @param flags unused
+   * @return SWITCH_STATUS_SUCCESS once the request has been issued,
+   *         SWITCH_STATUS_FALSE on missing settings or when setup fails
+   */
+  switch_status_t azure_speech_feed_tts(azure_t* a, char* text, switch_speech_flag_t *flags) {
+    /* validate inputs first so that a rejected request leaves nothing behind */
+    if (!a->api_key) {
+      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "azure_speech_feed_tts: no api_key provided\n");
+      return SWITCH_STATUS_FALSE;
+    }
+
+    if (!a->language) {
+      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "azure_speech_feed_tts: no language provided\n");
+      return SWITCH_STATUS_FALSE;
+    }
+
+    if (!text) {
+      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "azure_speech_feed_tts: no text provided\n");
+      return SWITCH_STATUS_FALSE;
+    }
+
+    /* forget the outcome of any previous request on this handle */
+    a->response_code = 0;
+
+    /* a resampler left over from an earlier request on this handle would be leaked */
+    if (a->resampler) {
+      speex_resampler_destroy(a->resampler);
+      a->resampler = nullptr;
+    }
+
+    if (a->session_id) {
+      switch_core_session_t *psession = switch_core_session_locate(a->session_id);
+      if (!psession) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "azure_speech_feed_tts: session %s not found\n", a->session_id);
+      }
+      else {
+        int err = 0;
+        switch_codec_implementation_t read_impl;
+        switch_core_session_get_read_impl(psession, &read_impl);
+        uint32_t samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;
+        /* a successful locate holds a read lock on the session; release it */
+        switch_core_session_rwunlock(psession);
+
+        a->samples_rate = samples_per_second;
+        if (samples_per_second != 8000 /*Hz*/) {
+          a->resampler = speex_resampler_init(1, 8000, samples_per_second, SWITCH_RESAMPLE_QUALITY, &err);
+          if (0 != err) {
+            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing resampler: %s.\n", speex_resampler_strerror(err));
+            return SWITCH_STATUS_FALSE;
+          }
+        }
+      }
+    }
+
+    /* open cache file */
+    if (a->cache_audio && fullDirPath.length() > 0) {
+      switch_uuid_t uuid;
+      char uuid_str[SWITCH_UUID_FORMATTED_LENGTH + 1];
+      char outfile[512] = "";
+      int fd;
+
+      switch_uuid_get(&uuid);
+      switch_uuid_format(uuid_str, &uuid);
+
+      switch_snprintf(outfile, sizeof(outfile), "%s%s%s.r8", fullDirPath.c_str(), SWITCH_PATH_SEPARATOR, uuid_str);
+      a->cache_filename = strdup(outfile);
+      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "writing audio cache file to %s\n", a->cache_filename);
+
+      mode_t oldMask = umask(0);
+      fd = open(outfile, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+      umask(oldMask);
+      if (fd == -1 ) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error opening cache file %s: %s\n", outfile, strerror(errno));
+      }
+      else {
+        a->file = fdopen(fd, "wb");
+        if (!a->file) {
+          close(fd);
+          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error opening cache file %s: %s\n", outfile, strerror(errno));
+        }
+      }
+    }
+
+    /* the callback captures the start time by value; the heap copy is kept in the
+       struct because azure_speech_flush_tts() deletes it */
+    const auto startTime = std::chrono::high_resolution_clock::now();
+    a->startTime = new std::chrono::time_point<std::chrono::high_resolution_clock>(startTime);
+
+    a->circularBuffer = (void *) new CircularBuffer_t(BUFFER_SIZE);
+
+    try {
+      auto speechConfig = nullptr != a->endpoint ?
+        (nullptr != a->api_key ?
+          SpeechConfig::FromEndpoint(a->endpoint, a->api_key) :
+          SpeechConfig::FromEndpoint(a->endpoint)) :
+        SpeechConfig::FromSubscription(a->api_key, a->region ? a->region : "");
+
+      speechConfig->SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat::Raw8Khz16BitMonoPcm);
+      speechConfig->SetSpeechSynthesisLanguage(a->language);
+      if (a->voice_name) {
+        speechConfig->SetSpeechSynthesisVoiceName(a->voice_name);
+      }
+      if (a->http_proxy_ip) {
+        /* std::stoul throws on a malformed port; handled by the catch below */
+        uint32_t port = a->http_proxy_port && a->http_proxy_port[0] != '\0' ? static_cast<uint32_t>(std::stoul(a->http_proxy_port)) : 80;
+        speechConfig->SetProxy(a->http_proxy_ip, port);
+      }
+
+      if (nullptr != a->endpointId) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "azure_speech_feed_tts setting endpoint id: %s\n", a->endpointId);
+        speechConfig->SetEndpointId(a->endpointId);
+      }
+
+      auto speechSynthesizer = SpeechSynthesizer::FromConfig(speechConfig);
+
+      speechSynthesizer->SynthesisStarted += [a](const SpeechSynthesisEventArgs& e) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "azure_speech_feed_tts SynthesisStarted\n");
+        a->response_code = 200;
+      };
+
+      speechSynthesizer->Synthesizing += [a, startTime](const SpeechSynthesisEventArgs& e) {
+        bool fireEvent = false;
+
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Synthesizing: received data\n");
+
+        /* assemble little-endian byte pairs into samples; a trailing odd byte is ignored */
+        auto audioData = e.Result->GetAudioData();
+        const size_t numSamples = audioData ? audioData->size() / sizeof(int16_t) : 0;
+        std::vector<uint16_t> pcm_data;
+        pcm_data.reserve(numSamples);
+        for (size_t i = 0; i < numSamples; i++) {
+          const size_t lo = i * sizeof(int16_t);
+          pcm_data.push_back(static_cast<uint16_t>((*audioData)[lo] | ((*audioData)[lo + 1] << 8)));
+        }
+
+        switch_mutex_lock(a->mutex);
+        /* read the flag and the buffer pointer under the lock: a flush may have
+           released the buffer in the meantime */
+        CircularBuffer_t *cBuffer = static_cast<CircularBuffer_t *>(a->circularBuffer);
+        if (a->flushed || !cBuffer) {
+          switch_mutex_unlock(a->mutex);
+          return;
+        }
+
+        /* and write to the file */
+        if (a->file) fwrite(pcm_data.data(), sizeof(uint16_t), pcm_data.size(), a->file);
+
+        // Resize the buffer if necessary
+        if (cBuffer->capacity() - cBuffer->size() < pcm_data.size()) {
+          //TODO: if buffer exceeds some max size, abort the synthesis
+          cBuffer->set_capacity(cBuffer->size() + std::max(pcm_data.size(), (size_t)BUFFER_SIZE));
+        }
+
+        /* Push the data into the buffer */
+        cBuffer->insert(cBuffer->end(), pcm_data.begin(), pcm_data.end());
+
+        /* only the first chunk of a request fires the PLAYBACK_START event */
+        fireEvent = (0 == a->reads++);
+        switch_mutex_unlock(a->mutex);
+
+        if (fireEvent && a->session_id) {
+          auto endTime = std::chrono::high_resolution_clock::now();
+          auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime);
+          auto time_to_first_byte_ms = std::to_string(duration.count());
+          switch_core_session_t* session = switch_core_session_locate(a->session_id);
+          if (session) {
+            switch_channel_t *channel = switch_core_session_get_channel(session);
+            if (channel) {
+              switch_event_t *event;
+              if (switch_event_create(&event, SWITCH_EVENT_PLAYBACK_START) == SWITCH_STATUS_SUCCESS) {
+                switch_channel_event_set_data(channel, event);
+                switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_time_to_first_byte_ms", time_to_first_byte_ms.c_str());
+                if (a->cache_filename) {
+                  switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_cache_filename", a->cache_filename);
+                }
+                switch_event_fire(&event);
+              } else {
+                switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "speechSynthesizer->Synthesizing: failed to create event\n");
+              }
+            }else {
+              switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "speechSynthesizer->Synthesizing: channel not found\n");
+            }
+            switch_core_session_rwunlock(session);
+          }
+        }
+      };
+
+      speechSynthesizer->SynthesisCompleted += [a](const SpeechSynthesisEventArgs& e) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "azure_speech_feed_tts SynthesisCompleted\n");
+        a->draining = 1;
+      };
+
+      speechSynthesizer->SynthesisCanceled += [a](const SpeechSynthesisEventArgs& e) {
+        if (e.Result->Reason == ResultReason::Canceled) {
+          auto cancellation = SpeechSynthesisCancellationDetails::FromResult(e.Result);
+          a->response_code = static_cast<long int>(cancellation->ErrorCode);
+          free(a->err_msg); /* do not leak an earlier message */
+          a->err_msg = strdup(cancellation->ErrorDetails.c_str());
+          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error synthsize tex %d with error string: %s.\n", static_cast<int>(cancellation->ErrorCode), cancellation->ErrorDetails.c_str());
+        }
+      };
+
+      /* NOTE(review): the returned std::future is discarded. If the SDK creates it
+         with std::async, its destructor blocks here until synthesis has finished,
+         which would also keep the synthesizer and the captured `a` alive for the
+         whole request -- confirm against the SDK headers. */
+      if (std::strncmp(text, "<speak", 6) == 0) {
+        speechSynthesizer->SpeakSsmlAsync(text);
+      } else {
+        speechSynthesizer->SpeakTextAsync(text);
+      }
+    }
+    catch (const std::exception& ex) {
+      /* never let a C++ exception unwind into the C frames of the caller */
+      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "azure_speech_feed_tts: exception: %s\n", ex.what());
+      /* any value other than 0 or 200 makes read_tts report failure and lets
+         flush_tts treat the request as incomplete (removing the cache file) */
+      a->response_code = 500;
+      free(a->err_msg);
+      a->err_msg = strdup(ex.what());
+      return SWITCH_STATUS_FALSE;
+    }
+
+    return SWITCH_STATUS_SUCCESS;
+  }
+
+  /**
+   * Hand the next chunk of synthesized audio to the caller.
+   *
+   * Samples are taken from a->circularBuffer (8 kHz, 16-bit) and, when a resampler
+   * is configured, converted to the call's sample rate before being copied out.
+   *
+   * @param a        per-handle state
+   * @param data     output buffer with room for *datalen bytes
+   * @param datalen  in: capacity of `data` in bytes; out: number of bytes written
+   *                 (left unchanged when filler bytes are returned)
+   * @param flags    unused
+   * @return SWITCH_STATUS_SUCCESS when audio (or filler) was written,
+   *         SWITCH_STATUS_BREAK when the request was flushed or fully drained,
+   *         SWITCH_STATUS_FALSE when synthesis reported an error
+   */
+  switch_status_t azure_speech_read_tts(azure_t* a, void *data, size_t *datalen, switch_speech_flag_t *flags) {
+    std::vector<uint16_t> pcm_data;
+
+    {
+      switch_mutex_lock(a->mutex);
+      CircularBuffer_t *cBuffer = static_cast<CircularBuffer_t *>(a->circularBuffer);
+
+      /* every exit from this scope must release the mutex, otherwise the SDK
+         callback thread blocks forever on its next chunk */
+      if (a->response_code > 0 && a->response_code != 200) {
+        switch_mutex_unlock(a->mutex);
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "azure_speech_read_tts, returning failure\n") ;
+        return SWITCH_STATUS_FALSE;
+      }
+      if (a->flushed || !cBuffer) {
+        switch_mutex_unlock(a->mutex);
+        return SWITCH_STATUS_BREAK;
+      }
+      if (cBuffer->empty()) {
+        if (a->draining) {
+          switch_mutex_unlock(a->mutex);
+          return SWITCH_STATUS_BREAK;
+        }
+        /* no audio available yet so send filler (every byte set to 0xFF) */
+        memset(data, 255, *datalen);
+        switch_mutex_unlock(a->mutex);
+        return SWITCH_STATUS_SUCCESS;
+      }
+
+      // azure returned 8000hz 16 bit data, we have to take enough data based on call sample rate.
+      // Take only as many 8 kHz samples as will fit in `data` once resampled. Scaling the
+      // output capacity by 8000/rate equals the old datalen/(2*rate/8000) for integer
+      // ratios, but cannot over-fill at e.g. 44.1 kHz or divide by zero at very low rates.
+      size_t wanted = *datalen / sizeof(int16_t);
+      if (a->samples_rate) {
+        wanted = static_cast<size_t>((static_cast<uint64_t>(wanted) * 8000) / a->samples_rate);
+      }
+      const size_t size = std::min(wanted, static_cast<size_t>(cBuffer->size()));
+      pcm_data.insert(pcm_data.end(), cBuffer->begin(), cBuffer->begin() + size);
+      cBuffer->erase(cBuffer->begin(), cBuffer->begin() + size);
+      switch_mutex_unlock(a->mutex);
+    }
+
+    const size_t data_size = pcm_data.size();
+
+    if (a->resampler) {
+      std::vector<int16_t> in(pcm_data.begin(), pcm_data.end());
+
+      /* `data` holds *datalen BYTES, so at most half that many samples may be produced */
+      std::vector<int16_t> out(*datalen / sizeof(int16_t));
+      spx_uint32_t in_len = static_cast<spx_uint32_t>(data_size);
+      spx_uint32_t out_len = static_cast<spx_uint32_t>(out.size());
+
+      speex_resampler_process_interleaved_int(a->resampler, in.data(), &in_len, out.data(), &out_len);
+
+      if (out_len > out.size()) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Resampler output exceeded maximum buffer size!\n");
+        return SWITCH_STATUS_FALSE;
+      }
+      if (in_len < data_size) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "azure_speech_read_tts: resampler left %u input samples unconsumed\n",
+          static_cast<unsigned int>(data_size - in_len));
+      }
+
+      memcpy(data, out.data(), out_len * sizeof(int16_t));
+      *datalen = out_len * sizeof(int16_t);
+    } else {
+      memcpy(data, pcm_data.data(), data_size * sizeof(int16_t));
+      *datalen = data_size * sizeof(int16_t);
+    }
+
+    return SWITCH_STATUS_SUCCESS;
+  }
+
+  /**
+   * End the current request: release the audio buffer and start-time stamp, close
+   * the cache file, remove it when synthesis did not succeed, and fire a
+   * PLAYBACK_STOP event on the session.
+   *
+   * A request counts as complete when response_code is 200.
+   *
+   * @param a per-handle state
+   * @return always SWITCH_STATUS_SUCCESS
+   */
+  switch_status_t azure_speech_flush_tts(azure_t* a) {
+    const bool download_complete = a->response_code == 200;
+    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "azure_speech_flush_tts, download complete? %s\n", download_complete ? "yes" : "no") ;
+
+    /* mark the request flushed and release shared state under the mutex, so code
+       that takes the same mutex never sees a half-torn-down request */
+    switch_mutex_lock(a->mutex);
+    a->flushed = 1;
+
+    delete static_cast<CircularBuffer_t *>(a->circularBuffer);
+    a->circularBuffer = nullptr ;
+    delete static_cast<std::chrono::time_point<std::chrono::high_resolution_clock>*>(a->startTime);
+    a->startTime = nullptr;
+
+    /* always close the cache file: leaving it open after a successful download
+       leaks the handle and may leave buffered audio unwritten */
+    if (a->file) {
+      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "closing audio cache file %s%s\n",
+        a->cache_filename ? a->cache_filename : "", download_complete ? "" : " because download was interrupted");
+      if (fclose(a->file) != 0) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "error closing audio cache file\n");
+      }
+      a->file = nullptr ;
+    }
+    switch_mutex_unlock(a->mutex);
+
+    if (!download_complete && a->cache_filename) {
+      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing audio cache file %s because download was interrupted\n", a->cache_filename);
+      if (unlink(a->cache_filename) != 0) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "cleanupConn: error removing audio cache file %s: %d:%s\n",
+          a->cache_filename, errno, strerror(errno));
+      }
+      free(a->cache_filename);
+      a->cache_filename = nullptr ;
+    }
+
+    if (a->session_id) {
+      switch_core_session_t* session = switch_core_session_locate(a->session_id);
+      if (session) {
+        switch_channel_t *channel = switch_core_session_get_channel(session);
+        if (channel) {
+          switch_event_t *event;
+          if (switch_event_create(&event, SWITCH_EVENT_PLAYBACK_STOP) == SWITCH_STATUS_SUCCESS) {
+            switch_channel_event_set_data(channel, event);
+            switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "Playback-File-Type", "tts_stream");
+            switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_azure_response_code", std::to_string(a->response_code).c_str());
+            if (a->cache_filename && download_complete) {
+              switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_cache_filename", a->cache_filename);
+            }
+            if (!download_complete && a->err_msg) {
+              switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_error", a->err_msg);
+            }
+            switch_event_fire(&event);
+          }
+          else {
+            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "write_cb: failed to create event\n");
+          }
+        }
+        else {
+          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "write_cb: channel not found\n");
+        }
+        switch_core_session_rwunlock(session);
+      }
+    }
+
+    return SWITCH_STATUS_SUCCESS;
+  }
+
+  /* Per-handle close hook: destroys the resampler, if one exists, and clears the pointer. */
+  switch_status_t azure_speech_close(azure_t* a) {
+    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "azure_speech_close\n") ;
+
+    SpeexResamplerState *stale = a->resampler;
+    a->resampler = NULL;
+    if (stale) {
+      speex_resampler_destroy(stale);
+    }
+    return SWITCH_STATUS_SUCCESS;
+  }
+
+  /* Module-unload hook; nothing is torn down here, so it always succeeds. */
+  switch_status_t azure_speech_unload() {
+    return SWITCH_STATUS_SUCCESS;
+  }
+
+} 
--- a/mod_azure_tts/azure_glue.h
+++ b/mod_azure_tts/azure_glue.h
@@ -0,0 +1,12 @@
+#ifndef __AZURE_TTS_GLUE_H__
+#define __AZURE_TTS_GLUE_H__
+
+/* Entry points implemented in azure_glue.cpp (defined there inside extern "C"). */
+
+/* Module load: prepares the audio cache directory. */
+switch_status_t azure_speech_load();
+/* Per-handle open; currently a no-op that returns success. */
+switch_status_t azure_speech_open(azure_t* azure);
+/* Starts synthesis of `text` (plain text, or SSML when it starts with "<speak"). */
+switch_status_t azure_speech_feed_tts(azure_t* azure, char* text, switch_speech_flag_t *flags);
+/* Copies the next chunk of audio into `data` and updates *datalen. */
+switch_status_t azure_speech_read_tts(azure_t* azure, void *data, size_t *datalen, switch_speech_flag_t *flags);
+/* Ends the current request and fires the PLAYBACK_STOP event. */
+switch_status_t azure_speech_flush_tts(azure_t* azure);
+/* Per-handle close: destroys the resampler. */
+switch_status_t azure_speech_close(azure_t* azure);
+/* Module unload; currently a no-op that returns success. */
+switch_status_t azure_speech_unload();
+
+#endif
--- a/mod_azure_tts/mod_azure_tts.c
+++ b/mod_azure_tts/mod_azure_tts.c
@@ -0,0 +1,178 @@
+#include "mod_azure_tts.h"
+#include "azure_glue.h"
+
+SWITCH_MODULE_LOAD_FUNCTION(mod_azure_tts_load);
+SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_azure_tts_shutdown);
+SWITCH_MODULE_DEFINITION(mod_azure_tts, mod_azure_tts_load, mod_azure_tts_shutdown, NULL);
+
+/* Frees the string at *field and resets the pointer to NULL. */
+static void releaseString(char **field) {
+  if (*field) free(*field);
+  *field = NULL;
+}
+
+/**
+ * Releases the heap-allocated strings held in the per-handle state.
+ *
+ * The per-request settings and results are always released; voice_name and
+ * session_id are released only when freeAll is non-zero.
+ */
+static void clearAzure(azure_t* a, int freeAll) {
+  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "clearAzure\n");
+
+  releaseString(&a->cache_filename);
+  releaseString(&a->api_key);
+  releaseString(&a->language);
+  releaseString(&a->region);
+  releaseString(&a->endpoint);
+  releaseString(&a->endpointId);
+  releaseString(&a->err_msg);
+  releaseString(&a->http_proxy_ip);
+  releaseString(&a->http_proxy_port);
+
+  if (!freeAll) {
+    return;
+  }
+
+  releaseString(&a->voice_name);
+  releaseString(&a->session_id);
+}
+
+/**
+ * Returns the azure_t attached to the speech handle. On first use it is allocated
+ * from the handle's memory pool, zeroed, given a nested mutex and stored in
+ * sh->private_info.
+ */
+static azure_t * createOrRetrievePrivateData(switch_speech_handle_t *sh) {
+  azure_t *state = (azure_t *) sh->private_info;
+
+  if (state) {
+    return state;
+  }
+
+  state = switch_core_alloc(sh->memory_pool, sizeof(*state));
+  sh->private_info = state;
+  memset(state, 0, sizeof(*state));
+  switch_mutex_init(&state->mutex, SWITCH_MUTEX_NESTED, sh->memory_pool);
+  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "allocated azure_t\n");
+  return state;
+}
+
+/**
+ * speech_open callback: records the requested voice and rate on the handle's
+ * private data and delegates to azure_speech_open().
+ *
+ * A NULL voice_name is tolerated (the voice is left unset), and a voice stored by
+ * an earlier open on the same handle is freed before being replaced.
+ */
+switch_status_t a_speech_open(switch_speech_handle_t *sh, const char *voice_name, int rate, int channels, switch_speech_flag_t *flags)
+{
+  azure_t *a = createOrRetrievePrivateData(sh);
+
+  if (a->voice_name) free(a->voice_name);
+  a->voice_name = voice_name ? strdup(voice_name) : NULL;
+  a->rate = rate;
+  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "a_speech_open voice: %s, rate %d, channels %d\n",
+    voice_name ? voice_name : "(null)", rate, channels);
+  return azure_speech_open(a);
+}
+
+/**
+ * speech_close callback: destroys the handle's mutex, closes the glue layer and
+ * releases every string held in the private data.
+ */
+static switch_status_t a_speech_close(switch_speech_handle_t *sh, switch_speech_flag_t *flags)
+{
+  azure_t *a = createOrRetrievePrivateData(sh);
+  switch_status_t status;
+
+  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "a_speech_close\n");
+  switch_mutex_destroy(a->mutex);
+
+  status = azure_speech_close(a);
+  clearAzure(a, 1);
+
+  return status;
+}
+
+/**
+ * speech_feed_tts callback: FreeSWITCH hands us the text to speak. The per-request
+ * counters and flags are reset before the text is passed to the glue layer.
+ */
+static switch_status_t a_speech_feed_tts(switch_speech_handle_t *sh, char *text, switch_speech_flag_t *flags)
+{
+  azure_t *a = createOrRetrievePrivateData(sh);
+
+  /* start every request from a clean slate */
+  a->samples_rate = 0;
+  a->flushed = 0;
+  a->reads = 0;
+  a->draining = 0;
+
+  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "a_speech_feed_tts\n");
+
+  return azure_speech_feed_tts(a, text, flags);
+}
+
+/**
+ * speech_read_tts callback: FreeSWITCH calls this periodically to collect rendered
+ * audio in L16 format; up to 8k of audio can be provided per call.
+ */
+static switch_status_t a_speech_read_tts(switch_speech_handle_t *sh, void *data, size_t *datalen, switch_speech_flag_t *flags)
+{
+  azure_t *state = createOrRetrievePrivateData(sh);
+
+  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "a_speech_read_tts\n");
+
+  return azure_speech_read_tts(state, data, datalen, flags);
+}
+
+/**
+ * speech_flush_tts callback, called at the end of a request: lets the glue layer
+ * finish the request, then releases the per-request strings (voice_name and
+ * session_id are kept for the next request).
+ */
+static void a_speech_flush_tts(switch_speech_handle_t *sh)
+{
+  azure_t *a = createOrRetrievePrivateData(sh);
+  /* the message previously named a different function ("w_speech_flush_tts") */
+  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "a_speech_flush_tts\n");
+  azure_speech_flush_tts(a);
+
+  clearAzure(a, 0);
+}
+
+/**
+ * speech_text_param_tts callback: stores a named string setting on the handle.
+ *
+ * Recognized names: api_key, region, voice, language, endpoint, endpointId,
+ * http_proxy_ip, http_proxy_port, session-uuid and write_cache_file (a true value
+ * enables cache-file writing). Unknown names are ignored, and a NULL value leaves
+ * the existing setting untouched.
+ */
+static void a_text_param_tts(switch_speech_handle_t *sh, char *param, const char *val)
+{
+  azure_t *a = createOrRetrievePrivateData(sh);
+  char **slot = NULL;
+
+  if (!param) {
+    return;
+  }
+
+  /* never write the API key to the log in clear text */
+  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "a_text_param_tts: %s=%s\n", param,
+    0 == strcmp(param, "api_key") ? "<redacted>" : (val ? val : "(null)"));
+
+  if (0 == strcmp(param, "api_key")) {
+    slot = &a->api_key;
+  } else if (0 == strcmp(param, "region")) {
+    slot = &a->region;
+  } else if (0 == strcmp(param, "voice")) {
+    slot = &a->voice_name;
+  } else if (0 == strcmp(param, "language")) {
+    slot = &a->language;
+  } else if (0 == strcmp(param, "endpoint")) {
+    slot = &a->endpoint;
+  } else if (0 == strcmp(param, "endpointId")) {
+    slot = &a->endpointId;
+  } else if (0 == strcmp(param, "http_proxy_ip")) {
+    slot = &a->http_proxy_ip;
+  } else if (0 == strcmp(param, "http_proxy_port")) {
+    slot = &a->http_proxy_port;
+  } else if (0 == strcmp(param, "session-uuid")) {
+    slot = &a->session_id;
+  } else if (0 == strcmp(param, "write_cache_file") && switch_true(val)) {
+    a->cache_audio = 1;
+  }
+
+  /* strdup(NULL) is undefined, so a missing value is ignored */
+  if (slot && val) {
+    if (*slot) free(*slot);
+    *slot = strdup(val);
+  }
+}
+
+/* speech_numeric_param_tts callback: numeric parameters are ignored. */
+static void a_numeric_param_tts(switch_speech_handle_t *sh, char *param, int val)
+{
+}
+/* speech_float_param_tts callback: floating-point parameters are ignored. */
+static void a_float_param_tts(switch_speech_handle_t *sh, char *param, double val)
+{
+}
+
+/**
+ * Module load: registers a speech interface named "microsoft", wires up its
+ * callbacks and then runs the glue layer's load step.
+ */
+SWITCH_MODULE_LOAD_FUNCTION(mod_azure_tts_load)
+{
+  switch_speech_interface_t *iface;
+
+  *module_interface = switch_loadable_module_create_module_interface(pool, modname);
+  iface = switch_loadable_module_create_interface(*module_interface, SWITCH_SPEECH_INTERFACE);
+
+  iface->interface_name = "microsoft";
+
+  /* lifecycle */
+  iface->speech_open = a_speech_open;
+  iface->speech_close = a_speech_close;
+
+  /* audio path */
+  iface->speech_feed_tts = a_speech_feed_tts;
+  iface->speech_read_tts = a_speech_read_tts;
+  iface->speech_flush_tts = a_speech_flush_tts;
+
+  /* parameters */
+  iface->speech_text_param_tts = a_text_param_tts;
+  iface->speech_numeric_param_tts = a_numeric_param_tts;
+  iface->speech_float_param_tts = a_float_param_tts;
+
+  return azure_speech_load();
+}
+
+/* Module shutdown: delegates to the glue layer's unload step. */
+SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_azure_tts_shutdown)
+{
+  return azure_speech_unload();
+}
--- a/mod_azure_tts/mod_azure_tts.h
+++ b/mod_azure_tts/mod_azure_tts.h
@@ -0,0 +1,38 @@
+#ifndef __MOD_AZURE_TTS_H__
+#define __MOD_AZURE_TTS_H__
+
+#include <switch.h>
+#include <speex/speex_resampler.h>
+
+/* Per-speech-handle state shared between mod_azure_tts.c and azure_glue.cpp. */
+typedef struct azure_data {
+  /* settings: heap-allocated strings (strdup), released by clearAzure() */
+  /* voice passed to the synthesizer; set on open or via the "voice" param */
+  char *voice_name;
+  /* subscription key; required by azure_speech_feed_tts() */
+  char *api_key;
+  /* service region; used when no endpoint is configured */
+  char *region;
+  /* synthesis language; required by azure_speech_feed_tts() */
+  char *language;
+  /* optional endpoint, used instead of key + region when set */
+  char *endpoint;
+  /* optional endpoint id applied to the speech config */
+  char *endpointId;
+  /* optional HTTP proxy host */
+  char *http_proxy_ip;
+  /* optional HTTP proxy port as text; 80 is used when unset or empty */
+  char *http_proxy_port;
+
+  /* result data */
+  /* 200 once synthesis has started; the cancellation error code on failure */
+  long response_code;
+  /* session uuid from the "session-uuid" param; used to look up the session */
+  char *session_id;
+  /* path of the audio cache file, when caching is active */
+  char *cache_filename;
+  /* error details reported when synthesis is canceled */
+  char *err_msg;
+
+  /* rate passed to speech_open; not read by the code in this module */
+  int rate;
+  /* set to 1 when synthesis has completed */
+  int draining;
+  /* number of audio chunks received for the current request */
+  int reads;
+  /* non-zero when "write_cache_file" was enabled */
+  int cache_audio;
+  /* set to 1 by azure_speech_flush_tts() */
+  int flushed;
+  /* sample rate of the session's read codec; 0 when unknown */
+  uint32_t samples_rate;
+
+  /* heap-allocated std::chrono time_point taken when the request started */
+  void *startTime;
+
+  /* open cache file, or NULL */
+  FILE *file;
+  /* 8 kHz -> call-rate resampler; NULL when the call runs at 8 kHz */
+  SpeexResamplerState *resampler;
+  /* heap-allocated boost::circular_buffer<uint16_t> of pending samples */
+  void *circularBuffer;
+  /* guards circularBuffer between the SDK callback and the read path */
+  switch_mutex_t *mutex;
+} azure_t;
+
+#endif