diff --git a/mod_elevenlabs_tts/elevenlabs_glue.cpp b/mod_elevenlabs_tts/elevenlabs_glue.cpp index 5f22561..61efe60 100644 --- a/mod_elevenlabs_tts/elevenlabs_glue.cpp +++ b/mod_elevenlabs_tts/elevenlabs_glue.cpp @@ -31,6 +31,7 @@ #include #include "mod_elevenlabs_tts.h" +#include #define TXNID_LEN (255) #define URL_LEN (1024) @@ -820,6 +821,7 @@ extern "C" { CURL* easy = createEasyHandle(); el->conn = (void *) conn ; + el->sample_rate = 0; conn->elevenlabs = el; conn->easy = easy; conn->global = &global; @@ -830,6 +832,23 @@ extern "C" { el->circularBuffer = (void *) new CircularBuffer_t(8192); + if (el->session_id) { + int err; + switch_codec_implementation_t read_impl; + switch_core_session_t *psession = switch_core_session_locate(el->session_id); + switch_core_session_get_read_impl(psession, &read_impl); + uint32_t samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second; + el->sample_rate = samples_per_second; + // elevenlabs output is PCMU 8000 + if (samples_per_second != 8000 /*Hz*/) { + el->resampler = speex_resampler_init(1, 8000, samples_per_second, SWITCH_RESAMPLE_QUALITY, &err); + if (0 != err) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing resampler: %s.\n", speex_resampler_strerror(err)); + return SWITCH_STATUS_FALSE; + } + } + } + std::ostringstream api_key_stream; api_key_stream << "xi-api-key: " << el->api_key; @@ -881,7 +900,6 @@ extern "C" { { switch_mutex_lock(el->mutex); ConnInfo_t *conn = (ConnInfo_t *) el->conn; - if (el->response_code > 0 && el->response_code != 200) { switch_mutex_unlock(el->mutex); switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "elevenlabs_speech_read_tts, returning failure\n") ; @@ -901,14 +919,35 @@ extern "C" { switch_mutex_unlock(el->mutex); return SWITCH_STATUS_SUCCESS; } - size_t size = std::min((*datalen/2), cBuffer->size()); + size_t size = el->sample_rate ? + std::min((*datalen/(2 * el->sample_rate / 8000)), cBuffer->size()) : + std::min((*datalen/2), cBuffer->size()); pcm_data.insert(pcm_data.end(), cBuffer->begin(), cBuffer->begin() + size); cBuffer->erase(cBuffer->begin(), cBuffer->begin() + size); switch_mutex_unlock(el->mutex); } - memcpy(data, pcm_data.data(), pcm_data.size() * sizeof(uint16_t)); - *datalen = pcm_data.size() * sizeof(uint16_t); + size_t data_size = pcm_data.size(); + + if (el->resampler) { + std::vector in(pcm_data.begin(), pcm_data.end()); + + std::vector out((*datalen)); + spx_uint32_t in_len = data_size; + spx_uint32_t out_len = out.size(); + speex_resampler_process_interleaved_int(el->resampler, in.data(), &in_len, out.data(), &out_len); + + if (out_len > out.size()) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Resampler output exceeded maximum buffer size!\n"); + return SWITCH_STATUS_FALSE; + } + + memcpy(data, out.data(), out_len * sizeof(int16_t)); + *datalen = out_len * sizeof(int16_t); + } else { + memcpy(data, pcm_data.data(), pcm_data.size() * sizeof(uint16_t)); + *datalen = pcm_data.size() * sizeof(uint16_t); + } return SWITCH_STATUS_SUCCESS; } @@ -922,8 +961,15 @@ extern "C" { delete cBuffer; el->circularBuffer = nullptr ; + // destroy resampler + if (el->resampler) { + speex_resampler_destroy(el->resampler); + el->resampler = NULL; + } + if (conn) { conn->flushed = true; + if (!download_complete) { if (conn->file) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "closing audio cache file %s because download was interrupted\n", el->cache_filename); diff --git a/mod_elevenlabs_tts/mod_elevenlabs_tts.h b/mod_elevenlabs_tts/mod_elevenlabs_tts.h index bcc40f6..c8a9ac0 100644 --- a/mod_elevenlabs_tts/mod_elevenlabs_tts.h +++ b/mod_elevenlabs_tts/mod_elevenlabs_tts.h @@ -5,6 +5,7 @@ #include #include #include +#include struct elevenlabs_data { char *session_id; @@ -30,6 +31,7 @@ struct elevenlabs_data { char *cache_filename; int rate; + uint32_t sample_rate; void *conn; FILE *file; @@ -38,6 +40,7 @@ struct elevenlabs_data { int draining; int reads; int cache_audio; + SpeexResamplerState *resampler; }; typedef struct elevenlabs_data elevenlabs_t; diff --git a/mod_whisper_tts/whisper_glue.cpp b/mod_whisper_tts/whisper_glue.cpp index 95bb35a..630ee07 100644 --- a/mod_whisper_tts/whisper_glue.cpp +++ b/mod_whisper_tts/whisper_glue.cpp @@ -786,10 +786,10 @@ extern "C" { return SWITCH_STATUS_FALSE; } - if (mpg123_param(mh, MPG123_FORCE_RATE, 8000 /*Hz*/, 0) != MPG123_OK) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error mpg123_param!\n"); + if (mpg123_param(mh, MPG123_FLAGS, MPG123_MONO_MIX, 0) != MPG123_OK) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error forcing single channel!\n"); return SWITCH_STATUS_FALSE; - } + } CURL* easy = createEasyHandle(); w->conn = (void *) conn ; @@ -800,10 +800,23 @@ extern "C" { conn->hdr_list = NULL ; conn->file = w->file; conn->body = json; - conn->flushed = false; + conn->flushed = false; + w->circularBuffer = (void *) new CircularBuffer_t(8192); + if (w->session_id) { + int err; + switch_codec_implementation_t read_impl; + switch_core_session_t *psession = switch_core_session_locate(w->session_id); + switch_core_session_get_read_impl(psession, &read_impl); + uint32_t samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second; + if (mpg123_param(mh, MPG123_FORCE_RATE, samples_per_second /*Hz*/, 0) != MPG123_OK) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error mpg123_param!\n"); + return SWITCH_STATUS_FALSE; + } + } + std::ostringstream api_key_stream; api_key_stream << "Authorization: Bearer " << w->api_key; @@ -851,7 +864,6 @@ extern "C" { { switch_mutex_lock(w->mutex); ConnInfo_t *conn = (ConnInfo_t *) w->conn; - if (w->response_code > 0 && w->response_code != 200) { switch_mutex_unlock(w->mutex); switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "whisper_speech_read_tts, returning failure\n") ;