fix elevenlabs and whisper for multiple codecs (#14)

* fix elevenlabs and whisper for multiple codecs

Signed-off-by: Quan HL <quan.luuhoang8@gmail.com>

* fix review comments

Signed-off-by: Quan HL <quan.luuhoang8@gmail.com>

---------

Signed-off-by: Quan HL <quan.luuhoang8@gmail.com>
This commit is contained in:
Hoan Luu Huu
2024-03-12 20:52:35 +07:00
committed by GitHub
parent 06ab877f68
commit d6ae0a6a39
3 changed files with 70 additions and 9 deletions

View File

@@ -31,6 +31,7 @@
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
#include "mod_elevenlabs_tts.h" #include "mod_elevenlabs_tts.h"
#include <speex/speex_resampler.h>
#define TXNID_LEN (255) #define TXNID_LEN (255)
#define URL_LEN (1024) #define URL_LEN (1024)
@@ -820,6 +821,7 @@ extern "C" {
CURL* easy = createEasyHandle(); CURL* easy = createEasyHandle();
el->conn = (void *) conn ; el->conn = (void *) conn ;
el->sample_rate = 0;
conn->elevenlabs = el; conn->elevenlabs = el;
conn->easy = easy; conn->easy = easy;
conn->global = &global; conn->global = &global;
@@ -830,6 +832,23 @@ extern "C" {
el->circularBuffer = (void *) new CircularBuffer_t(8192); el->circularBuffer = (void *) new CircularBuffer_t(8192);
if (el->session_id) {
int err;
switch_codec_implementation_t read_impl;
switch_core_session_t *psession = switch_core_session_locate(el->session_id);
switch_core_session_get_read_impl(psession, &read_impl);
uint32_t samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;
el->sample_rate = samples_per_second;
// elevenlabs output is PCMU 8000
if (samples_per_second != 8000 /*Hz*/) {
el->resampler = speex_resampler_init(1, 8000, samples_per_second, SWITCH_RESAMPLE_QUALITY, &err);
if (0 != err) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing resampler: %s.\n", speex_resampler_strerror(err));
return SWITCH_STATUS_FALSE;
}
}
}
std::ostringstream api_key_stream; std::ostringstream api_key_stream;
api_key_stream << "xi-api-key: " << el->api_key; api_key_stream << "xi-api-key: " << el->api_key;
@@ -881,7 +900,6 @@ extern "C" {
{ {
switch_mutex_lock(el->mutex); switch_mutex_lock(el->mutex);
ConnInfo_t *conn = (ConnInfo_t *) el->conn; ConnInfo_t *conn = (ConnInfo_t *) el->conn;
if (el->response_code > 0 && el->response_code != 200) { if (el->response_code > 0 && el->response_code != 200) {
switch_mutex_unlock(el->mutex); switch_mutex_unlock(el->mutex);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "elevenlabs_speech_read_tts, returning failure\n") ; switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "elevenlabs_speech_read_tts, returning failure\n") ;
@@ -901,14 +919,35 @@ extern "C" {
switch_mutex_unlock(el->mutex); switch_mutex_unlock(el->mutex);
return SWITCH_STATUS_SUCCESS; return SWITCH_STATUS_SUCCESS;
} }
size_t size = std::min((*datalen/2), cBuffer->size()); size_t size = el->sample_rate ?
std::min((*datalen/(2 * el->sample_rate / 8000)), cBuffer->size()) :
std::min((*datalen/2), cBuffer->size());
pcm_data.insert(pcm_data.end(), cBuffer->begin(), cBuffer->begin() + size); pcm_data.insert(pcm_data.end(), cBuffer->begin(), cBuffer->begin() + size);
cBuffer->erase(cBuffer->begin(), cBuffer->begin() + size); cBuffer->erase(cBuffer->begin(), cBuffer->begin() + size);
switch_mutex_unlock(el->mutex); switch_mutex_unlock(el->mutex);
} }
memcpy(data, pcm_data.data(), pcm_data.size() * sizeof(uint16_t)); size_t data_size = pcm_data.size();
*datalen = pcm_data.size() * sizeof(uint16_t);
if (el->resampler) {
std::vector<int16_t> in(pcm_data.begin(), pcm_data.end());
std::vector<int16_t> out((*datalen));
spx_uint32_t in_len = data_size;
spx_uint32_t out_len = out.size();
speex_resampler_process_interleaved_int(el->resampler, in.data(), &in_len, out.data(), &out_len);
if (out_len > out.size()) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Resampler output exceeded maximum buffer size!\n");
return SWITCH_STATUS_FALSE;
}
memcpy(data, out.data(), out_len * sizeof(int16_t));
*datalen = out_len * sizeof(int16_t);
} else {
memcpy(data, pcm_data.data(), pcm_data.size() * sizeof(uint16_t));
*datalen = pcm_data.size() * sizeof(uint16_t);
}
return SWITCH_STATUS_SUCCESS; return SWITCH_STATUS_SUCCESS;
} }
@@ -922,8 +961,15 @@ extern "C" {
delete cBuffer; delete cBuffer;
el->circularBuffer = nullptr ; el->circularBuffer = nullptr ;
// destroy resampler
if (el->resampler) {
speex_resampler_destroy(el->resampler);
el->resampler = NULL;
}
if (conn) { if (conn) {
conn->flushed = true; conn->flushed = true;
if (!download_complete) { if (!download_complete) {
if (conn->file) { if (conn->file) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "closing audio cache file %s because download was interrupted\n", el->cache_filename); switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "closing audio cache file %s because download was interrupted\n", el->cache_filename);

View File

@@ -5,6 +5,7 @@
#include <fcntl.h> #include <fcntl.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <unistd.h> #include <unistd.h>
#include <speex/speex_resampler.h>
struct elevenlabs_data { struct elevenlabs_data {
char *session_id; char *session_id;
@@ -30,6 +31,7 @@ struct elevenlabs_data {
char *cache_filename; char *cache_filename;
int rate; int rate;
uint32_t sample_rate;
void *conn; void *conn;
FILE *file; FILE *file;
@@ -38,6 +40,7 @@ struct elevenlabs_data {
int draining; int draining;
int reads; int reads;
int cache_audio; int cache_audio;
SpeexResamplerState *resampler;
}; };
typedef struct elevenlabs_data elevenlabs_t; typedef struct elevenlabs_data elevenlabs_t;

View File

@@ -786,10 +786,10 @@ extern "C" {
return SWITCH_STATUS_FALSE; return SWITCH_STATUS_FALSE;
} }
if (mpg123_param(mh, MPG123_FORCE_RATE, 8000 /*Hz*/, 0) != MPG123_OK) { if (mpg123_param(mh, MPG123_FLAGS, MPG123_MONO_MIX, 0) != MPG123_OK) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error mpg123_param!\n"); switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error forcing single channel!\n");
return SWITCH_STATUS_FALSE; return SWITCH_STATUS_FALSE;
} }
CURL* easy = createEasyHandle(); CURL* easy = createEasyHandle();
w->conn = (void *) conn ; w->conn = (void *) conn ;
@@ -800,10 +800,23 @@ extern "C" {
conn->hdr_list = NULL ; conn->hdr_list = NULL ;
conn->file = w->file; conn->file = w->file;
conn->body = json; conn->body = json;
conn->flushed = false; conn->flushed = false;
w->circularBuffer = (void *) new CircularBuffer_t(8192); w->circularBuffer = (void *) new CircularBuffer_t(8192);
if (w->session_id) {
int err;
switch_codec_implementation_t read_impl;
switch_core_session_t *psession = switch_core_session_locate(w->session_id);
switch_core_session_get_read_impl(psession, &read_impl);
uint32_t samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;
if (mpg123_param(mh, MPG123_FORCE_RATE, samples_per_second /*Hz*/, 0) != MPG123_OK) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error mpg123_param!\n");
return SWITCH_STATUS_FALSE;
}
}
std::ostringstream api_key_stream; std::ostringstream api_key_stream;
api_key_stream << "Authorization: Bearer " << w->api_key; api_key_stream << "Authorization: Bearer " << w->api_key;
@@ -851,7 +864,6 @@ extern "C" {
{ {
switch_mutex_lock(w->mutex); switch_mutex_lock(w->mutex);
ConnInfo_t *conn = (ConnInfo_t *) w->conn; ConnInfo_t *conn = (ConnInfo_t *) w->conn;
if (w->response_code > 0 && w->response_code != 200) { if (w->response_code > 0 && w->response_code != 200) {
switch_mutex_unlock(w->mutex); switch_mutex_unlock(w->mutex);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "whisper_speech_read_tts, returning failure\n") ; switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "whisper_speech_read_tts, returning failure\n") ;