fix elevenlabs and whisper for multiple codecs (#14)

* fix elevenlabs and whisper for multiple codecs

Signed-off-by: Quan HL <quan.luuhoang8@gmail.com>

* fix review comments

Signed-off-by: Quan HL <quan.luuhoang8@gmail.com>

---------

Signed-off-by: Quan HL <quan.luuhoang8@gmail.com>
This commit is contained in:
Hoan Luu Huu
2024-03-12 20:52:35 +07:00
committed by GitHub
parent 06ab877f68
commit d6ae0a6a39
3 changed files with 70 additions and 9 deletions

View File

@@ -31,6 +31,7 @@
#include <boost/algorithm/string.hpp>
#include "mod_elevenlabs_tts.h"
#include <speex/speex_resampler.h>
#define TXNID_LEN (255)
#define URL_LEN (1024)
@@ -820,6 +821,7 @@ extern "C" {
CURL* easy = createEasyHandle();
el->conn = (void *) conn ;
el->sample_rate = 0;
conn->elevenlabs = el;
conn->easy = easy;
conn->global = &global;
@@ -830,6 +832,23 @@ extern "C" {
el->circularBuffer = (void *) new CircularBuffer_t(8192);
if (el->session_id) {
int err;
switch_codec_implementation_t read_impl;
switch_core_session_t *psession = switch_core_session_locate(el->session_id);
switch_core_session_get_read_impl(psession, &read_impl);
uint32_t samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;
el->sample_rate = samples_per_second;
// elevenlabs output is PCMU 8000
if (samples_per_second != 8000 /*Hz*/) {
el->resampler = speex_resampler_init(1, 8000, samples_per_second, SWITCH_RESAMPLE_QUALITY, &err);
if (0 != err) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing resampler: %s.\n", speex_resampler_strerror(err));
return SWITCH_STATUS_FALSE;
}
}
}
std::ostringstream api_key_stream;
api_key_stream << "xi-api-key: " << el->api_key;
@@ -881,7 +900,6 @@ extern "C" {
{
switch_mutex_lock(el->mutex);
ConnInfo_t *conn = (ConnInfo_t *) el->conn;
if (el->response_code > 0 && el->response_code != 200) {
switch_mutex_unlock(el->mutex);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "elevenlabs_speech_read_tts, returning failure\n") ;
@@ -901,14 +919,35 @@ extern "C" {
switch_mutex_unlock(el->mutex);
return SWITCH_STATUS_SUCCESS;
}
size_t size = std::min((*datalen/2), cBuffer->size());
size_t size = el->sample_rate ?
std::min((*datalen/(2 * el->sample_rate / 8000)), cBuffer->size()) :
std::min((*datalen/2), cBuffer->size());
pcm_data.insert(pcm_data.end(), cBuffer->begin(), cBuffer->begin() + size);
cBuffer->erase(cBuffer->begin(), cBuffer->begin() + size);
switch_mutex_unlock(el->mutex);
}
memcpy(data, pcm_data.data(), pcm_data.size() * sizeof(uint16_t));
*datalen = pcm_data.size() * sizeof(uint16_t);
size_t data_size = pcm_data.size();
if (el->resampler) {
std::vector<int16_t> in(pcm_data.begin(), pcm_data.end());
std::vector<int16_t> out((*datalen));
spx_uint32_t in_len = data_size;
spx_uint32_t out_len = out.size();
speex_resampler_process_interleaved_int(el->resampler, in.data(), &in_len, out.data(), &out_len);
if (out_len > out.size()) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Resampler output exceeded maximum buffer size!\n");
return SWITCH_STATUS_FALSE;
}
memcpy(data, out.data(), out_len * sizeof(int16_t));
*datalen = out_len * sizeof(int16_t);
} else {
memcpy(data, pcm_data.data(), pcm_data.size() * sizeof(uint16_t));
*datalen = pcm_data.size() * sizeof(uint16_t);
}
return SWITCH_STATUS_SUCCESS;
}
@@ -922,8 +961,15 @@ extern "C" {
delete cBuffer;
el->circularBuffer = nullptr ;
// destroy resampler
if (el->resampler) {
speex_resampler_destroy(el->resampler);
el->resampler = NULL;
}
if (conn) {
conn->flushed = true;
if (!download_complete) {
if (conn->file) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "closing audio cache file %s because download was interrupted\n", el->cache_filename);

View File

@@ -5,6 +5,7 @@
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <speex/speex_resampler.h>
struct elevenlabs_data {
char *session_id;
@@ -30,6 +31,7 @@ struct elevenlabs_data {
char *cache_filename;
int rate;
uint32_t sample_rate;
void *conn;
FILE *file;
@@ -38,6 +40,7 @@ struct elevenlabs_data {
int draining;
int reads;
int cache_audio;
SpeexResamplerState *resampler;
};
typedef struct elevenlabs_data elevenlabs_t;

View File

@@ -786,10 +786,10 @@ extern "C" {
return SWITCH_STATUS_FALSE;
}
if (mpg123_param(mh, MPG123_FORCE_RATE, 8000 /*Hz*/, 0) != MPG123_OK) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error mpg123_param!\n");
if (mpg123_param(mh, MPG123_FLAGS, MPG123_MONO_MIX, 0) != MPG123_OK) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error forcing single channel!\n");
return SWITCH_STATUS_FALSE;
}
}
CURL* easy = createEasyHandle();
w->conn = (void *) conn ;
@@ -800,10 +800,23 @@ extern "C" {
conn->hdr_list = NULL ;
conn->file = w->file;
conn->body = json;
conn->flushed = false;
conn->flushed = false;
w->circularBuffer = (void *) new CircularBuffer_t(8192);
if (w->session_id) {
int err;
switch_codec_implementation_t read_impl;
switch_core_session_t *psession = switch_core_session_locate(w->session_id);
switch_core_session_get_read_impl(psession, &read_impl);
uint32_t samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;
if (mpg123_param(mh, MPG123_FORCE_RATE, samples_per_second /*Hz*/, 0) != MPG123_OK) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error mpg123_param!\n");
return SWITCH_STATUS_FALSE;
}
}
std::ostringstream api_key_stream;
api_key_stream << "Authorization: Bearer " << w->api_key;
@@ -851,7 +864,6 @@ extern "C" {
{
switch_mutex_lock(w->mutex);
ConnInfo_t *conn = (ConnInfo_t *) w->conn;
if (w->response_code > 0 && w->response_code != 200) {
switch_mutex_unlock(w->mutex);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "whisper_speech_read_tts, returning failure\n") ;