verbio stt/tts mods (#65)

* verbio tts mod

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

* mod_verbio_transcribe

* wip

* wip

* wip

* wip

* wip

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

* wip

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

* wip

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

* verbio stt

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

* wip

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

* wip

* wip

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

---------

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>
This commit is contained in:
Hoan Luu Huu
2024-05-29 19:05:54 +07:00
committed by GitHub
parent 466dec7a6f
commit c945a2de5a
15 changed files with 1943 additions and 0 deletions

View File

@@ -0,0 +1,8 @@
Copyright 2023, Drachtio Communications Services, LLC
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@@ -0,0 +1,10 @@
include $(top_srcdir)/build/modmake.rulesam
MODNAME=mod_verbio_transcribe
mod_LTLIBRARIES = mod_verbio_transcribe.la
mod_verbio_transcribe_la_SOURCES = mod_verbio_transcribe.c verbio_glue.cpp
mod_verbio_transcribe_la_CFLAGS = $(AM_CFLAGS)
mod_verbio_transcribe_la_CXXFLAGS = -I $(top_srcdir)/libs/verbio-asr-grpc-api/stubs $(AM_CXXFLAGS) -std=c++17
mod_verbio_transcribe_la_LIBADD = $(switch_builddir)/libfreeswitch.la
mod_verbio_transcribe_la_LDFLAGS = -avoid-version -module -no-undefined -shared -lstdc++ -lboost_system -lboost_thread

View File

@@ -0,0 +1,3 @@
# mod_verbio_transcribe
A Freeswitch module that generates real-time transcriptions on a Freeswitch channel by using the Verbio streaming transcription API

View File

@@ -0,0 +1,201 @@
/*
*
* mod_verbio_transcribe.c -- Freeswitch module for using verbio streaming transcribe api
*
*/
#include "mod_verbio_transcribe.h"
#include "verbio_glue.h"
/* Prototypes */
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_verbio_transcribe_shutdown);
SWITCH_MODULE_LOAD_FUNCTION(mod_verbio_transcribe_load);
SWITCH_MODULE_DEFINITION(mod_verbio_transcribe, mod_verbio_transcribe_load, mod_verbio_transcribe_shutdown, NULL);
static switch_status_t do_stop(switch_core_session_t *session, char* bugname);
/**
 * Fire a CUSTOM event carrying a transcription result (or error) for a channel.
 *
 * @param session   session whose channel data is attached to the event
 * @param eventName event subclass name to fire
 * @param json      event body; no body is added when NULL
 * @param bugname   media bug name, added as the "media-bugname" header when not NULL
 * @param finished  non-zero when the recognition session has already been finished
 */
static void responseHandler(switch_core_session_t* session, const char* eventName, const char * json, const char* bugname, int finished) {
	switch_event_t *event = NULL;
	switch_channel_t *channel = switch_core_session_get_channel(session);

	/* never hand a NULL pointer to a %s conversion */
	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "responseHandler event %s, body %s.\n", eventName, json ? json : "(null)");

	/* without an event there is nothing to populate or fire */
	if (switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, eventName) != SWITCH_STATUS_SUCCESS || !event) {
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "responseHandler failed creating event %s\n", eventName);
		return;
	}

	switch_channel_event_set_data(channel, event);
	switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "verbio");
	switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-session-finished", finished ? "true" : "false");
	if (finished) {
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "responseHandler returning event %s, from finished recognition session\n", eventName);
	}
	if (json) {
		switch_event_add_body(event, "%s", json);
	}
	if (bugname) {
		switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "media-bugname", bugname);
	}
	switch_event_fire(&event);
}
/**
 * Media bug callback: forwards READ notifications to the verbio glue layer and
 * tears the speech session down when the bug is closed.
 *
 * Returns the result of verbio_speech_frame() for READ, SWITCH_TRUE otherwise.
 */
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
{
	switch_core_session_t *session = switch_core_media_bug_get_session(bug);

	/* audio is available: hand it to the streamer */
	if (type == SWITCH_ABC_TYPE_READ) {
		return verbio_speech_frame(bug, user_data);
	}

	if (type == SWITCH_ABC_TYPE_INIT) {
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_INIT.\n");
	} else if (type == SWITCH_ABC_TYPE_CLOSE) {
		struct cap_cb* cb = (struct cap_cb*) switch_core_media_bug_get_user_data(bug);

		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_CLOSE.\n");
		/* 1 == the bug is already closing, so cleanup must not remove it again */
		verbio_speech_session_cleanup(session, 1, cb->bugname);
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Finished SWITCH_ABC_TYPE_CLOSE.\n");
	}

	/* SWITCH_ABC_TYPE_WRITE and every other notification are ignored */
	return SWITCH_TRUE;
}
/**
 * Start transcribing a session: stop any previous transcription registered
 * under the same bug name, initialize the verbio speech session and attach a
 * media bug that feeds it audio.
 *
 * Returns SWITCH_STATUS_SUCCESS once the bug is attached and stored as a channel
 * private under bugname; SWITCH_STATUS_FALSE (or the bug-add status) otherwise.
 */
static switch_status_t start_capture(switch_core_session_t *session, switch_media_bug_flag_t flags, char* lang, int interim, char* bugname)
{
	switch_channel_t *channel = switch_core_session_get_channel(session);
	switch_media_bug_t *bug;
	void *pUserData;
	switch_status_t status;
	int channels = (flags & SMBF_STEREO) ? 2 : 1;

	/* a bug stored under this name means a transcription is already running */
	if (switch_channel_get_private(channel, bugname)) {
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Verbio removing bug from previous transcribe\n");
		do_stop(session, bugname);
	}

	if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
		return SWITCH_STATUS_FALSE;
	}

	status = verbio_speech_session_init(session, responseHandler, channels, lang, interim, bugname, &pUserData);
	if (status == SWITCH_STATUS_FALSE) {
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing verbio speech session.\n");
		return SWITCH_STATUS_FALSE;
	}

	/* NOTE(review): if adding the bug fails, whatever verbio_speech_session_init()
	 * set up is not torn down here -- confirm whether that needs a cleanup path */
	status = switch_core_media_bug_add(session, bugname, NULL, capture_callback, pUserData, 0, flags, &bug);
	if (status != SWITCH_STATUS_SUCCESS) {
		return status;
	}

	switch_channel_set_private(channel, bugname, bug);
	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "added media bug for verbio transcribe\n");
	return SWITCH_STATUS_SUCCESS;
}
/**
 * Stop the transcription registered on the session under bugname.
 *
 * Returns SWITCH_STATUS_SUCCESS when no such bug is stored on the channel,
 * otherwise the status of verbio_speech_session_cleanup().
 */
static switch_status_t do_stop(switch_core_session_t *session, char* bugname)
{
	switch_channel_t *channel = switch_core_session_get_channel(session);
	switch_status_t result;

	/* nothing registered under this name: nothing to stop */
	if (!switch_channel_get_private(channel, bugname)) {
		return SWITCH_STATUS_SUCCESS;
	}

	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "do_stop: Received user command command to stop transcribe.\n");
	result = verbio_speech_session_cleanup(session, 0, bugname);
	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "do_stop: stopped transcribe.\n");
	return result;
}
#define TRANSCRIBE_API_SYNTAX "<uuid> [start|stop] lang-code [interim] [stereo|mono] [bugname]"
SWITCH_STANDARD_API(verbio_transcribe_function)
{
char *mycmd = NULL, *argv[6] = { 0 };
int argc = 0;
switch_status_t status = SWITCH_STATUS_FALSE;
switch_media_bug_flag_t flags = SMBF_READ_STREAM /* | SMBF_WRITE_STREAM | SMBF_READ_PING */;
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
}
if (zstr(cmd) || zstr(argv[0])) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_SYNTAX);
goto done;
} else {
switch_core_session_t *lsession = NULL;
if ((lsession = switch_core_session_locate(argv[0]))) {
if (!strcasecmp(argv[1], "stop")) {
char *bugname = argc > 2 ? argv[2] : MY_BUG_NAME;
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "Verbio stop transcribing %s\n", bugname);
status = do_stop(lsession, bugname);
} else if (!strcasecmp(argv[1], "start")) {
char* lang = argv[2];
int interim = argc > 3 && !strcmp(argv[3], "interim");
char *bugname = argc > 5 ? argv[5] : MY_BUG_NAME;
if (argc > 4 && !strcmp(argv[4], "stereo")) {
flags |= SMBF_WRITE_STREAM ;
flags |= SMBF_STEREO;
}
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "start transcribing %s %s %s\n",
lang, interim ? "interim": "complete", bugname);
status = start_capture(lsession, flags, lang, interim, bugname);
}
switch_core_session_rwunlock(lsession);
}
}
if (status == SWITCH_STATUS_SUCCESS) {
stream->write_function(stream, "+OK Success\n");
} else {
stream->write_function(stream, "-ERR Operation Failed\n");
}
done:
switch_safe_free(mycmd);
return SWITCH_STATUS_SUCCESS;
}
/**
 * Module load entry point: reserves the transcription event subclass,
 * initializes the verbio glue layer and registers the
 * "uuid_verbio_transcribe" API with its console completions.
 *
 * Returns SWITCH_STATUS_TERM when the event subclass cannot be reserved or the
 * glue layer fails to initialize, SWITCH_STATUS_SUCCESS otherwise.
 */
SWITCH_MODULE_LOAD_FUNCTION(mod_verbio_transcribe_load)
{
	switch_api_interface_t *api_interface;

	/* create/register custom event message type */
	if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_RESULTS) != SWITCH_STATUS_SUCCESS) {
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_RESULTS);
		return SWITCH_STATUS_TERM;
	}

	/* connect my internal structure to the blank pointer passed to me */
	*module_interface = switch_loadable_module_create_module_interface(pool, modname);

	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "verbio Speech Transcription API loading..\n");

	if (SWITCH_STATUS_FALSE == verbio_speech_init()) {
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Failed initializing verbio speech interface\n");
		/* do not claim a successful load or expose the API on a failed init;
		 * give the reserved subclass back before bailing out */
		switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS);
		return SWITCH_STATUS_TERM;
	}

	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "verbio Speech Transcription API successfully loaded\n");

	SWITCH_ADD_API(api_interface, "uuid_verbio_transcribe", "verbio Speech Transcription API", verbio_transcribe_function, TRANSCRIBE_API_SYNTAX);
	switch_console_set_complete("add uuid_verbio_transcribe start lang-code [interim|final] [stereo|mono] [bugname]");
	switch_console_set_complete("add uuid_verbio_transcribe stop ");

	/* indicate that the module should continue to be loaded */
	return SWITCH_STATUS_SUCCESS;
}
/*
 * Module shutdown entry point, invoked when the module is being shut down.
 * Releases the glue layer and gives back the event subclass reserved at load.
 * Always reports SWITCH_STATUS_SUCCESS.
 */
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_verbio_transcribe_shutdown)
{
	/* glue layer first, then the event subclass it publishes on */
	verbio_speech_cleanup();
	switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS);

	return SWITCH_STATUS_SUCCESS;
}

View File

@@ -0,0 +1,48 @@
#ifndef __MOD_VERBIO_TRANSCRIBE_H__
#define __MOD_VERBIO_TRANSCRIBE_H__
#include <switch.h>
#include <speex/speex_resampler.h>
#include <unistd.h>
#define MY_BUG_NAME "verbio_transcribe"
#define MAX_ENGINE_VERSION_LEN (2)
#define MAX_BUG_LEN (64)
#define MAX_SESSION_ID (256)
#define LONG_TEXT_LEN (1024)
#define MAX_LANGUAGE_LEN (6)
#define TRANSCRIBE_EVENT_RESULTS "verbio_transcribe::transcription"
#define TRANSCRIBE_EVENT_ERROR "jambonz_transcribe::error"
/* per-channel data */
/* Callback used to publish a transcription result or error for a session:
 * event is the event subclass name, json the body, bugname the media bug the
 * result belongs to and finished a flag telling whether the recognition
 * session has already been finished. */
typedef void (*responseHandler_t)(switch_core_session_t* session, const char* event, const char * json, const char* bugname, int finished);
/* State of one transcription, handed to the media bug as its user data. */
struct cap_cb {
/* created at session init; held during cleanup and try-locked while forwarding audio */
switch_mutex_t *mutex;
/* uuid of the owning session; the read thread uses it to locate the session */
char sessionId[MAX_SESSION_ID+1];
/* name of the media bug / channel private this transcription is stored under */
char bugname[MAX_BUG_LEN+1];
/* taken from VERBIO_ACCESS_TOKEN (channel variable first, then environment) */
char access_token[LONG_TEXT_LEN + 1];
/* language code supplied when the transcription is started */
char language[MAX_LANGUAGE_LEN + 1];
/* channel variable VERBIO_INLINE_GRAMMAR */
char inline_grammar[LONG_TEXT_LEN + 1];
/* channel variable VERBIO_GRAMMAR_URI */
char grammar_uri[LONG_TEXT_LEN + 1];
/* channel variable VERBIO_LABEL */
char label[MAX_SESSION_ID+1];
/* channel variable VERBIO_ENGINE_VERSION, parsed as an integer */
uint32_t engine_version;
/* channel variable VERBIO_TOPIC, parsed as an integer; 0 when not set */
uint32_t topic;
/* channel variable VERBIO_ENABLE_FORMATTING as a boolean */
uint32_t enable_formatting;
/* channel variable VERBIO_ENABLE_DIARIZATION as a boolean */
uint32_t enable_diarization;
/* number of audio channels requested by the caller (1 or 2) */
uint32_t channels;
/* non-zero when interim results were requested */
uint32_t interim;
/* channel variable VERBIO_RECOGNITION_TIMEOUT, parsed as an integer */
uint32_t recognition_timeout;
/* channel variable VERBIO_SPEECH_COMPLETE_TIMEOUT, parsed as an integer */
uint32_t speech_complete_timeout;
/* channel variable VERBIO_SPEECH_INCOMPLETE_TIMEOUT, parsed as an integer */
uint32_t speech_incomplete_timeout;
/* 0 at init, set to 1 by cleanup; forwarded to responseHandler */
uint32_t finished;
/* only created when the session's codec rate is not 8000 Hz */
SpeexResamplerState *resampler;
/* opaque pointer to the C++ streamer object owned by the glue layer */
void* streamer;
/* callback used to publish results and errors */
responseHandler_t responseHandler;
/* thread reading responses from the streamer; joined during cleanup */
switch_thread_t* thread;
};
#endif

View File

@@ -0,0 +1,51 @@
/**
 * (Very) simple and limited circular buffer of fixed-size chunks.
 *
 * Chunks are appended with add(); once the buffer is full the oldest chunk is
 * overwritten. getNextChunk() hands the stored chunks back oldest-first.
 * The class is not synchronized.
 */
class SimpleBuffer {
  public:
    /**
     * @param chunkSize size of one chunk in bytes
     * @param numChunks maximum number of chunks retained
     */
    SimpleBuffer(uint32_t chunkSize, uint32_t numChunks) :
      m_pData(new char[chunkSize * numChunks]),
      numItems(0),
      m_chunkSize(chunkSize),
      m_numChunks(numChunks),
      m_pNextWrite(m_pData) {
    }

    ~SimpleBuffer() {
      delete [] m_pData;
    }

    // the buffer owns a raw allocation: copying it would free the storage twice
    SimpleBuffer(const SimpleBuffer&) = delete;
    SimpleBuffer& operator=(const SimpleBuffer&) = delete;

    /**
     * Append datalen bytes as whole chunks. Input that is not a multiple of
     * the chunk size is ignored entirely.
     */
    void add(void *data, uint32_t datalen) {
      // a zero-sized geometry can hold nothing (and would divide by zero below)
      if (m_chunkSize == 0 || m_numChunks == 0) return;
      if (datalen % m_chunkSize != 0) return;

      uint32_t count = datalen / m_chunkSize;
      const char* src = static_cast<const char*>(data);
      for (uint32_t i = 0; i < count; i++) {
        memcpy(m_pNextWrite, src, m_chunkSize);
        src += m_chunkSize;
        if (numItems < m_numChunks) numItems++;

        // advance the write cursor, wrapping after the last slot
        uint32_t offset = (m_pNextWrite - m_pData) / m_chunkSize;
        if (offset >= m_numChunks - 1) m_pNextWrite = m_pData;
        else m_pNextWrite += m_chunkSize;
      }
    }

    /**
     * @return the oldest chunk that has not been retrieved yet, or nullptr
     *         when the buffer is empty
     */
    char* getNextChunk() {
      if (numItems == 0) return nullptr;

      // the oldest stored chunk sits numItems slots behind the write cursor;
      // reading at the write cursor itself is only right when the buffer is full
      uint32_t writeIdx = (m_pNextWrite - m_pData) / m_chunkSize;
      uint32_t readIdx = (writeIdx + m_numChunks - numItems) % m_numChunks;
      numItems--;
      return m_pData + readIdx * m_chunkSize;
    }

    /** @return number of chunks currently stored and not yet retrieved */
    uint32_t getNumItems() { return numItems;}

  private:
    char *m_pData;
    uint32_t numItems;
    uint32_t m_chunkSize;
    uint32_t m_numChunks;
    char* m_pNextWrite;
};

View File

@@ -0,0 +1,455 @@
#include <cstdlib>
#include <algorithm>
#include <future>
#include <switch.h>
#include <switch_json.h>
#include <grpc++/grpc++.h>
#include <google/protobuf/util/json_util.h>
#include "speechcenter/recognizer/v1/recognition.grpc.pb.h"
namespace verbio_asr = speechcenter::recognizer::v1;
#include "mod_verbio_transcribe.h"
#include "simple_buffer.h"
#define CHUNKSIZE (320)
namespace {
  /**
   * Compare two strings ignoring case.
   *
   * @return 1 when the strings are equal after lower-casing, 0 otherwise
   */
  int case_insensitive_match(std::string s1, std::string s2) {
    // tolower() is only defined for values representable as unsigned char (or
    // EOF), so go through unsigned char instead of a possibly negative char
    auto lower = [](unsigned char c) { return static_cast<char>(::tolower(c)); };
    std::transform(s1.begin(), s1.end(), s1.begin(), lower);
    std::transform(s2.begin(), s2.end(), s2.begin(), lower);
    return s1 == s2 ? 1 : 0;
  }
}
class GStreamer {
public:
GStreamer(cap_cb *cb) :
m_writesDone(false),
m_connected(false),
m_interim(cb->interim),
m_audioBuffer(CHUNKSIZE, 15) {
strncpy(m_sessionId, cb->sessionId, 256);
auto channelCreds = grpc::SslCredentials(grpc::SslCredentialsOptions());
m_channel = grpc::CreateChannel(
"us.speechcenter.verbio.com",
grpc::CompositeChannelCredentials(
grpc::SslCredentials(grpc::SslCredentialsOptions()),
grpc::AccessTokenCredentials(cb->access_token)));
if (!m_channel) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "GStreamer %p failed creating grpc channel\n", this);
throw std::runtime_error(std::string("Error creating grpc channel"));
}
m_stub = std::move(verbio_asr::Recognizer::NewStub(m_channel));
auto* config = m_request.mutable_config();
// RecognitionParameters
auto* params = config->mutable_parameters();
params->set_language(cb->language);
auto* pcm = params->mutable_pcm();
pcm->set_sample_rate_hz(8000);
params->set_audio_channels_number(cb->channels);
params->set_enable_formatting(cb->enable_formatting);
auto* resource = config->mutable_resource();
resource->set_topic(static_cast<verbio_asr::RecognitionResource_Topic>(cb->topic));
if (!zstr(cb->inline_grammar) || !zstr(cb->grammar_uri)) {
auto* grammar = resource->mutable_grammar();
if (cb->inline_grammar) {
grammar->set_inline_grammar(cb->inline_grammar);
} else if (cb->grammar_uri) {
grammar->set_grammar_uri(cb->grammar_uri);
}
}
config->set_version(static_cast<verbio_asr::RecognitionConfig_AsrVersion>(cb->engine_version));
if (cb->label) {
config->add_label(cb->label);
}
if (cb->recognition_timeout || cb->speech_complete_timeout || cb->speech_incomplete_timeout) {
auto* timer = config->mutable_configuration();
timer->set_start_input_timers(true);
if (cb->recognition_timeout) {
timer->set_recognition_timeout(cb->recognition_timeout);
}
if (cb->speech_complete_timeout) {
timer->set_speech_complete_timeout(cb->speech_complete_timeout);
}
if (cb->speech_incomplete_timeout) {
timer->set_speech_incomplete_timeout(cb->speech_incomplete_timeout);
}
}
}
~GStreamer() {
}
void connect() {
assert(!m_connected);
// Begin a stream.
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p creating streamer\n", this);
m_streamer = m_stub->StreamingRecognize(&m_context);
m_connected = true;
// read thread is waiting on this
m_promise.set_value();
// Write the first request, containing the config only.
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p sending initial message\n", this);
bool ok = m_streamer->Write(m_request);
m_request.clear_config();
// send any buffered audio
int nFrames = m_audioBuffer.getNumItems();
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p got stream ready, %d buffered frames\n", this, nFrames);
if (nFrames) {
char *p;
do {
p = m_audioBuffer.getNextChunk();
if (p) {
write(p, CHUNKSIZE);
}
} while (p);
}
}
bool write(void* data, uint32_t datalen) {
if (!m_connected) {
if (datalen % CHUNKSIZE == 0) {
m_audioBuffer.add(data, datalen);
}
return true;
}
m_request.clear_audio();
m_request.set_audio(data, datalen);
bool ok = m_streamer->Write(m_request);
return ok;
}
uint32_t nextMessageSize(void) {
uint32_t size = 0;
m_streamer->NextMessageSize(&size);
return size;
}
bool read(verbio_asr::RecognitionStreamingResponse* response) {
return m_streamer->Read(response);
}
grpc::Status finish() {
return m_streamer->Finish();
}
void writesDone() {
// grpc crashes if we call this twice on a stream
if (!m_connected) {
cancelConnect();
}
else if (!m_writesDone) {
m_streamer->WritesDone();
m_writesDone = true;
}
}
bool waitForConnect() {
std::shared_future<void> sf(m_promise.get_future());
sf.wait();
return m_connected;
}
void cancelConnect() {
assert(!m_connected);
m_promise.set_value();
}
bool isConnected() {
return m_connected;
}
private:
grpc::ClientContext m_context;
std::shared_ptr<grpc::Channel> m_channel;
std::unique_ptr<verbio_asr::Recognizer::Stub> m_stub;
verbio_asr::RecognitionStreamingRequest m_request;
std::unique_ptr< grpc::ClientReaderWriterInterface<verbio_asr::RecognitionStreamingRequest, verbio_asr::RecognitionStreamingResponse> > m_streamer;
bool m_writesDone;
bool m_connected;
bool m_interim;
std::string m_language;
std::promise<void> m_promise;
SimpleBuffer m_audioBuffer;
char m_sessionId[256];
};
/**
 * Thread body reading recognition responses from the streamer and publishing
 * them through cb->responseHandler.
 *
 * Waits until the streamer is connected (or the connection is cancelled),
 * then loops until the stream has nothing more to read, an error response is
 * received, or the owning session can no longer be located.
 *
 * @param obj the struct cap_cb of the transcription
 * @return always nullptr
 */
static void *SWITCH_THREAD_FUNC grpc_read_thread(switch_thread_t *thread, void *obj) {
  struct cap_cb *cb = (struct cap_cb *) obj;
  GStreamer* streamer = (GStreamer *) cb->streamer;

  bool connected = streamer->waitForConnect();
  if (!connected) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "verbio transcribe grpc read thread exiting since we didnt connect\n") ;
    return nullptr;
  }

  // Read responses.
  verbio_asr::RecognitionStreamingResponse response;
  while (streamer->read(&response)) {  // Returns false when no more to read.
    if (response.has_error()) {
      // handle error
      const auto& error = response.error();
      auto reason = error.reason();

      cJSON* json = cJSON_CreateObject();
      cJSON_AddStringToObject(json, "type", "error");
      cJSON_AddStringToObject(json, "error", reason.c_str());
      char* json_string = cJSON_PrintUnformatted(json);

      switch_core_session_t* session = switch_core_session_locate(cb->sessionId);
      const bool sessionFound = (session != nullptr);
      if (sessionFound) {
        cb->responseHandler(session, TRANSCRIBE_EVENT_ERROR, json_string, cb->bugname, cb->finished);
        switch_core_session_rwunlock(session);
      } else {
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "grpc_read_thread: session %s is gone!\n", cb->sessionId) ;
      }

      // clean: the JSON is released on every path, including a vanished session
      free(json_string);
      cJSON_Delete(json);

      if (!sessionFound) {
        return nullptr;
      }
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer recognition error %s\n", reason.c_str());
      break;
    }

    if (!response.has_result()) {
      // there is no available results yet.
      continue;
    }

    const auto& result = response.result();
    // skip results whose first alternative carries no words
    if (result.alternatives_size() > 0 && result.alternatives(0).words_size() == 0) {
      continue;
    }

    std::string json_string;
    google::protobuf::util::JsonPrintOptions options;
    options.always_print_primitive_fields = true;
    options.preserve_proto_field_names = true;
    auto status = google::protobuf::util::MessageToJsonString(result, &json_string, options);
    if (!status.ok()) {
      // a std::string cannot be passed through varargs: hand over its C string
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Cannot parse verbio result, error: %s\n", status.ToString().c_str()) ;
      continue;
    }

    switch_core_session_t* session = switch_core_session_locate(cb->sessionId);
    if (!session) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "grpc_read_thread: session %s is gone!\n", cb->sessionId) ;
      return nullptr;
    }
    cb->responseHandler(session, TRANSCRIBE_EVENT_RESULTS, json_string.c_str(), cb->bugname, cb->finished);
    switch_core_session_rwunlock(session);
  }
  return nullptr;
}
extern "C" {
/* Module-wide initialization hook; there is nothing to set up, so it always
 * reports SWITCH_STATUS_SUCCESS. */
switch_status_t verbio_speech_init(void)
{
  return SWITCH_STATUS_SUCCESS;
}
/* Module-wide cleanup hook; there is nothing to release, so it always
 * reports SWITCH_STATUS_SUCCESS. */
switch_status_t verbio_speech_cleanup(void)
{
  return SWITCH_STATUS_SUCCESS;
}
/**
 * Create the per-channel transcription state: read the Verbio settings from
 * channel variables (access token also from the environment), set up the
 * mutex and, when the codec rate is not 8000 Hz, a resampler, then create and
 * connect the streamer and start the thread reading its responses.
 *
 * @param session         session to transcribe
 * @param responseHandler callback used to publish results and errors
 * @param channels        number of audio channels to send
 * @param lang            language code; must not be empty
 * @param interim         non-zero to request interim results
 * @param bugname         name of the media bug the state belongs to
 * @param ppUserData      receives the created state on success
 * @return SWITCH_STATUS_SUCCESS, or SWITCH_STATUS_FALSE when a required input
 *         is missing or any resource could not be created
 */
switch_status_t verbio_speech_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
  uint32_t channels, char* lang, int interim, char* bugname, void **ppUserData) {

  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_memory_pool_t *pool = switch_core_session_get_pool(session);
  auto read_codec = switch_core_session_get_read_codec(session);
  uint32_t sampleRate = read_codec->implementation->actual_samples_per_second;
  struct cap_cb *cb;
  int err;

  // lang and bugname are copied below; refuse missing values instead of crashing
  if (zstr(lang) || zstr(bugname)) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "verbio_speech_session_init: missing language or bugname\n");
    return SWITCH_STATUS_FALSE;
  }

  cb =(struct cap_cb *) switch_core_session_alloc(session, sizeof(*cb));
  strncpy(cb->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
  strncpy(cb->bugname, bugname, MAX_BUG_LEN);
  cb->channels = channels;
  cb->interim = interim;
  cb->finished = 0;

  // releases what is not owned by the session pool when init has to bail out
  auto releaseResources = [cb]() {
    if (cb->resampler) {
      speex_resampler_destroy(cb->resampler);
      cb->resampler = NULL;
    }
    if (cb->mutex) {
      switch_mutex_destroy(cb->mutex);
      cb->mutex = NULL;
    }
  };

  // Read Verbio configuration from channel variables
  const char* var;
  const char* envToken = std::getenv("VERBIO_ACCESS_TOKEN");
  if ((var = switch_channel_get_variable(channel, "VERBIO_ACCESS_TOKEN"))) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Using channel vars for verbio authentication\n");
    strncpy(cb->access_token, var, LONG_TEXT_LEN);
  }
  else if (envToken) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Using env vars for verbio authentication\n");
    strncpy(cb->access_token, envToken, LONG_TEXT_LEN);
  }
  else {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "No channel vars or env vars for verbio authentication. Stop initiating Verbio connection\n");
    return SWITCH_STATUS_FALSE;
  }

  cb->enable_formatting = switch_true(switch_channel_get_variable(channel, "VERBIO_ENABLE_FORMATTING"));
  cb->enable_diarization = switch_true(switch_channel_get_variable(channel, "VERBIO_ENABLE_DIARIZATION"));
  strncpy(cb->language, lang, MAX_LANGUAGE_LEN);

  if ((var = switch_channel_get_variable(channel, "VERBIO_ENGINE_VERSION"))) {
    cb->engine_version = atoi(var);
  }
  if ((var = switch_channel_get_variable(channel, "VERBIO_TOPIC"))) {
    cb->topic = atoi(var);
  } else {
    cb->topic = 0;
  }
  if ((var = switch_channel_get_variable(channel, "VERBIO_INLINE_GRAMMAR"))) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "xhoaluu1 %s\n", var);
    strncpy(cb->inline_grammar, var, LONG_TEXT_LEN);
  }
  if ((var = switch_channel_get_variable(channel, "VERBIO_GRAMMAR_URI"))) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "xhoaluu2 %s\n", var);
    strncpy(cb->grammar_uri, var, LONG_TEXT_LEN);
  }
  if ((var = switch_channel_get_variable(channel, "VERBIO_LABEL"))) {
    strncpy(cb->label, var, MAX_SESSION_ID);
  }
  if ((var = switch_channel_get_variable(channel, "VERBIO_RECOGNITION_TIMEOUT"))) {
    cb->recognition_timeout = atoi(var);
  }
  if ((var = switch_channel_get_variable(channel, "VERBIO_SPEECH_COMPLETE_TIMEOUT"))) {
    cb->speech_complete_timeout = atoi(var);
  }
  if ((var = switch_channel_get_variable(channel, "VERBIO_SPEECH_INCOMPLETE_TIMEOUT"))) {
    cb->speech_incomplete_timeout = atoi(var);
  }

  if (switch_mutex_init(&cb->mutex, SWITCH_MUTEX_NESTED, pool) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing mutex\n");
    return SWITCH_STATUS_FALSE;
  }

  // the streamer always announces 8000 Hz audio, so other rates are resampled
  if (sampleRate != 8000) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "verbio_speech_session_init: initializing resampler\n");
    cb->resampler = speex_resampler_init(channels, sampleRate, 8000, SWITCH_RESAMPLE_QUALITY, &err);
    if (0 != err) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing resampler: %s.\n",
        switch_channel_get_name(channel), speex_resampler_strerror(err));
      releaseResources();
      return SWITCH_STATUS_FALSE;
    }
  } else {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: no resampling needed for this call\n", switch_channel_get_name(channel));
  }
  cb->responseHandler = responseHandler;

  GStreamer *streamer = NULL;
  try {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "verbio_speech_session_init: allocating streamer\n");
    streamer = new GStreamer(cb);
    cb->streamer = streamer;
  } catch (std::exception& e) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing gstreamer: %s.\n",
      switch_channel_get_name(channel), e.what());
    releaseResources();
    return SWITCH_STATUS_FALSE;
  }

  streamer->connect();

  // create the read thread
  switch_threadattr_t *thd_attr = NULL;
  switch_threadattr_create(&thd_attr, pool);
  switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE);
  if (switch_thread_create(&cb->thread, thd_attr, grpc_read_thread, cb, pool) != SWITCH_STATUS_SUCCESS) {
    // without a read thread cleanup would later join a thread that never existed
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error creating verbio read thread\n",
      switch_channel_get_name(channel));
    cb->thread = NULL;
    cb->streamer = NULL;
    delete streamer;
    releaseResources();
    return SWITCH_STATUS_FALSE;
  }

  *ppUserData = cb;
  return SWITCH_STATUS_SUCCESS;
}
/**
 * Tear down the transcription stored on the channel under bugname: signal the
 * end of audio, wait for the read thread, destroy the streamer, the resampler
 * and the mutex, and (unless the bug is already closing) remove the media bug.
 *
 * @param channelIsClosing non-zero when called from the bug's close
 *                         notification, in which case the bug is not removed
 * @return SWITCH_STATUS_SUCCESS, or SWITCH_STATUS_FALSE when no bug is stored
 *         under bugname
 */
switch_status_t verbio_speech_session_cleanup(switch_core_session_t *session, int channelIsClosing, char* bugname) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);

  if (!bug) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
    return SWITCH_STATUS_FALSE;
  }

  struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);
  switch_mutex_lock(cb->mutex);

  // unpublish first so a second cleanup for the same bug finds nothing to do
  switch_channel_set_private(channel, cb->bugname, NULL);

  // close connection and get final responses
  GStreamer* streamer = (GStreamer *) cb->streamer;
  if (streamer) {
    streamer->writesDone();
    cb->finished = 1;

    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "verbio_speech_session_cleanup: GStreamer (%p) waiting for read thread to complete\n", (void*)streamer);
    if (cb->thread) {
      switch_status_t st;
      switch_thread_join(&st, cb->thread);
      cb->thread = NULL;
    }
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "verbio_speech_session_cleanup: GStreamer (%p) read thread completed\n", (void*)streamer);

    delete streamer;
    cb->streamer = NULL;
  }

  if (cb->resampler) {
    speex_resampler_destroy(cb->resampler);
    // do not leave a pointer to the destroyed resampler behind
    cb->resampler = NULL;
  }

  if (!channelIsClosing) {
    switch_core_media_bug_remove(session, &bug);
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "verbio_speech_session_cleanup: Closed stream\n");

  switch_mutex_unlock(cb->mutex);
  switch_mutex_destroy(cb->mutex);
  cb->mutex = nullptr;

  return SWITCH_STATUS_SUCCESS;
}
/**
 * Drain the audio available on the media bug and send it to the streamer,
 * resampling first when the transcription has a resampler.
 *
 * Nothing is sent when the transcription has no streamer or when its mutex is
 * currently held elsewhere (for example by cleanup).
 *
 * @param user_data the struct cap_cb of the transcription
 * @return always SWITCH_TRUE
 */
switch_bool_t verbio_speech_frame(switch_media_bug_t *bug, void* user_data) {
  struct cap_cb *cb = (struct cap_cb *) user_data;

  if (!cb->streamer) {
    return SWITCH_TRUE;
  }

  uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
  switch_frame_t frame = {};
  frame.data = data;
  frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;

  if (switch_mutex_trylock(cb->mutex) != SWITCH_STATUS_SUCCESS) {
    return SWITCH_TRUE;
  }

  // read the streamer only once the lock is held: cleanup clears it under the same lock
  GStreamer* streamer = (GStreamer *) cb->streamer;
  const uint32_t channels = cb->channels ? cb->channels : 1;

  while (streamer && switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS && !switch_test_flag((&frame), SFF_CNG)) {
    if (!frame.datalen) {
      continue;
    }

    if (cb->resampler) {
      spx_int16_t out[SWITCH_RECOMMENDED_BUFFER_SIZE];
      // the interleaved resampler counts samples per channel, so the capacity
      // of out[] has to be divided between the channels
      spx_uint32_t out_len = SWITCH_RECOMMENDED_BUFFER_SIZE / channels;
      spx_uint32_t in_len = frame.samples;

      speex_resampler_process_interleaved_int(cb->resampler,
        (const spx_int16_t *) frame.data,
        (spx_uint32_t *) &in_len,
        &out[0],
        &out_len);

      // out_len samples were produced for each channel
      streamer->write( &out[0], sizeof(spx_int16_t) * out_len * channels);
    }
    else {
      // NOTE(review): assumes frame.samples covers everything in frame.data;
      // confirm this also holds for stereo bugs
      streamer->write( frame.data, sizeof(spx_int16_t) * frame.samples);
    }
  }

  switch_mutex_unlock(cb->mutex);
  return SWITCH_TRUE;
}
}

View File

@@ -0,0 +1,11 @@
#ifndef __VERBIO_GLUE_H__
#define __VERBIO_GLUE_H__
/* Module-wide initialization of the glue layer, called when the module loads. */
switch_status_t verbio_speech_init();
/* Module-wide cleanup of the glue layer, called when the module shuts down. */
switch_status_t verbio_speech_cleanup();
/* Create the transcription state for a session and start streaming; on
 * success *ppUserData receives the state to hand to the media bug.
 * Returns SWITCH_STATUS_FALSE on failure. */
switch_status_t verbio_speech_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
uint32_t channels, char* lang, int interim, char* bugname, void **ppUserData);
/* Tear down the transcription stored on the session's channel under bugname;
 * channelIsClosing is non-zero when the media bug is already being closed. */
switch_status_t verbio_speech_session_cleanup(switch_core_session_t *session, int channelIsClosing, char* bugname);
/* Forward the audio available on the media bug to the transcription passed
 * as user_data. */
switch_bool_t verbio_speech_frame(switch_media_bug_t *bug, void* user_data);
#endif

8
mod_verbio_tts/LICENSE Normal file
View File

@@ -0,0 +1,8 @@
Copyright 2023, Drachtio Communications Services, LLC
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@@ -0,0 +1,10 @@
include $(top_srcdir)/build/modmake.rulesam
MODNAME=mod_verbio_tts
mod_LTLIBRARIES = mod_verbio_tts.la
mod_verbio_tts_la_SOURCES = mod_verbio_tts.c verbio_glue.cpp
mod_verbio_tts_la_CFLAGS = $(AM_CFLAGS)
mod_verbio_tts_la_CXXFLAGS = $(AM_CXXFLAGS) -std=c++17
mod_verbio_tts_la_LIBADD = $(switch_builddir)/libfreeswitch.la
mod_verbio_tts_la_LDFLAGS = -avoid-version -module -no-undefined -shared -lstdc++ -lboost_system -lboost_thread

3
mod_verbio_tts/README.md Normal file
View File

@@ -0,0 +1,3 @@
# mod_verbio_tts
A Freeswitch module that plays text-to-speech audio streamed from Verbio.

View File

@@ -0,0 +1,153 @@
#include "mod_verbio_tts.h"
#include "verbio_glue.h"
SWITCH_MODULE_LOAD_FUNCTION(mod_verbio_tts_load);
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_verbio_tts_shutdown);
SWITCH_MODULE_DEFINITION(mod_verbio_tts, mod_verbio_tts_load, mod_verbio_tts_shutdown, NULL);
/**
 * Release the heap-allocated members of a verbio_t and reset them to NULL.
 *
 * The per-request members are always released; the voice name and the session
 * id are only released when freeAll is non-zero.
 */
static void clearverbio(verbio_t* v, int freeAll) {
	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "clearverbio\n");

	/* free(NULL) is a no-op, so members that were never set need no guard */
	free(v->access_token);
	v->access_token = NULL;

	free(v->ct);
	v->ct = NULL;

	free(v->err_msg);
	v->err_msg = NULL;

	free(v->name_lookup_time_ms);
	v->name_lookup_time_ms = NULL;

	free(v->connect_time_ms);
	v->connect_time_ms = NULL;

	free(v->final_response_time_ms);
	v->final_response_time_ms = NULL;

	free(v->cache_filename);
	v->cache_filename = NULL;

	if (!freeAll) {
		return;
	}

	free(v->voice_name);
	v->voice_name = NULL;

	free(v->session_id);
	v->session_id = NULL;
}
/**
 * Return the verbio_t attached to the speech handle, creating it on first use.
 *
 * A new instance is allocated from the handle's memory pool, zeroed, given a
 * nested mutex and stored in sh->private_info.
 */
static verbio_t * createOrRetrievePrivateData(switch_speech_handle_t *sh) {
	verbio_t *existing = (verbio_t *) sh->private_info;
	verbio_t *created;

	if (existing) {
		return existing;
	}

	created = switch_core_alloc(sh->memory_pool, sizeof(*created));
	sh->private_info = created;
	memset(created, 0, sizeof(*created));
	switch_mutex_init(&created->mutex, SWITCH_MUTEX_NESTED, sh->memory_pool);
	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "allocated verbio_t\n");

	return created;
}
/**
 * Open the speech handle: remember the requested voice and sample rate, then
 * let the glue layer open its side.
 *
 * @param voice_name voice to use; may be NULL, in which case no voice is stored
 * @param rate       sample rate stored on the handle's private data
 * @return the status of verbio_speech_open()
 */
switch_status_t v_speech_open(switch_speech_handle_t *sh, const char *voice_name, int rate, int channels, switch_speech_flag_t *flags)
{
	verbio_t *v = createOrRetrievePrivateData(sh);

	/* the private data may already carry a voice: release it before replacing it */
	free(v->voice_name);
	v->voice_name = voice_name ? strdup(voice_name) : NULL;
	v->rate = rate;

	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "v_speech_open voice: %s, rate %d, channels %d\n",
		voice_name ? voice_name : "(null)", rate, channels);

	return verbio_speech_open(v);
}
/**
 * Speech interface "close" callback: destroys the per-handle mutex, lets the
 * glue layer close, then releases every string held in the private data.
 *
 * @return status returned by verbio_speech_close()
 */
static switch_status_t v_speech_close(switch_speech_handle_t *sh, switch_speech_flag_t *flags)
{
  verbio_t *priv = createOrRetrievePrivateData(sh);
  switch_status_t status;

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "v_speech_close\n");

  switch_mutex_destroy(priv->mutex);
  status = verbio_speech_close(priv);
  clearverbio(priv, 1);

  return status;
}
/**
 * FreeSWITCH calls this function to hand us the text to speak.
 *
 * Resets the per-request state (draining flag, read counter, response code
 * and error message) before passing the text to verbio_speech_feed_tts().
 *
 * @return status returned by verbio_speech_feed_tts()
 */
static switch_status_t v_speech_feed_tts(switch_speech_handle_t *sh, char *text, switch_speech_flag_t *flags)
{
  verbio_t *v = createOrRetrievePrivateData(sh);

  v->draining = 0;
  v->reads = 0;
  v->response_code = 0;

  /* err_msg is heap-allocated by the glue code; release it instead of just dropping the pointer */
  free(v->err_msg);
  v->err_msg = NULL;

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "v_speech_feed_tts\n");

  return verbio_speech_feed_tts(v, text, flags);
}
/**
 * FreeSWITCH calls this periodically to fetch rendered audio; the request is
 * forwarded unchanged to verbio_speech_read_tts().
 *
 * @param data     destination buffer
 * @param datalen  in: capacity of data in bytes; out: set by the glue layer
 * @return status returned by verbio_speech_read_tts()
 */
static switch_status_t v_speech_read_tts(switch_speech_handle_t *sh, void *data, size_t *datalen, switch_speech_flag_t *flags)
{
  verbio_t *priv = createOrRetrievePrivateData(sh);
  switch_status_t status = verbio_speech_read_tts(priv, data, datalen, flags);

  return status;
}
/**
 * Called when the current utterance is finished: lets the glue layer flush
 * its state, then releases the per-request strings while keeping the voice
 * name and session id for the next utterance.
 */
static void v_speech_flush_tts(switch_speech_handle_t *sh)
{
  verbio_t *priv = createOrRetrievePrivateData(sh);

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "v_speech_flush_tts\n");

  verbio_speech_flush_tts(priv);
  clearverbio(priv, 0);
}
/**
 * Speech interface text-parameter callback.
 *
 * Recognised parameters:
 *   - "access_token":     bearer token used for the Verbio request
 *   - "voice":            replaces the voice chosen at open time
 *   - "session-uuid":     uuid of the session that receives playback events
 *   - "write_cache_file": when true, the received audio is also written to a cache file
 *
 * Unknown parameters are ignored. A NULL name or value is ignored as well.
 */
static void v_text_param_tts(switch_speech_handle_t *sh, char *param, const char *val)
{
  verbio_t *v = createOrRetrievePrivateData(sh);

  if (!param || !val) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "v_text_param_tts: ignoring parameter with missing name or value\n");
    return;
  }

  if (0 == strcmp(param, "access_token")) {
    /* the token is a credential: never write its value to the log */
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "v_text_param_tts: %s=<redacted>\n", param);
    free(v->access_token);
    v->access_token = strdup(val);
    return;
  }

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "v_text_param_tts: %s=%s\n", param, val);

  if (0 == strcmp(param, "voice")) {
    free(v->voice_name);
    v->voice_name = strdup(val);
  } else if (0 == strcmp(param, "session-uuid")) {
    free(v->session_id);
    v->session_id = strdup(val);
  } else if (0 == strcmp(param, "write_cache_file") && switch_true(val)) {
    v->cache_audio = 1;
  }
}
/** Numeric parameters are not used by this module; the call is a no-op. */
static void v_numeric_param_tts(switch_speech_handle_t *sh, char *param, int val)
{
  (void) sh;
  (void) param;
  (void) val;
}
/** Floating-point parameters are not used by this module; the call is a no-op. */
static void v_float_param_tts(switch_speech_handle_t *sh, char *param, double val)
{
  (void) sh;
  (void) param;
  (void) val;
}
/**
 * Module load entry point: creates the module interface, registers a speech
 * interface named "verbio" wired to the v_* callbacks in this file, and then
 * initialises the glue layer.
 *
 * @return status returned by verbio_speech_load()
 */
SWITCH_MODULE_LOAD_FUNCTION(mod_verbio_tts_load)
{
  switch_speech_interface_t *iface;

  *module_interface = switch_loadable_module_create_module_interface(pool, modname);
  iface = switch_loadable_module_create_interface(*module_interface, SWITCH_SPEECH_INTERFACE);

  iface->interface_name = "verbio";

  /* handle lifecycle */
  iface->speech_open = v_speech_open;
  iface->speech_close = v_speech_close;

  /* audio path */
  iface->speech_feed_tts = v_speech_feed_tts;
  iface->speech_read_tts = v_speech_read_tts;
  iface->speech_flush_tts = v_speech_flush_tts;

  /* parameter setters */
  iface->speech_text_param_tts = v_text_param_tts;
  iface->speech_numeric_param_tts = v_numeric_param_tts;
  iface->speech_float_param_tts = v_float_param_tts;

  return verbio_speech_load();
}
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_verbio_tts_shutdown)
{
return verbio_speech_unload();
}

View File

@@ -0,0 +1,33 @@
#ifndef __MOD_VERBIO_TTS_H__
#define __MOD_VERBIO_TTS_H__
#include <switch.h>
#include <speex/speex_resampler.h>
/*
 * Per-speech-handle state shared between the FreeSWITCH module
 * (mod_verbio_tts.c) and the Verbio glue code (verbio_glue.cpp).
 * All char* members are heap strings owned by this struct.
 */
typedef struct verbio_data {
char *voice_name; /* voice requested at open or via the "voice" param; sent as "voice_id" */
char *access_token; /* bearer token set via the "access_token" param */
/* result data */
long response_code; /* HTTP status of the current request, 0 until known */
char *ct; /* content type reported by curl for the response */
char *name_lookup_time_ms; /* curl name-lookup time, in milliseconds, as a string */
char *connect_time_ms; /* curl connect time, in milliseconds, as a string */
char *final_response_time_ms; /* curl total transfer time, in milliseconds, as a string */
char *err_msg; /* response body captured when the status is not 200 */
char *cache_filename; /* path of the audio cache file for the current request, if any */
char *session_id; /* session uuid set via the "session-uuid" param; used to fire playback events */
int rate; /* sample rate requested at open; audio is resampled from 8000 when it differs */
int draining; /* set to 1 once the transfer has been cleaned up; reset on each feed */
int reads; /* number of body chunks received for the current request */
int cache_audio; /* non-zero once "write_cache_file" was set to a true value */
void *conn; /* ConnInfo_t* of the in-flight request (opaque to C code), NULL when none */
void *circularBuffer; /* CircularBuffer_t* holding received samples (opaque to C code) */
switch_mutex_t *mutex; /* taken by the glue code around buffer and response-code access */
FILE *file; /* cache file stream opened by the glue code, copied into the connection */
SpeexResamplerState *resampler; /* created by the glue code when rate != 8000, otherwise NULL */
} verbio_t;
#endif

View File

@@ -0,0 +1,937 @@
#include "mod_verbio_tts.h"
#include <switch.h>
#include <switch_json.h>
#include <curl/curl.h>
#include <cstdlib>
#include <boost/circular_buffer.hpp>
#include <boost/thread.hpp>
#include <boost/asio.hpp>
#include <boost/asio/ssl.hpp>
#include <boost/pool/object_pool.hpp>
#include <boost/bind/bind.hpp>
#include <boost/tokenizer.hpp>
#include <boost/foreach.hpp>
#include <boost/asio.hpp>
#include <boost/assign/list_of.hpp>
#include <boost/algorithm/string.hpp>
#include <speex/speex_resampler.h>
/* Initial capacity of the sample buffer, and the minimum amount (in samples) it grows by. */
#define BUFFER_GROW_SIZE (80000)
/* Queue of 16-bit samples filled by the curl write callback and drained by the read function. */
typedef boost::circular_buffer<uint16_t> CircularBuffer_t;
/* Global information, common to all connections */
typedef struct
{
CURLM *multi; /* curl multi handle created when the module loads */
int still_running; /* running-transfer count written by curl_multi_socket_action() */
} GlobalInfo_t;
/* The single instance used by every callback in this file. */
static GlobalInfo_t global;
/* Information associated with a specific easy handle */
typedef struct
{
CURL *easy; /* the easy handle performing this request */
verbio_t* verbio; /* per-speech-handle state this request belongs to */
char* body; /* JSON request body handed to CURLOPT_POSTFIELDS */
struct curl_slist *hdr_list; /* request headers handed to CURLOPT_HTTPHEADER */
GlobalInfo_t *global; /* back-pointer to the shared multi-handle state */
char error[CURL_ERROR_SIZE]; /* buffer registered with CURLOPT_ERRORBUFFER */
FILE* file; /* audio cache file, or NULL when caching is off */
std::chrono::time_point<std::chrono::high_resolution_clock> startTime; /* when the request was queued */
bool flushed; /* set by flush; makes the write callback abort the transfer */
bool has_last_byte; /* true when last_byte holds half a sample from the previous chunk */
uint8_t last_byte; /* trailing odd byte carried over to the next chunk */
} ConnInfo_t;
/* Allocator for the ConnInfo_t record of each request. */
static boost::object_pool<ConnInfo_t> pool ;
/* Maps the socket descriptors given to libcurl to the asio socket objects created in opensocket(). */
static std::map<curl_socket_t, boost::asio::ip::tcp::socket *> socket_map;
/* Event loop run by the worker thread. */
static boost::asio::io_service io_service;
/* Timer used to schedule the timeouts libcurl asks for. */
static boost::asio::deadline_timer timer(io_service);
/* Directory for audio cache files; empty when it could not be created. */
static std::string fullDirPath;
/* Thread running io_service; started on load and joined on unload. */
static std::thread worker_thread;
/**
 * Format a duration given in seconds as a whole number of milliseconds.
 * The fractional millisecond part is truncated, not rounded.
 */
std::string secondsToMillisecondsString(double seconds) {
  const long whole_ms = static_cast<long>(seconds * 1000.0);
  return std::to_string(whole_ms);
}
/**
 * Create a curl easy handle with the options shared by every request.
 *
 * @return the new handle, or nullptr when curl_easy_init() fails
 */
static CURL* createEasyHandle(void) {
  CURL* handle = curl_easy_init();

  if (nullptr == handle) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "curl_easy_init() failed!\n");
    return nullptr ;
  }

  /* 3 second connect timeout, 10 second limit for the whole transfer */
  curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT_MS, 3000L);
  curl_easy_setopt(handle, CURLOPT_TIMEOUT, 10L);

  curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(handle, CURLOPT_USERAGENT, "jambonz/0.8.5");

  return handle ;
}
/**
 * Release everything owned by a finished connection and return the record to
 * the pool. Also detaches the connection from its verbio_t and marks it as
 * draining so the reader stops once the buffered audio is consumed.
 *
 * The caller must already have removed conn->easy from the multi handle.
 */
static void cleanupConn(ConnInfo_t *conn) {
  auto v = conn->verbio;

  if( conn->hdr_list ) {
    curl_slist_free_all(conn->hdr_list);
    conn->hdr_list = nullptr ;
  }
  curl_easy_cleanup(conn->easy);

  /* CURLOPT_POSTFIELDS does not copy the body, so it had to outlive the
   * transfer; it was allocated by cJSON_PrintUnformatted and is freed here */
  if (conn->body) {
    free(conn->body);
    conn->body = nullptr ;
  }

  if (conn->file) {
    if (fclose(conn->file) != 0) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "cleanupConn: error closing audio cache file\n");
    }
    conn->file = nullptr ;
  }

  v->conn = nullptr ;
  v->draining = 1;

  memset(conn, 0, sizeof(ConnInfo_t));
  pool.destroy(conn) ;
}
/* Check for completed transfers, and remove their easy handles */
/**
 * Drains the multi handle's message queue. For every finished transfer the
 * response code, content type and timing figures are copied into the owning
 * verbio_t, the easy handle is removed from the multi handle, and the
 * connection is cleaned up.
 */
void check_multi_info(GlobalInfo_t *g) {
  CURLMsg *msg;
  int msgs_left;

  /* fractional part of a duration in seconds, expressed in microseconds */
  auto micros = [](double secs) -> long {
    return (long)((secs - (double)(long) secs) * 1000000);
  };

  while((msg = curl_multi_info_read(g->multi, &msgs_left))) {
    if(msg->msg != CURLMSG_DONE) {
      continue;
    }

    CURL *easy = msg->easy_handle;
    const CURLcode res = msg->data.result;
    ConnInfo_t *conn = nullptr;
    long response_code = 0;
    double namelookup=0, connect=0, total=0 ;
    char *ct = NULL ;

    curl_easy_getinfo(easy, CURLINFO_PRIVATE, &conn);
    curl_easy_getinfo(easy, CURLINFO_RESPONSE_CODE, &response_code);
    curl_easy_getinfo(easy, CURLINFO_CONTENT_TYPE, &ct);
    curl_easy_getinfo(easy, CURLINFO_NAMELOOKUP_TIME, &namelookup);
    curl_easy_getinfo(easy, CURLINFO_CONNECT_TIME, &connect);
    curl_easy_getinfo(easy, CURLINFO_TOTAL_TIME, &total);

    if (res != CURLE_OK) {
      /* a transfer aborted by flush is expected; anything else is a real failure */
      const bool aborted_by_flush = conn && conn->flushed;
      switch_log_printf(SWITCH_CHANNEL_LOG, aborted_by_flush ? SWITCH_LOG_DEBUG : SWITCH_LOG_ERROR,
        "mod_verbio_tts: transfer ended with error: %s\n", curl_easy_strerror(res));
    }

    if (!conn) {
      /* no bookkeeping attached: just release the handle */
      curl_multi_remove_handle(g->multi, easy);
      curl_easy_cleanup(easy);
      continue;
    }

    auto v = conn->verbio;
    v->response_code = response_code;
    if (ct) {
      free(v->ct);
      v->ct = strdup(ct);
    }

    std::string name_lookup_ms = secondsToMillisecondsString(namelookup);
    std::string connect_ms = secondsToMillisecondsString(connect);
    std::string final_response_time_ms = secondsToMillisecondsString(total);

    /* every argument is a long, so every conversion is %ld; values are seconds.microseconds */
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG,
      "mod_verbio_tts: response: %ld, content-type %s, "
      "dns(s): %ld.%06ld, "
      "connect(s): %ld.%06ld, "
      "total(s): %ld.%06ld\n",
      response_code, ct ? ct : "(none)",
      (long)(namelookup), micros(namelookup),
      (long)(connect), micros(connect),
      (long)(total), micros(total));

    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "name lookup time: %s\n", name_lookup_ms.c_str());
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "connect time: %s\n", connect_ms.c_str());
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "final response time: %s\n", final_response_time_ms.c_str());

    /* release earlier values before replacing them */
    free(v->name_lookup_time_ms);
    v->name_lookup_time_ms = strdup(name_lookup_ms.c_str());
    free(v->connect_time_ms);
    v->connect_time_ms = strdup(connect_ms.c_str());
    free(v->final_response_time_ms);
    v->final_response_time_ms = strdup(final_response_time_ms.c_str());

    curl_multi_remove_handle(g->multi, easy);
    cleanupConn(conn);
  }
}
int mcode_test(const char *where, CURLMcode code) {
if(CURLM_OK != code) {
const char *s;
switch(code) {
case CURLM_CALL_MULTI_PERFORM:
s = "CURLM_CALL_MULTI_PERFORM";
break;
case CURLM_BAD_HANDLE:
s = "CURLM_BAD_HANDLE";
break;
case CURLM_BAD_EASY_HANDLE:
s = "CURLM_BAD_EASY_HANDLE";
break;
case CURLM_OUT_OF_MEMORY:
s = "CURLM_OUT_OF_MEMORY";
break;
case CURLM_INTERNAL_ERROR:
s = "CURLM_INTERNAL_ERROR";
break;
case CURLM_UNKNOWN_OPTION:
s = "CURLM_UNKNOWN_OPTION";
break;
case CURLM_LAST:
s = "CURLM_LAST";
break;
default:
s = "CURLM_unknown";
break;
case CURLM_BAD_SOCKET:
s = "CURLM_BAD_SOCKET";
break;
}
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "mcode_test ERROR: %s returns %s:%d\n", where, s, code);
return -1;
}
return 0 ;
}
/**
 * Release the per-socket action slot allocated in addsock().
 * A NULL pointer is accepted; g is unused.
 */
static void remsock(int *f, GlobalInfo_t *g) {
  (void) g;
  /* free(NULL) is a no-op */
  free(f);
}
/* Called by asio when there is an action on a socket */
/*
 * g      - shared multi-handle state
 * s      - socket the event belongs to
 * action - CURL_POLL_IN or CURL_POLL_OUT, whichever wait completed
 * error  - asio result of the wait
 * fdp    - slot holding the action libcurl most recently asked for on this
 *          socket (written by setsock() and sock_cb())
 */
static void event_cb(GlobalInfo_t *g, curl_socket_t s, int action, const boost::system::error_code & error, int *fdp) {
int f = *fdp;
//switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "event_cb socket %#X has action %d\n", s, action) ;
// Socket already POOL REMOVED.
/* sock_cb() only marks the slot as removed; the slot itself is released here */
if (f == CURL_POLL_REMOVE) {
//switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "event_cb socket %#X removed\n", s);
remsock(fdp, g);
return;
}
/* the socket object was deleted by close_socket(); nothing left to service */
if(socket_map.find(s) == socket_map.end()) {
//switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "event_cb: socket %#X already closed\n, s");
return;
}
/* make sure the event matches what are wanted */
if(f == action || f == CURL_POLL_INOUT) {
/* report a failed wait to libcurl as an error on the socket */
if(error) {
action = CURL_CSELECT_ERR;
}
CURLMcode rc = curl_multi_socket_action(g->multi, s, action, &g->still_running);
mcode_test("event_cb: curl_multi_socket_action", rc);
check_multi_info(g);
/* no transfers left: the pending timeout is no longer needed */
if(g->still_running <= 0) {
timer.cancel();
}
/* keep on watching.
* the socket may have been closed and/or fdp may have been changed
* in curl_multi_socket_action(), so check them both */
if(!error && socket_map.find(s) != socket_map.end() &&
(f == action || f == CURL_POLL_INOUT)) {
boost::asio::ip::tcp::socket *tcp_socket = socket_map.find(s)->second;
/* re-arm the same kind of wait that just completed */
if(action == CURL_POLL_IN) {
tcp_socket->async_read_some(boost::asio::null_buffers(),
boost::bind(&event_cb, g, s,
action, boost::placeholders::_1, fdp));
}
if(action == CURL_POLL_OUT) {
tcp_socket->async_write_some(boost::asio::null_buffers(),
boost::bind(&event_cb, g, s,
action, boost::placeholders::_1, fdp));
}
}
}
}
/* socket functions */
static void setsock(int *fdp, curl_socket_t s, CURL *e, int act, int oldact, GlobalInfo_t *g) {
std::map<curl_socket_t, boost::asio::ip::tcp::socket *>::iterator it = socket_map.find(s);
if(it == socket_map.end()) {
//switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "setsock: socket %#X not found\n, s");
return;
}
boost::asio::ip::tcp::socket * tcp_socket = it->second;
*fdp = act;
if(act == CURL_POLL_IN) {
if(oldact != CURL_POLL_IN && oldact != CURL_POLL_INOUT) {
tcp_socket->async_read_some(boost::asio::null_buffers(),
boost::bind(&event_cb, g, s,
CURL_POLL_IN, boost::placeholders::_1, fdp));
}
}
else if(act == CURL_POLL_OUT) {
if(oldact != CURL_POLL_OUT && oldact != CURL_POLL_INOUT) {
tcp_socket->async_write_some(boost::asio::null_buffers(),
boost::bind(&event_cb, g, s,
CURL_POLL_OUT, boost::placeholders::_1, fdp));
}
}
else if(act == CURL_POLL_INOUT) {
if(oldact != CURL_POLL_IN && oldact != CURL_POLL_INOUT) {
tcp_socket->async_read_some(boost::asio::null_buffers(),
boost::bind(&event_cb, g, s,
CURL_POLL_IN, boost::placeholders::_1, fdp));
}
if(oldact != CURL_POLL_OUT && oldact != CURL_POLL_INOUT) {
tcp_socket->async_write_some(boost::asio::null_buffers(),
boost::bind(&event_cb, g, s,
CURL_POLL_OUT, boost::placeholders::_1, fdp));
}
}
}
/**
 * Start tracking a socket libcurl has not reported before: allocates the slot
 * that stores its current action, arms the requested waits and attaches the
 * slot to the socket inside the multi handle.
 */
static void addsock(curl_socket_t s, CURL *easy, int action, GlobalInfo_t *g) {
  /* fdp is used to store current action */
  int *fdp = (int *) calloc(1, sizeof(int));
  if (!fdp) {
    /* setsock() writes through fdp, so an allocation failure must stop here */
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "addsock: out of memory\n");
    return;
  }

  setsock(fdp, s, easy, action, 0, g);
  curl_multi_assign(g->multi, s, fdp);
}
/**
 * CURLMOPT_SOCKETFUNCTION callback: libcurl reports which events it wants to
 * wait for on a socket.
 *
 * @param e      easy handle the socket belongs to
 * @param s      the socket
 * @param what   CURL_POLL_* action requested
 * @param cbp    CURLMOPT_SOCKETDATA pointer (unused; the file-level state is used)
 * @param sockp  slot attached with curl_multi_assign(), or NULL for a new socket
 * @return always 0
 */
static int sock_cb(CURL *e, curl_socket_t s, int what, void *cbp, void *sockp) {
  GlobalInfo_t *g = &global;
  int *actionp = (int *) sockp;

  if(what == CURL_POLL_REMOVE) {
    /* only mark the slot; event_cb() releases it. A socket that never got a
     * slot has nothing to mark. */
    if(actionp) {
      *actionp = what;
    }
  }
  else if(!actionp) {
    addsock(s, e, what, g);
  }
  else {
    setsock(actionp, s, e, what, *actionp, g);
  }
  return 0;
}
static void threadFunc() {
/* to make sure the event loop doesn't terminate when there is no work to do */
io_service.reset() ;
boost::asio::io_service::work work(io_service);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_verbio_tts threadFunc - starting\n");
for(;;) {
try {
io_service.run() ;
break ;
}
catch( std::exception& e) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "mod_verbio_tts threadFunc - Error: %s\n", e.what());
}
}
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_verbio_tts threadFunc - ending\n");
}
/* Called by asio when our timeout expires */
/**
 * Tells libcurl that its timeout elapsed and then collects any transfers
 * that finished. Does nothing when the wait ended with an error.
 */
static void timer_cb(const boost::system::error_code & error, GlobalInfo_t *g)
{
  if (error) {
    return;
  }

  const CURLMcode result = curl_multi_socket_action(g->multi, CURL_SOCKET_TIMEOUT, 0, &g->still_running);
  mcode_test("timer_cb: curl_multi_socket_action", result);
  check_multi_info(g);
}
/**
 * CURLMOPT_TIMERFUNCTION callback: (re)schedules the timeout libcurl asks for.
 *
 * @param multi       multi handle (unused)
 * @param timeout_ms  requested delay; a negative value only cancels the timer
 * @param g           shared state passed to timer_cb()
 * @return always 0
 */
int multi_timer_cb(CURLM *multi, long timeout_ms, GlobalInfo_t *g) {
  /* whatever was scheduled before is obsolete now */
  timer.cancel();

  if (timeout_ms < 0) {
    return 0;
  }

  /* libcurl must not be re-entered from inside this callback, so a zero
   * timeout is turned into a 1 ms wait instead of an immediate call */
  const long delay_ms = (0 == timeout_ms) ? 1 : timeout_ms;
  timer.expires_from_now(boost::posix_time::millisec(delay_ms));
  timer.async_wait(boost::bind(&timer_cb, boost::placeholders::_1, g));

  return 0;
}
/* CURLOPT_WRITEFUNCTION */
/**
 * Receives a chunk of the response body.
 *
 * For a 200 response the chunk is treated as 16-bit PCM: a trailing odd byte
 * is carried over to the next call, the leading WAV header is skipped on the
 * first chunk, and the samples are appended to the circular buffer (and to
 * the cache file when one is open). On the first chunk a playback-start event
 * is fired on the session named by v->session_id.
 *
 * For any other response code the chunk is stored as the error message and
 * the transfer is aborted.
 *
 * @return the number of bytes received, or 0 to abort the transfer
 */
static size_t write_cb(void *ptr, size_t size, size_t nmemb, ConnInfo_t *conn) {
  /* Verbio returns a PCM linear16 WAV file whose header is 44 bytes long */
  const size_t WAV_HEADER_BYTES = 44;
  bool fireEvent = false;
  uint8_t *data = (uint8_t *) ptr;
  size_t bytes_received = size * nmemb;
  size_t total_bytes_to_process;
  auto v = conn->verbio;
  CircularBuffer_t *cBuffer = (CircularBuffer_t *) v->circularBuffer;

  if (conn->flushed || cBuffer == nullptr) {
    /* this will abort the transfer */
    return 0;
  }

  // Buffer to hold combined data if there is unprocessed byte from the last call.
  std::unique_ptr<uint8_t[]> combinedData;
  if (conn->has_last_byte) {
    conn->has_last_byte = false; // We'll handle the last_byte now, so toggle the flag off
    combinedData.reset(new uint8_t[bytes_received + 1]);
    // Prepend the last byte from previous call, then the new data
    combinedData[0] = conn->last_byte;
    memcpy(combinedData.get() + 1, data, bytes_received);
    data = combinedData.get();
    total_bytes_to_process = bytes_received + 1;
  } else {
    total_bytes_to_process = bytes_received;
  }

  // If we now have an odd total, save the last byte for next time
  if ((total_bytes_to_process % sizeof(int16_t)) != 0) {
    conn->last_byte = data[total_bytes_to_process - 1];
    conn->has_last_byte = true;
    total_bytes_to_process--;
  }

  int16_t* inputData = reinterpret_cast<int16_t*>(data);
  if (0 == v->reads++) {
    fireEvent = true;
    /* Skip the WAV header. The first chunk may be shorter than the header
     * (or empty), so never skip more than was received: an unsigned
     * subtraction would wrap around and the copy below would run far past
     * the end of the chunk. */
    const size_t skip = std::min(total_bytes_to_process, WAV_HEADER_BYTES);
    inputData += skip / sizeof(int16_t);
    total_bytes_to_process -= skip;
  }
  size_t numSamples = total_bytes_to_process / sizeof(int16_t);

  {
    switch_mutex_lock(v->mutex);

    if (v->response_code > 0 && v->response_code != 200) {
      std::string body((char *) ptr, bytes_received);
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "write_cb: received body %s\n", body.c_str());
      /* release a message stored by an earlier chunk before replacing it */
      free(v->err_msg);
      v->err_msg = strdup(body.c_str());
      switch_mutex_unlock(v->mutex);
      return 0;
    }

    /* the cache file holds the raw 16-bit PCM samples, header removed */
    if (conn->file) fwrite(inputData, sizeof(int16_t), numSamples, conn->file);

    // Resize the buffer if necessary
    if (cBuffer->capacity() - cBuffer->size() < numSamples) {
      //TODO: if buffer exceeds some max size, return CURL_WRITEFUNC_ERROR to abort the transfer
      cBuffer->set_capacity(cBuffer->size() + std::max(numSamples, (size_t)BUFFER_GROW_SIZE));
    }

    /* Push the data into the buffer */
    cBuffer->insert(cBuffer->end(), inputData, inputData + numSamples);

    switch_mutex_unlock(v->mutex);
  }

  if (fireEvent && v->session_id) {
    auto endTime = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - conn->startTime);
    auto time_to_first_byte_ms = std::to_string(duration.count());
    switch_core_session_t* session = switch_core_session_locate(v->session_id);
    if (session) {
      switch_channel_t *channel = switch_core_session_get_channel(session);
      if (channel) {
        switch_event_t *event;
        if (switch_event_create(&event, SWITCH_EVENT_PLAYBACK_START) == SWITCH_STATUS_SUCCESS) {
          switch_channel_event_set_data(channel, event);
          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "write_cb: firing playback-started\n");
          switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "Playback-File-Type", "tts_stream");
          if (v->name_lookup_time_ms) {
            switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_verbio_name_lookup_time_ms", v->name_lookup_time_ms);
          }
          if (v->connect_time_ms) {
            switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_verbio_connect_time_ms", v->connect_time_ms);
          }
          if (v->final_response_time_ms) {
            switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_verbio_final_response_time_ms", v->final_response_time_ms);
          }
          if (v->voice_name) {
            switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_verbio_voice_name", v->voice_name);
          }
          if (v->cache_filename) {
            switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_cache_filename", v->cache_filename);
          }
          switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_time_to_first_byte_ms", time_to_first_byte_ms.c_str());
          switch_event_fire(&event);
        }
        else {
          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "write_cb: failed to create event\n");
        }
      }
      else {
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "write_cb: channel not found\n");
      }
      /* the channel belongs to the session, so the read lock taken by
       * switch_core_session_locate() is held until we are done with it */
      switch_core_session_rwunlock(session);
    }
    else {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "write_cb: session %s not found\n", v->session_id);
    }
  }
  return size*nmemb;
}
/**
 * Split a raw header line into its name and value.
 *
 * The line is split at the first colon only, so values that contain colons
 * themselves (dates, URLs) are kept intact. Surrounding whitespace, including
 * the trailing CRLF, is removed from both parts.
 *
 * @param str     raw header line
 * @param header  receives the header name on success
 * @param value   receives the header value on success (may be empty)
 * @return true when the line has a non-empty name followed by a colon;
 *         on false, header and value are left untouched
 */
static bool parseHeader(const std::string& str, std::string& header, std::string& value) {
  static const char *const kWhitespace = " \t\r\n\v\f";

  auto trim = [](const std::string& s) -> std::string {
    const std::size_t first = s.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
      return std::string();
    }
    const std::size_t last = s.find_last_not_of(kWhitespace);
    return s.substr(first, last - first + 1);
  };

  const std::size_t colon = str.find(':');
  if (colon == std::string::npos) {
    return false;
  }

  const std::string name = trim(str.substr(0, colon));
  if (name.empty()) {
    return false;
  }

  header = name;
  value = trim(str.substr(colon + 1));
  return true;
}
/**
 * Extract the numeric status code from an HTTP status line: the token that
 * follows the first space, up to the next space or the end of the line.
 *
 * @return the status code, or 0 (after logging) when the line has no space
 * @throws whatever std::stoi throws when the token is not a valid integer
 */
static int extract_response_code(const std::string& input) {
  const std::size_t first_space = input.find(' ');
  if (first_space == std::string::npos) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Invalid HTTP response format %s\n", input.c_str());
    return 0;
  }

  const std::size_t begin = first_space + 1;
  std::size_t end = input.find(' ', begin);
  if (end == std::string::npos) {
    end = input.length();
  }

  return std::stoi(input.substr(begin, end - begin));
}
static size_t header_callback(char *buffer, size_t size, size_t nitems, ConnInfo_t *conn) {
size_t bytes_received = size * nitems;
const std::string prefix = "HTTP/";
verbio_t* v = conn->verbio;
std::string header, value;
std::string input(buffer, bytes_received);
if (parseHeader(input, header, value)) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "recv header: %s with value %s\n", header.c_str(), value.c_str());
}
else {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "header_callback: %s\n", input.c_str());
if (input.rfind(prefix, 0) == 0) {
try {
v->response_code = extract_response_code(input);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "header_callback: parsed response code: %ld\n", v->response_code);
} catch (const std::invalid_argument& e) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "header_callback: invalid response code %s\n", input.substr(prefix.length()).c_str());
}
}
}
return bytes_received;
}
/* CURLOPT_OPENSOCKETFUNCTION */
/**
 * Creates the socket libcurl will use, as an asio socket so that its events
 * can be delivered through the io_service. Only IPv4 connection sockets are
 * supported.
 *
 * @return the native socket descriptor, or CURL_SOCKET_BAD when the request
 *         is not an IPv4 connection socket or the socket cannot be opened
 */
static curl_socket_t opensocket(void *clientp, curlsocktype purpose, struct curl_sockaddr *address) {
  curl_socket_t sockfd = CURL_SOCKET_BAD;

  /* restrict to IPv4 */
  if(purpose == CURLSOCKTYPE_IPCXN && address->family == AF_INET) {
    /* create a tcp socket object */
    boost::asio::ip::tcp::socket *tcp_socket = new boost::asio::ip::tcp::socket(io_service);

    /* open it and get the native handle*/
    boost::system::error_code ec;
    tcp_socket->open(boost::asio::ip::tcp::v4(), ec);

    if(ec) {
      /* An error occurred */
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't open socket [%d][%s]\n", ec.value(), ec.message().c_str());
      /* nothing else references the object, so release it here */
      delete tcp_socket;
    }
    else {
      sockfd = tcp_socket->native_handle();
      /* save it for monitoring */
      socket_map.insert(std::pair<curl_socket_t, boost::asio::ip::tcp::socket *>(sockfd, tcp_socket));
    }
  }
  return sockfd;
}
/* CURLOPT_CLOSESOCKETFUNCTION */
/**
 * Destroys the asio socket object created for a descriptor in opensocket()
 * and stops tracking it. Unknown descriptors are ignored.
 *
 * @return always 0
 */
static int close_socket(void *clientp, curl_socket_t item) {
  auto entry = socket_map.find(item);
  if (entry == socket_map.end()) {
    return 0;
  }

  delete entry->second;
  socket_map.erase(entry);
  return 0;
}
extern "C" {
/**
 * Initialise the glue layer: creates and configures the curl multi handle,
 * prepares the directory used for audio cache files and starts the worker
 * thread that runs the event loop.
 *
 * The cache directory is "<base>/tts-cache-files", where <base> comes from the
 * JAMBONZ_TMP_CACHE_FOLDER environment variable and defaults to "/tmp/". When
 * the directory cannot be created, caching is disabled but loading continues.
 *
 * @return SWITCH_STATUS_SUCCESS, or SWITCH_STATUS_FALSE when the multi handle
 *         cannot be created or the base folder is "/"
 */
switch_status_t verbio_speech_load() {
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "verbio_speech_loading..\n");
  memset(&global, 0, sizeof(GlobalInfo_t));
  global.multi = curl_multi_init();

  if (!global.multi) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "verbio_speech_load curl_multi_init() failed, exiting!\n");
    return SWITCH_STATUS_FALSE;
  }

  curl_multi_setopt(global.multi, CURLMOPT_SOCKETFUNCTION, sock_cb);
  curl_multi_setopt(global.multi, CURLMOPT_SOCKETDATA, &global);
  curl_multi_setopt(global.multi, CURLMOPT_TIMERFUNCTION, multi_timer_cb);
  curl_multi_setopt(global.multi, CURLMOPT_TIMERDATA, &global);
  curl_multi_setopt(global.multi, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);

  /* create temp folder for cache files */
  const char* baseDir = std::getenv("JAMBONZ_TMP_CACHE_FOLDER");
  if (!baseDir || '\0' == *baseDir) {
    /* unset and empty are treated alike */
    baseDir = "/tmp/";
  }
  if (strcmp(baseDir, "/") == 0) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "failed to create folder %s\n", baseDir);
    /* do not leave the multi handle behind when loading is refused */
    curl_multi_cleanup(global.multi);
    global.multi = nullptr;
    return SWITCH_STATUS_FALSE;
  }

  /* the configured folder may or may not end with a separator */
  fullDirPath = baseDir;
  if (fullDirPath.back() != '/') {
    fullDirPath += '/';
  }
  fullDirPath += "tts-cache-files";

  // Create the directory with read, write, and execute permissions for everyone
  mode_t oldMask = umask(0);
  int result = mkdir(fullDirPath.c_str(), S_IRWXU | S_IRWXG | S_IRWXO);
  umask(oldMask);
  if (result != 0) {
    if (errno != EEXIST) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "failed to create folder %s\n", fullDirPath.c_str());
      /* an empty path disables audio caching */
      fullDirPath = "";
    }
    else switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "folder %s already exists\n", fullDirPath.c_str());
  }
  else {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "created folder %s\n", fullDirPath.c_str());
  }

  /* start worker thread that handles transfers*/
  std::thread t(threadFunc) ;
  worker_thread.swap( t ) ;

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "verbio_speech_loaded..\n");

  return SWITCH_STATUS_SUCCESS;
}
/**
 * Shut the glue layer down: stops the event loop, waits for the worker thread
 * to finish and releases the curl multi handle.
 *
 * @return always SWITCH_STATUS_SUCCESS
 */
switch_status_t verbio_speech_unload() {
  /* 1. ask the event loop to return */
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "verbio_speech_unload: stopping io service\n");
  io_service.stop();

  /* 2. wait until the worker thread has left the loop */
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "verbio_speech_unload: wait for worker thread to complete\n");
  const bool must_join = worker_thread.joinable();
  if (must_join) {
    worker_thread.join();
  }

  /* 3. the multi handle is no longer used by anyone */
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "verbio_speech_unload: release curl multi\n");
  curl_multi_cleanup(global.multi);

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "verbio_speech_unload: completed\n");
  return SWITCH_STATUS_SUCCESS;
}
/**
 * Nothing needs to be prepared when a speech handle is opened; the request is
 * only created once text is fed.
 *
 * @return always SWITCH_STATUS_SUCCESS
 */
switch_status_t verbio_speech_open(verbio_t* verbio) {
  (void) verbio;
  return SWITCH_STATUS_SUCCESS;
}
/**
 * Start synthesizing the given text: builds the JSON request, prepares the
 * sample buffer (and the resampler and cache file when needed) and queues the
 * HTTP request on the shared curl multi handle. The audio is delivered later
 * through write_cb().
 *
 * Nothing is acquired before the inputs are validated, and every failure
 * after that point releases what was acquired, so a failed call leaves no
 * open file, connection record, buffer or resampler behind.
 *
 * @param v      per-handle state; needs access_token and voice_name
 * @param text   text to synthesize
 * @param flags  unused
 * @return SWITCH_STATUS_SUCCESS when the request was queued, otherwise
 *         SWITCH_STATUS_FALSE
 */
switch_status_t verbio_speech_feed_tts(verbio_t* v, char* text, switch_speech_flag_t *flags) {
  CURLMcode rc;
  const int MAX_CHARS = 20;
  char tempText[MAX_CHARS + 4]; // +4 for the ellipsis and null terminator

  if (!text) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "verbio_speech_feed_tts: no text provided\n");
    return SWITCH_STATUS_FALSE;
  }
  if (!v->access_token) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "verbio_speech_feed_tts: no access_token provided\n");
    return SWITCH_STATUS_FALSE;
  }
  if (!v->voice_name) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "verbio_speech_feed_tts: no voice provided\n");
    return SWITCH_STATUS_FALSE;
  }

  /* shortened copy of the text, used for logging only */
  if (strlen(text) > (size_t) MAX_CHARS) {
    strncpy(tempText, text, MAX_CHARS);
    strcpy(tempText + MAX_CHARS, "...");
  } else {
    strcpy(tempText, text);
  }

  /* format url*/
  std::string url = "https://us.rest.speechcenter.verbio.com/api/v1/synthesize";

  /* create the JSON body */
  cJSON * jResult = cJSON_CreateObject();
  cJSON_AddStringToObject(jResult, "voice_id", v->voice_name);
  cJSON_AddStringToObject(jResult, "output_sample_rate", "8k");
  cJSON_AddStringToObject(jResult, "output_encoding", "pcm16");
  cJSON_AddStringToObject(jResult, "text", text);
  char *json = cJSON_PrintUnformatted(jResult);
  cJSON_Delete(jResult);
  if (!json) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "verbio_speech_feed_tts: failed to build request body\n");
    return SWITCH_STATUS_FALSE;
  }

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "verbio_speech_feed_tts: [%s] [%s]\n", url.c_str(), tempText);

  ConnInfo_t *conn = pool.malloc() ;
  CURL* easy = createEasyHandle();
  if (!conn || !easy) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "verbio_speech_feed_tts: failed to allocate connection\n");
    if (easy) curl_easy_cleanup(easy);
    if (conn) pool.free(conn);
    free(json);
    return SWITCH_STATUS_FALSE;
  }

  // Always use verbio at rate 8000 for helping cache audio from jambonz.
  SpeexResamplerState *resampler = NULL;
  if (v->rate != 8000) {
    int err = 0;
    resampler = speex_resampler_init(1, 8000, v->rate, SWITCH_RESAMPLE_QUALITY, &err);
    if (0 != err || !resampler) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing resampler: %s.\n", speex_resampler_strerror(err));
      if (resampler) speex_resampler_destroy(resampler);
      curl_easy_cleanup(easy);
      pool.free(conn);
      free(json);
      return SWITCH_STATUS_FALSE;
    }
  }

  /* open cache file; a failure here only disables caching for this request */
  v->file = nullptr; /* never reuse a stream left over from an earlier request */
  if (v->cache_audio && fullDirPath.length() > 0) {
    switch_uuid_t uuid;
    char uuid_str[SWITCH_UUID_FORMATTED_LENGTH + 1];
    char outfile[512] = "";
    int fd;

    switch_uuid_get(&uuid);
    switch_uuid_format(uuid_str, &uuid);

    switch_snprintf(outfile, sizeof(outfile), "%s%s%s.r8", fullDirPath.c_str(), SWITCH_PATH_SEPARATOR, uuid_str);
    free(v->cache_filename);
    v->cache_filename = strdup(outfile);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "writing audio cache file to %s\n", v->cache_filename);

    mode_t oldMask = umask(0);
    fd = open(outfile, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
    umask(oldMask);
    if (fd == -1 ) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error opening cache file %s: %s\n", outfile, strerror(errno));
    }
    else {
      v->file = fdopen(fd, "wb");
      if (!v->file) {
        close(fd);
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error opening cache file %s: %s\n", outfile, strerror(errno));
      }
    }
  }

  /* the record comes from pool.malloc(), so every member is set explicitly */
  conn->verbio = v;
  conn->easy = easy;
  conn->global = &global;
  conn->hdr_list = NULL ;
  conn->error[0] = '\0';
  conn->file = v->file;
  conn->body = json;
  conn->flushed = false;
  conn->has_last_byte = false;
  conn->last_byte = 0;

  /* publish the request state before the transfer can start calling back */
  v->conn = (void *) conn ;
  v->circularBuffer = (void *) new CircularBuffer_t(BUFFER_GROW_SIZE);
  if (resampler) {
    if (v->resampler) speex_resampler_destroy(v->resampler);
    v->resampler = resampler;
  }

  std::ostringstream api_key_stream;
  api_key_stream << "Authorization: Bearer " << v->access_token;

  curl_easy_setopt(easy, CURLOPT_URL, url.c_str());
  curl_easy_setopt(easy, CURLOPT_WRITEFUNCTION, write_cb);
  curl_easy_setopt(easy, CURLOPT_WRITEDATA, conn);
  curl_easy_setopt(easy, CURLOPT_ERRORBUFFER, conn->error);
  curl_easy_setopt(easy, CURLOPT_PRIVATE, conn);
  curl_easy_setopt(easy, CURLOPT_VERBOSE, 0L);
  curl_easy_setopt(easy, CURLOPT_NOPROGRESS, 1L);
  curl_easy_setopt(easy, CURLOPT_HEADERFUNCTION, header_callback);
  curl_easy_setopt(easy, CURLOPT_HEADERDATA, conn);

  /* call this function to get a socket */
  curl_easy_setopt(easy, CURLOPT_OPENSOCKETFUNCTION, opensocket);

  /* call this function to close a socket */
  curl_easy_setopt(easy, CURLOPT_CLOSESOCKETFUNCTION, close_socket);

  conn->hdr_list = curl_slist_append(conn->hdr_list, api_key_stream.str().c_str());
  conn->hdr_list = curl_slist_append(conn->hdr_list, "Content-Type: application/json");
  curl_easy_setopt(easy, CURLOPT_HTTPHEADER, conn->hdr_list);

  curl_easy_setopt(easy, CURLOPT_POSTFIELDS, conn->body);

  // libcurl adding random byte to the response body that creates white noise to audio file
  // https://github.com/curl/curl/issues/10525
  const bool disable_http_2 = switch_true(std::getenv("DISABLE_HTTP2_FOR_TTS_STREAMING"));
  curl_easy_setopt(easy, CURLOPT_HTTP_VERSION, disable_http_2 ? CURL_HTTP_VERSION_1_1 : CURL_HTTP_VERSION_2_0);

  /* start the clock used to measure the time until the first byte of audio;
   * it is set before the handle is added because write_cb() reads it */
  conn->startTime = std::chrono::high_resolution_clock::now();

  rc = curl_multi_add_handle(global.multi, conn->easy);
  if (mcode_test("new_conn: curl_multi_add_handle", rc) != 0) {
    /* the transfer never started, so no callback will clean up for us */
    v->conn = nullptr;
    delete (CircularBuffer_t *) v->circularBuffer;
    v->circularBuffer = nullptr;
    if (v->resampler) {
      speex_resampler_destroy(v->resampler);
      v->resampler = NULL;
    }
    if (v->file) {
      fclose(v->file);
      v->file = nullptr;
    }
    if (v->cache_filename) {
      unlink(v->cache_filename);
      free(v->cache_filename);
      v->cache_filename = nullptr;
    }
    curl_slist_free_all(conn->hdr_list);
    curl_easy_cleanup(easy);
    free(json);
    pool.free(conn);
    return SWITCH_STATUS_FALSE;
  }

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "verbio_speech_feed_tts: called curl_multi_add_handle\n");

  return SWITCH_STATUS_SUCCESS;
}
/**
 * Hand buffered audio to FreeSWITCH.
 *
 * @param v        per-handle state
 * @param data     destination buffer
 * @param datalen  in: capacity of data in bytes; out: number of bytes written
 * @param flags    unused
 * @return SWITCH_STATUS_SUCCESS when data holds audio (or filler while the
 *         first audio is still on its way), SWITCH_STATUS_BREAK when there is
 *         nothing more to read, SWITCH_STATUS_FALSE when the request failed
 */
switch_status_t verbio_speech_read_tts(verbio_t* v, void *data, size_t *datalen, switch_speech_flag_t *flags) {
  std::vector<uint16_t> pcm_data;
  /* most samples the caller's buffer can take */
  const size_t max_out_samples = *datalen / sizeof(int16_t);

  {
    switch_mutex_lock(v->mutex);
    CircularBuffer_t *cBuffer = (CircularBuffer_t *) v->circularBuffer;
    ConnInfo_t *conn = (ConnInfo_t *) v->conn;

    if (v->response_code > 0 && v->response_code != 200) {
      switch_mutex_unlock(v->mutex);
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "verbio_speech_read_tts, returning failure\n") ;
      return SWITCH_STATUS_FALSE;
    }
    if (conn && conn->flushed) {
      switch_mutex_unlock(v->mutex);
      return SWITCH_STATUS_BREAK;
    }
    if (!cBuffer) {
      /* the buffer is gone once the request was flushed (or none was started) */
      switch_mutex_unlock(v->mutex);
      return SWITCH_STATUS_BREAK;
    }
    if (v->rate <= 0) {
      switch_mutex_unlock(v->mutex);
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "verbio_speech_read_tts: invalid sample rate %d\n", v->rate);
      return SWITCH_STATUS_FALSE;
    }
    if (cBuffer->empty()) {
      if (v->draining) {
        switch_mutex_unlock(v->mutex);
        return SWITCH_STATUS_BREAK;
      }
      /* no audio available yet so send silence */
      memset(data, 255, *datalen);
      switch_mutex_unlock(v->mutex);
      return SWITCH_STATUS_SUCCESS;
    }

    /* Samples are buffered at 8000 Hz. Take only as many as still fit into
     * the caller's buffer after conversion to v->rate; the product is formed
     * in 64 bits and rounded down so the result never exceeds the buffer,
     * whatever the rate. */
    const size_t wanted = (size_t)(((uint64_t) max_out_samples * 8000u) / (uint64_t) v->rate);
    const size_t size = std::min(wanted, cBuffer->size());
    pcm_data.insert(pcm_data.end(), cBuffer->begin(), cBuffer->begin() + size);
    cBuffer->erase(cBuffer->begin(), cBuffer->begin() + size);
    switch_mutex_unlock(v->mutex);
  }

  if (pcm_data.empty()) {
    *datalen = 0;
    return SWITCH_STATUS_SUCCESS;
  }

  if (v->resampler) {
    std::vector<int16_t> in(pcm_data.begin(), pcm_data.end());
    /* sized to the caller's buffer, so the resampler cannot produce more than fits */
    std::vector<int16_t> out(max_out_samples);
    spx_uint32_t in_len = (spx_uint32_t) in.size();
    spx_uint32_t out_len = (spx_uint32_t) out.size();

    speex_resampler_process_interleaved_int(v->resampler, in.data(), &in_len, out.data(), &out_len);

    memcpy(data, out.data(), out_len * sizeof(int16_t));
    *datalen = out_len * sizeof(int16_t);
  } else {
    memcpy(data, pcm_data.data(), pcm_data.size() * sizeof(uint16_t));
    *datalen = pcm_data.size() * sizeof(uint16_t);
  }

  return SWITCH_STATUS_SUCCESS;
}
/**
 * Tear down the streaming state of a TTS request and report that playback stopped.
 *
 * In order, this:
 *  - deletes the circular audio buffer and clears v->circularBuffer;
 *  - destroys the speex resampler, if one was created;
 *  - marks the connection (if any) as flushed; when the response code is not 200
 *    the download is treated as interrupted, so the open cache file is closed and
 *    the cache file is unlinked and its name freed;
 *  - if the session named by v->session_id can still be located, fires a
 *    SWITCH_EVENT_PLAYBACK_STOP event carrying the response code and, as
 *    applicable, the cache filename (response 200) or the error message.
 *
 * @param v  TTS state for the request being flushed.
 * @return   always SWITCH_STATUS_SUCCESS.
 */
switch_status_t verbio_speech_flush_tts(verbio_t* v) {
  bool download_complete = v->response_code == 200;

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "verbio_speech_flush_tts, download complete? %s\n", download_complete ? "yes" : "no") ;
  ConnInfo_t *conn = (ConnInfo_t *) v->conn;
  CircularBuffer_t *cBuffer = (CircularBuffer_t *) v->circularBuffer;
  delete cBuffer;
  v->circularBuffer = nullptr ;

  // destroy resampler
  if (v->resampler) {
    speex_resampler_destroy(v->resampler);
    v->resampler = NULL;
  }

  if (conn) {
    conn->flushed = true;
    if (!download_complete) {
      /* a partial download must not be left behind as a cache entry */
      if (conn->file) {
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "closing audio cache file %s because download was interrupted\n", v->cache_filename);
        if (fclose(conn->file) != 0) {
          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "error closing audio cache file\n");
        }
        conn->file = nullptr ;
      }

      if (v->cache_filename) {
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing audio cache file %s because download was interrupted\n", v->cache_filename);
        if (unlink(v->cache_filename) != 0) {
          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "cleanupConn: error removing audio cache file %s: %d:%s\n",
            v->cache_filename, errno, strerror(errno));
        }
        free(v->cache_filename);
        v->cache_filename = nullptr ;
      }
    }
  }

  if (v->session_id) {
    /* switch_core_session_locate() returns the session read-locked */
    switch_core_session_t* session = switch_core_session_locate(v->session_id);
    if (session) {
      switch_channel_t *channel = switch_core_session_get_channel(session);

      if (channel) {
        switch_event_t *event;
        if (switch_event_create(&event, SWITCH_EVENT_PLAYBACK_STOP) == SWITCH_STATUS_SUCCESS) {
          switch_channel_event_set_data(channel, event);
          switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "Playback-File-Type", "tts_stream");
          switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_verbio_response_code", std::to_string(v->response_code).c_str());
          if (v->cache_filename && v->response_code == 200) {
            switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_cache_filename", v->cache_filename);
          }
          if (v->response_code != 200 && v->err_msg) {
            switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "variable_tts_error", v->err_msg);
          }
          switch_event_fire(&event);
        }
        else {
          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "write_cb: failed to create event\n");
        }
      }
      else {
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "write_cb: channel not found\n");
      }

      /* Release the read lock only after the last use of the channel: the
       * channel is obtained from the session, so the session has to stay
       * locked for as long as the channel is being read. */
      switch_core_session_rwunlock(session);
    }
  }
  return SWITCH_STATUS_SUCCESS;
}
/**
 * Close hook for a TTS handle.
 *
 * Releases nothing: the call is logged at INFO level and success is reported.
 *
 * @param w  TTS state; not used by this function.
 * @return   always SWITCH_STATUS_SUCCESS.
 */
switch_status_t verbio_speech_close(verbio_t* w) {
  (void) w; /* intentionally unused */
  const switch_status_t result = SWITCH_STATUS_SUCCESS;

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "verbio_speech_close\n") ;
  return result;
}
}

View File

@@ -0,0 +1,12 @@
#ifndef __VERBIO_GLUE_H__
#define __VERBIO_GLUE_H__

/*
 * Entry points of the Verbio TTS glue layer.
 *
 * NOTE(review): this header includes nothing itself, so verbio_t,
 * switch_status_t and switch_speech_flag_t must already be declared by the
 * including file before this header is pulled in.
 *
 * The parameterless functions are declared with (void) so that C callers get a
 * real prototype (an empty list in C leaves the arguments unchecked); in C++
 * the meaning is unchanged.
 */

/* Load-time hook — presumably called once when the module is loaded; confirm against the implementation. */
switch_status_t verbio_speech_load(void);

/* Prepare `verbio` for use — details live in the implementation; TODO confirm what is initialized. */
switch_status_t verbio_speech_open(verbio_t* verbio);

/* Submit `text` for synthesis — presumably starts the streaming request; confirm against the implementation. */
switch_status_t verbio_speech_feed_tts(verbio_t* verbio, char* text, switch_speech_flag_t *flags);

/*
 * Copy synthesized audio into `data`. `*datalen` is read as the caller's
 * buffer size and, when audio is copied, overwritten with the number of bytes
 * written.
 */
switch_status_t verbio_speech_read_tts(verbio_t* verbio, void *data, size_t *datalen, switch_speech_flag_t *flags);

/*
 * Discard buffered audio and the resampler, mark the connection flushed,
 * remove a partially downloaded cache file, and fire a PLAYBACK_STOP event
 * when the session can still be located.
 */
switch_status_t verbio_speech_flush_tts(verbio_t* verbio);

/* Close hook; the implementation only logs the call and reports success. */
switch_status_t verbio_speech_close(verbio_t* verbio);

/* Unload-time hook — presumably the counterpart of verbio_speech_load(); confirm against the implementation. */
switch_status_t verbio_speech_unload(void);

#endif