freeswitch-modules/mod_google_transcribe/generic_google_glue.h
Andrew Golledge 4e57f73c7e Add Support for Google Cloud Speech-To-Text V2 library in mod_google_transcribe (#23)
* Introduce Google Speech-To-Text V2 library

Signed-off-by: Andrew Golledge <andreas.golledge@gmail.com>
2024-03-23 12:02:48 -04:00

#ifndef __GENERIC_GOOGLE_GLUE_H__
#define __GENERIC_GOOGLE_GLUE_H__
#include <switch_json.h>
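
// Template "glue" shared by the Google Cloud Speech-to-Text streaming transcribers in
// mod_google_transcribe; each routine is parameterized on the concrete Streamer (or PhraseSet)
// type supplied by the caller.

// Media bug read callback: drains audio frames from the bug and feeds them to the streamer.
// Audio is resampled to the recognizer rate whenever the codec rate differs; when a VAD has
// been allocated, the connection to Google is deferred until speech is detected.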
template<typename Streamer>
switch_bool_t google_speech_frame(switch_media_bug_t *bug, void* user_data) {
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);
  struct cap_cb *cb = (struct cap_cb *) user_data;
  if (cb->streamer && (!cb->wants_single_utterance || !cb->got_end_of_utterance)) {
    Streamer* streamer = (Streamer *) cb->streamer;
    uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
    switch_frame_t frame = {};
    frame.data = data;
    frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;

    if (switch_mutex_trylock(cb->mutex) == SWITCH_STATUS_SUCCESS) {
      while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS && !switch_test_flag((&frame), SFF_CNG)) {
        if (frame.datalen) {
          // if connection is deferred until speech, run the frame through the VAD first
          if (cb->vad && !streamer->isConnected()) {
            switch_vad_state_t state = switch_vad_process(cb->vad, (int16_t*) frame.data, frame.samples);
            if (state == SWITCH_VAD_STATE_START_TALKING) {
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "detected speech, connect to google speech now\n");
              streamer->connect();
              cb->responseHandler(session, "vad_detected", cb->bugname);
            }
          }
          if (cb->resampler) {
            spx_int16_t out[SWITCH_RECOMMENDED_BUFFER_SIZE];
            spx_uint32_t out_len = SWITCH_RECOMMENDED_BUFFER_SIZE;
            spx_uint32_t in_len = frame.samples;
            speex_resampler_process_interleaved_int(cb->resampler,
              (const spx_int16_t *) frame.data,
              (spx_uint32_t *) &in_len,
              &out[0],
              &out_len);
            streamer->write(&out[0], sizeof(spx_int16_t) * out_len);
          }
          else {
            streamer->write(frame.data, sizeof(spx_int16_t) * frame.samples);
          }
        }
      }
      switch_mutex_unlock(cb->mutex);
    }
  }
  return SWITCH_TRUE;
}
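
// Allocate and initialize the per-session capture state (struct cap_cb): copy the session uuid and
// bug name, set up an optional speex resampler, optionally create a VAD (when the channel variable
// START_RECOGNIZING_ON_VAD is set), construct the Streamer, and start the thread (func) that reads
// responses from the recognizer. On success, *ppUserData points at the new cap_cb.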
template<typename Streamer>
switch_status_t google_speech_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
    switch_thread_start_t func, uint32_t to_rate, uint32_t samples_per_second, uint32_t channels, char* lang,
    int interim, char *bugname, int single_utterance, int separate_recognition, int max_alternatives,
    int profanity_filter, int word_time_offset, int punctuation, const char* model, int enhanced,
    const char* hints, char* play_file, void **ppUserData) {

  switch_channel_t *channel = switch_core_session_get_channel(session);
  auto read_codec = switch_core_session_get_read_codec(session);
  uint32_t sampleRate = read_codec->implementation->actual_samples_per_second;
  struct cap_cb *cb;
  int err;

  cb = (struct cap_cb *) switch_core_session_alloc(session, sizeof(*cb));
  strncpy(cb->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
  strncpy(cb->bugname, bugname, MAX_BUG_LEN);
  cb->got_end_of_utterance = 0;
  cb->wants_single_utterance = single_utterance;
  if (play_file != NULL) {
    cb->play_file = 1;
  }
  switch_mutex_init(&cb->mutex, SWITCH_MUTEX_NESTED, switch_core_session_get_pool(session));

  // resample to the rate requested by the recognizer if the codec rate differs
  if (sampleRate != to_rate) {
    cb->resampler = speex_resampler_init(channels, sampleRate, to_rate, SWITCH_RESAMPLE_QUALITY, &err);
    if (0 != err) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing resampler: %s.\n",
        switch_channel_get_name(channel), speex_resampler_strerror(err));
      return SWITCH_STATUS_FALSE;
    }
  } else {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: no resampling needed for this call\n", switch_channel_get_name(channel));
  }
  cb->responseHandler = responseHandler;

  // allocate vad if we are delaying connecting to the recognizer until we detect speech
  if (switch_channel_var_true(channel, "START_RECOGNIZING_ON_VAD")) {
    cb->vad = switch_vad_init(sampleRate, channels);
    if (cb->vad) {
      const char* var;
      int mode = 2;
      int silence_ms = 150;
      int voice_ms = 250;
      int debug = 0;
      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_MODE"))) {
        mode = atoi(var);
      }
      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_SILENCE_MS"))) {
        silence_ms = atoi(var);
      }
      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_VOICE_MS"))) {
        voice_ms = atoi(var);
      }
      switch_vad_set_mode(cb->vad, mode);
      switch_vad_set_param(cb->vad, "silence_ms", silence_ms);
      switch_vad_set_param(cb->vad, "voice_ms", voice_ms);
      switch_vad_set_param(cb->vad, "debug", debug);
    }
  }
  Streamer *streamer = NULL;
  try {
    streamer = new Streamer(session, channels, lang, interim, to_rate, sampleRate, single_utterance, separate_recognition, max_alternatives,
      profanity_filter, word_time_offset, punctuation, model, enhanced, hints);
    cb->streamer = streamer;
  } catch (std::exception& e) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing streamer: %s.\n",
      switch_channel_get_name(channel), e.what());
    return SWITCH_STATUS_FALSE;
  }

  // connect immediately unless we are waiting for the VAD to trigger the connection
  if (!cb->vad) streamer->connect();

  // create the read thread that pulls responses from the recognizer
  switch_threadattr_t *thd_attr = NULL;
  switch_memory_pool_t *pool = switch_core_session_get_pool(session);
  switch_threadattr_create(&thd_attr, pool);
  switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE);
  switch_thread_create(&cb->thread, thd_attr, func, cb, pool);

  *ppUserData = cb;
  return SWITCH_STATUS_SUCCESS;
}
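
// Tear down a transcription session: detach the bug's private data, stop any prompt playback,
// signal the streamer that no more audio will be sent (writesDone), wait for the read thread to
// drain final responses, then free the streamer, resampler, and VAD. The media bug itself is
// removed unless the channel is already closing.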
template<typename Streamer>
switch_status_t google_speech_session_cleanup(switch_core_session_t *session, int channelIsClosing, switch_media_bug_t *bug) {
  switch_channel_t *channel = switch_core_session_get_channel(session);

  if (bug) {
    struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);
    switch_mutex_lock(cb->mutex);

    if (!switch_channel_get_private(channel, cb->bugname)) {
      // race condition
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached (race).\n", switch_channel_get_name(channel));
      switch_mutex_unlock(cb->mutex);
      return SWITCH_STATUS_FALSE;
    }
    switch_channel_set_private(channel, cb->bugname, NULL);

    // stop playback if available
    if (cb->play_file == 1) {
      if (switch_channel_test_flag(channel, CF_BROADCAST)) {
        switch_channel_stop_broadcast(channel);
      } else {
        switch_channel_set_flag_value(channel, CF_BREAK, 1);
      }
    }

    // close connection and get final responses
    Streamer* streamer = (Streamer *) cb->streamer;
    if (streamer) {
      streamer->writesDone();
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: streamer (%p) waiting for read thread to complete\n", (void*)streamer);
      switch_status_t st;
      switch_thread_join(&st, cb->thread);
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: streamer (%p) read thread completed\n", (void*)streamer);
      delete streamer;
      cb->streamer = NULL;
    }

    if (cb->resampler) {
      speex_resampler_destroy(cb->resampler);
    }
    if (cb->vad) {
      switch_vad_destroy(&cb->vad);
      cb->vad = nullptr;
    }
    if (!channelIsClosing) {
      switch_core_media_bug_remove(session, &bug);
    }

    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: Closed stream\n");
    switch_mutex_unlock(cb->mutex);
    return SWITCH_STATUS_SUCCESS;
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
  return SWITCH_STATUS_FALSE;
}
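
// Populate the recognizer's PhraseSet from the caller-supplied hints. Two input forms are
// accepted (the values below are only illustrative examples):
//   JSON array:  [{"phrase": "account balance", "boost": 10}, {"phrase": "agent"}]
//   plain list:  "sales,support,billing"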
template<typename PhraseSet>
void google_speech_configure_grammar_hints(switch_core_session_t *session, switch_channel_t *channel, const char* hints, PhraseSet* phrase_set) {
  float boost = -1;

  // get boost setting for the phrase set in its entirety
  const char* hints_boost = switch_channel_get_variable(channel, "GOOGLE_SPEECH_HINTS_BOOST");
  if (hints_boost) {
    boost = (float) atof(hints_boost);
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "boost value: %f\n", boost);
    phrase_set->set_boost(boost);
  }

  // hints are either a simple comma-separated list of phrases, or a json array of objects
  // containing a phrase and a boost value
  auto *jHint = cJSON_Parse((char *) hints);
  if (jHint) {
    int i = 0;
    cJSON *jPhrase = NULL;
    cJSON_ArrayForEach(jPhrase, jHint) {
      cJSON *jItem = cJSON_GetObjectItem(jPhrase, "phrase");
      if (jItem && cJSON_GetStringValue(jItem)) {
        auto* phrase = phrase_set->add_phrases();
        phrase->set_value(cJSON_GetStringValue(jItem));
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "phrase: %s\n", phrase->value().c_str());
        if (cJSON_GetObjectItem(jPhrase, "boost")) {
          phrase->set_boost((float) cJSON_GetObjectItem(jPhrase, "boost")->valuedouble);
          switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "boost value: %f\n", phrase->boost());
        }
        i++;
      }
    }
    cJSON_Delete(jHint);
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "added %d hints\n", i);
  }
  else {
    char *phrases[500] = { 0 };
    int argc = switch_separate_string((char *) hints, ',', phrases, 500);
    for (int i = 0; i < argc; i++) {
      auto* phrase = phrase_set->add_phrases();
      phrase->set_value(phrases[i]);
    }
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "added %d hints\n", argc);
  }
}
#endif /* __GENERIC_GOOGLE_GLUE_H__ */
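
For orientation: the Streamer type plugged into these templates only needs the small surface exercised above, namely a constructor taking the arguments passed in google_speech_session_init plus connect, isConnected, write, and writesDone. A minimal sketch of that assumed interface follows; the class and member names here are illustrative, not the module's actual v1/v2 implementation classes.

// Hypothetical outline of a Streamer that satisfies the template glue above.
// Only the constructor and the four methods invoked by the glue are required.
class ExampleStreamer {
public:
  ExampleStreamer(switch_core_session_t *session, uint32_t channels, char *lang, int interim,
                  uint32_t to_rate, uint32_t sample_rate, int single_utterance, int separate_recognition,
                  int max_alternatives, int profanity_filter, int word_time_offset, int punctuation,
                  const char *model, int enhanced, const char *hints);

  void connect();                      // open the streaming recognize call
  bool isConnected();                  // has connect() been issued yet?
  void write(void *data, size_t len);  // push a chunk of linear16 audio
  void writesDone();                   // signal that no more audio will be sent

private:
  bool m_connected = false;            // illustrative member
};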