From 4ce95e6d275ceda0f404f5c9df6d81596b0b5a1d Mon Sep 17 00:00:00 2001 From: Hoan Luu Huu <110280845+xquanluu@users.noreply.github.com> Date: Fri, 12 Apr 2024 18:24:46 +0700 Subject: [PATCH] support new parameters for google v2 (#31) * support new parameters for google v2 Signed-off-by: Hoan HL * add enable_voice_activity_events Signed-off-by: Hoan HL * changes to start and end timeout --------- Signed-off-by: Hoan HL Co-authored-by: Dave Horton --- mod_google_transcribe/google_glue_v2.cpp | 54 ++++++++++++++++++- mod_google_transcribe/mod_google_transcribe.c | 20 +++++++ mod_google_transcribe/mod_google_transcribe.h | 2 + 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/mod_google_transcribe/google_glue_v2.cpp b/mod_google_transcribe/google_glue_v2.cpp index a5f652b..665ab1f 100644 --- a/mod_google_transcribe/google_glue_v2.cpp +++ b/mod_google_transcribe/google_glue_v2.cpp @@ -18,6 +18,8 @@ using google::cloud::speech::v2::SpeechRecognitionAlternative; using google::cloud::speech::v2::PhraseSet; using google::cloud::speech::v2::PhraseSet_Phrase; using google::cloud::speech::v2::StreamingRecognizeResponse_SpeechEventType_END_OF_SINGLE_UTTERANCE; +using google::cloud::speech::v2::StreamingRecognizeResponse_SpeechEventType_SPEECH_ACTIVITY_BEGIN; +using google::cloud::speech::v2::StreamingRecognizeResponse_SpeechEventType_SPEECH_ACTIVITY_END; using google::cloud::speech::v2::ExplicitDecodingConfig_AudioEncoding_LINEAR16; using google::cloud::speech::v2::RecognitionFeatures_MultiChannelMode_SEPARATE_RECOGNITION_PER_CHANNEL; using google::cloud::speech::v2::SpeechAdaptation_AdaptationPhraseSet; @@ -158,12 +160,54 @@ GStreamer:: diarization_config->set_max_speaker_count(count); } } + if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_TRANSCRIPTION_NORMALIZATION")) { + // parse JSON string + cJSON *json_array = cJSON_Parse(var); + + int array_size = cJSON_GetArraySize(json_array); + + for(int i=0; imutable_transcript_normalization()->add_entries(); + + std::string search_string = cJSON_GetObjectItem(json_item, "search")->valuestring; + std::string replacement_string = cJSON_GetObjectItem(json_item, "replace")->valuestring; + bool case_sensitive = cJSON_GetObjectItem(json_item, "case_sensitive")->valueint != 0; + + entry->set_search(search_string); + entry->set_replace(replacement_string); + entry->set_case_sensitive(case_sensitive); + + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, + "TRANSCRIPTION_NORMALIZATION search %s, replace %s, set_case_sensitive %d\n", search_string.c_str(), replacement_string.c_str(), case_sensitive); + } + // clean json + cJSON_Delete(json_array); + } + } + if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_START_TIMEOUT_MS")) { + auto ms = atoi(var); + streaming_config->mutable_streaming_features()->mutable_voice_activity_timeout()->mutable_speech_start_timeout()->set_nanos(ms * 1000000); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "setting speech_start_timeout to %d milliseconds\n", ms); + } + + if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_END_TIMEOUT_MS")) { + auto ms = atoi(var); + streaming_config->mutable_streaming_features()->mutable_voice_activity_timeout()->mutable_speech_end_timeout()->set_nanos(ms * 1000000); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "setting speech_end_timeout to %d milliseconds\n", ms); + } + + if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_ENABLE_VOICE_ACTIVITY_EVENTS")) { + bool enabled = !strcmp(var, "true") ? 1 : 0; + streaming_config->mutable_streaming_features()->set_enable_voice_activity_events(enabled); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "setting enable_voice_activity_events to %d \n", enabled); } m_request.set_recognizer(recognizer); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "using recognizer: %s\n", recognizer.c_str()); - // This must be set whether a recognizer id is provided orr not, because it cannot be configured as part of a recognizer. + // This must be set whether a recognizer id is provided or not, because it cannot be configured as part of a recognizer. if (interim > 0) { streaming_config->mutable_streaming_features()->set_interim_results(interim > 0); } @@ -277,6 +321,12 @@ static void *SWITCH_THREAD_FUNC grpc_read_thread(switch_thread_t *thread, void * streamer->writesDone(); } } + else if (speech_event_type == StreamingRecognizeResponse_SpeechEventType_SPEECH_ACTIVITY_BEGIN) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: got SPEECH_ACTIVITY_BEGIN\n") ; + } + else if (speech_event_type == StreamingRecognizeResponse_SpeechEventType_SPEECH_ACTIVITY_END) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: got SPEECH_ACTIVITY_END\n") ; + } switch_core_session_rwunlock(session); switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: got %d responses\n", response.results_size()); } @@ -296,7 +346,7 @@ static void *SWITCH_THREAD_FUNC grpc_read_thread(switch_thread_t *thread, void * cb->responseHandler(session, "no_audio", cb->bugname); } } - else { + else if (status.error_code() != 0) { cJSON* json = cJSON_CreateObject(); cJSON_AddStringToObject(json, "type", "error"); cJSON_AddItemToObject(json, "error_code", cJSON_CreateNumber(status.error_code())); diff --git a/mod_google_transcribe/mod_google_transcribe.c b/mod_google_transcribe/mod_google_transcribe.c index 6c561ff..29196ed 100644 --- a/mod_google_transcribe/mod_google_transcribe.c +++ b/mod_google_transcribe/mod_google_transcribe.c @@ -76,6 +76,16 @@ static void responseHandler(switch_core_session_t* session, const char * json, c switch_channel_event_set_data(channel, event); switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google"); } + else if (0 == strcmp("start_of_speech", json)) { + switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_START_OF_SPEECH); + switch_channel_event_set_data(channel, event); + switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google"); + } + else if (0 == strcmp("end_of_speech", json)) { + switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_END_OF_SPEECH); + switch_channel_event_set_data(channel, event); + switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google"); + } else if (0 == strcmp("end_of_transcript", json)) { switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_END_OF_TRANSCRIPT); switch_channel_event_set_data(channel, event); @@ -506,6 +516,14 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_transcribe_load) switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_END_OF_UTTERANCE); return SWITCH_STATUS_TERM; } + if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_START_OF_SPEECH) != SWITCH_STATUS_SUCCESS) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_START_OF_SPEECH); + return SWITCH_STATUS_TERM; + } + if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_END_OF_SPEECH) != SWITCH_STATUS_SUCCESS) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_END_OF_SPEECH); + return SWITCH_STATUS_TERM; + } if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_START_OF_TRANSCRIPT) != SWITCH_STATUS_SUCCESS) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_START_OF_TRANSCRIPT); return SWITCH_STATUS_TERM; @@ -556,6 +574,8 @@ SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_transcribe_shutdown) google_speech_cleanup(); switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS); switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_UTTERANCE); + switch_event_free_subclass(TRANSCRIBE_EVENT_START_OF_SPEECH); + switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_SPEECH); switch_event_free_subclass(TRANSCRIBE_EVENT_START_OF_TRANSCRIPT); switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_TRANSCRIPT); switch_event_free_subclass(TRANSCRIBE_EVENT_NO_AUDIO_DETECTED); diff --git a/mod_google_transcribe/mod_google_transcribe.h b/mod_google_transcribe/mod_google_transcribe.h index 745d1f7..280aa36 100644 --- a/mod_google_transcribe/mod_google_transcribe.h +++ b/mod_google_transcribe/mod_google_transcribe.h @@ -11,6 +11,8 @@ #define MY_BUG_NAME "google_transcribe" #define TRANSCRIBE_EVENT_RESULTS "google_transcribe::transcription" #define TRANSCRIBE_EVENT_END_OF_UTTERANCE "google_transcribe::end_of_utterance" +#define TRANSCRIBE_EVENT_START_OF_SPEECH "google_transcribe::start_of_speech" +#define TRANSCRIBE_EVENT_END_OF_SPEECH "google_transcribe::end_of_speech" #define TRANSCRIBE_EVENT_START_OF_TRANSCRIPT "google_transcribe::start_of_transcript" #define TRANSCRIBE_EVENT_END_OF_TRANSCRIPT "google_transcribe::end_of_transcript" #define TRANSCRIBE_EVENT_NO_AUDIO_DETECTED "google_transcribe::no_audio_detected"