support new parameters for google v2 (#31)

* support new parameters for google v2
  Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>
* add enable_voice_activity_events
  Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>
* changes to start and end timeout
---------
Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>
Co-authored-by: Dave Horton <daveh@beachdognet.com>
2026-01-25 02:08:27 +00:00 · 2024-04-12 18:24:46 +07:00
parent f0d15c57a2
commit 4ce95e6d27
3 changed files with 74 additions and 2 deletions
--- a/mod_google_transcribe/google_glue_v2.cpp
+++ b/mod_google_transcribe/google_glue_v2.cpp
@@ -18,6 +18,8 @@ using google::cloud::speech::v2::SpeechRecognitionAlternative;
 using google::cloud::speech::v2::PhraseSet;
 using google::cloud::speech::v2::PhraseSet_Phrase;
 using google::cloud::speech::v2::StreamingRecognizeResponse_SpeechEventType_END_OF_SINGLE_UTTERANCE;
 using google::cloud::speech::v2::StreamingRecognizeResponse_SpeechEventType_SPEECH_ACTIVITY_BEGIN;
 using google::cloud::speech::v2::StreamingRecognizeResponse_SpeechEventType_SPEECH_ACTIVITY_END;
 using google::cloud::speech::v2::ExplicitDecodingConfig_AudioEncoding_LINEAR16;
 using google::cloud::speech::v2::RecognitionFeatures_MultiChannelMode_SEPARATE_RECOGNITION_PER_CHANNEL;
 using google::cloud::speech::v2::SpeechAdaptation_AdaptationPhraseSet;
@@ -158,12 +160,54 @@ GStreamer<StreamingRecognizeRequest, StreamingRecognizeResponse, Speech::Stub>::
                diarization_config->set_max_speaker_count(count);
            }
        }
        if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_TRANSCRIPTION_NORMALIZATION")) {
          // The channel variable is parsed as a JSON array of objects, each read as
          //   {"search": "<text>", "replace": "<text>", "case_sensitive": <bool/number>}
          // The value is supplied externally, so every lookup is validated before it
          // is dereferenced: cJSON_Parse and cJSON_GetObjectItem return NULL on
          // malformed input or a missing key.
          cJSON *json_array = cJSON_Parse(var);
          if (!json_array) {
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_WARNING,
              "TRANSCRIPTION_NORMALIZATION ignored, could not parse JSON: %s\n", var);
          }
          const int array_size = json_array ? cJSON_GetArraySize(json_array) : 0;
          for (int i = 0; i < array_size; i++) {
            cJSON *json_item = cJSON_GetArrayItem(json_array, i);
            cJSON *search_item = cJSON_GetObjectItem(json_item, "search");
            cJSON *replace_item = cJSON_GetObjectItem(json_item, "replace");
            cJSON *case_item = cJSON_GetObjectItem(json_item, "case_sensitive");

            // "search" and "replace" must both be present and carry a string value;
            // skip the entry rather than add a half-populated normalization rule.
            if (!search_item || !search_item->valuestring || !replace_item || !replace_item->valuestring) {
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_WARNING,
                "TRANSCRIPTION_NORMALIZATION entry %d skipped, needs string \"search\" and \"replace\"\n", i);
              continue;
            }

            std::string search_string = search_item->valuestring;
            std::string replacement_string = replace_item->valuestring;
            // "case_sensitive" is optional; an absent key is treated as false.
            bool case_sensitive = case_item && case_item->valueint != 0;

            // Only create the entry once the input is known to be usable.
            auto entry = config->mutable_transcript_normalization()->add_entries();
            entry->set_search(search_string);
            entry->set_replace(replacement_string);
            entry->set_case_sensitive(case_sensitive);
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG,
              "TRANSCRIPTION_NORMALIZATION search %s, replace %s, set_case_sensitive %d\n", search_string.c_str(), replacement_string.c_str(), case_sensitive);
          }
          // Release the parsed document (cJSON_Delete accepts NULL).
          cJSON_Delete(json_array);
        }
    }
    // Voice-activity timeouts are supplied in milliseconds but stored in a
    // protobuf Duration, whose nanos field must stay within +/-999,999,999.
    // Writing ms * 1000000 straight into nanos is out of range for any value
    // of 1000 ms or more and overflows a 32-bit int above 2147 ms, so the
    // value is split into whole seconds plus the sub-second remainder.
    if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_START_TIMEOUT_MS")) {
      auto ms = atoi(var);
      auto start_timeout = streaming_config->mutable_streaming_features()->mutable_voice_activity_timeout()->mutable_speech_start_timeout();
      start_timeout->set_seconds(ms / 1000);
      start_timeout->set_nanos((ms % 1000) * 1000000);
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "setting speech_start_timeout to %d milliseconds\n", ms);
    }
    if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_END_TIMEOUT_MS")) {
      auto ms = atoi(var);
      auto end_timeout = streaming_config->mutable_streaming_features()->mutable_voice_activity_timeout()->mutable_speech_end_timeout();
      end_timeout->set_seconds(ms / 1000);
      end_timeout->set_nanos((ms % 1000) * 1000000);
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "setting speech_end_timeout to %d milliseconds\n", ms);
    }
    if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_ENABLE_VOICE_ACTIVITY_EVENTS")) {
      bool enabled = !strcmp(var, "true") ? 1 : 0;
      streaming_config->mutable_streaming_features()->set_enable_voice_activity_events(enabled);
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "setting enable_voice_activity_events to %d \n", enabled);
    }
    m_request.set_recognizer(recognizer);
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "using recognizer: %s\n", recognizer.c_str());
-    // This must be set whether a recognizer id is provided orr not, because it cannot be configured as part of a recognizer.
+    // This must be set whether a recognizer id is provided or not, because it cannot be configured as part of a recognizer.
    if (interim > 0) {
        streaming_config->mutable_streaming_features()->set_interim_results(interim > 0);
    }
@@ -277,6 +321,12 @@ static void *SWITCH_THREAD_FUNC grpc_read_thread(switch_thread_t *thread, void *
        streamer->writesDone();
      }
    }
    else if (speech_event_type == StreamingRecognizeResponse_SpeechEventType_SPEECH_ACTIVITY_BEGIN) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: got SPEECH_ACTIVITY_BEGIN\n") ;
    }
    else if (speech_event_type == StreamingRecognizeResponse_SpeechEventType_SPEECH_ACTIVITY_END) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: got SPEECH_ACTIVITY_END\n") ;
    }
    switch_core_session_rwunlock(session);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: got %d responses\n", response.results_size());
  }
@@ -296,7 +346,7 @@ static void *SWITCH_THREAD_FUNC grpc_read_thread(switch_thread_t *thread, void *
          cb->responseHandler(session, "no_audio", cb->bugname);
        }
      }
-      else {
+      else if (status.error_code() != 0) {
        cJSON* json = cJSON_CreateObject();
        cJSON_AddStringToObject(json, "type", "error");
        cJSON_AddItemToObject(json, "error_code", cJSON_CreateNumber(status.error_code()));
--- a/mod_google_transcribe/mod_google_transcribe.c
+++ b/mod_google_transcribe/mod_google_transcribe.c
@@ -76,6 +76,16 @@ static void responseHandler(switch_core_session_t* session, const char * json, c
 		switch_channel_event_set_data(channel, event);
 		switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
 	}
 	else if (0 == strcmp("start_of_speech", json)) {
 		switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_START_OF_SPEECH);
 		switch_channel_event_set_data(channel, event);
 		switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
 	}
 	else if (0 == strcmp("end_of_speech", json)) {
 		switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_END_OF_SPEECH);
 		switch_channel_event_set_data(channel, event);
 		switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
 	}
 	else if (0 == strcmp("end_of_transcript", json)) {
 		switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_END_OF_TRANSCRIPT);
 		switch_channel_event_set_data(channel, event);
@@ -506,6 +516,14 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_transcribe_load)
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_END_OF_UTTERANCE);
 		return SWITCH_STATUS_TERM;
 	}
 	if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_START_OF_SPEECH) != SWITCH_STATUS_SUCCESS) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_START_OF_SPEECH);
 		return SWITCH_STATUS_TERM;
 	}
 	if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_END_OF_SPEECH) != SWITCH_STATUS_SUCCESS) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_END_OF_SPEECH);
 		return SWITCH_STATUS_TERM;
 	}
 	if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_START_OF_TRANSCRIPT) != SWITCH_STATUS_SUCCESS) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_START_OF_TRANSCRIPT);
 		return SWITCH_STATUS_TERM;
@@ -556,6 +574,8 @@ SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_transcribe_shutdown)
 	google_speech_cleanup();
 	switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS);
 	switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_UTTERANCE);
 	switch_event_free_subclass(TRANSCRIBE_EVENT_START_OF_SPEECH);
 	switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_SPEECH);
 	switch_event_free_subclass(TRANSCRIBE_EVENT_START_OF_TRANSCRIPT);
 	switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_TRANSCRIPT);
 	switch_event_free_subclass(TRANSCRIBE_EVENT_NO_AUDIO_DETECTED);
--- a/mod_google_transcribe/mod_google_transcribe.h
+++ b/mod_google_transcribe/mod_google_transcribe.h
@@ -11,6 +11,8 @@
 #define MY_BUG_NAME "google_transcribe"
 #define TRANSCRIBE_EVENT_RESULTS "google_transcribe::transcription"
 #define TRANSCRIBE_EVENT_END_OF_UTTERANCE "google_transcribe::end_of_utterance"
 #define TRANSCRIBE_EVENT_START_OF_SPEECH "google_transcribe::start_of_speech"
 #define TRANSCRIBE_EVENT_END_OF_SPEECH "google_transcribe::end_of_speech"
 #define TRANSCRIBE_EVENT_START_OF_TRANSCRIPT "google_transcribe::start_of_transcript"
 #define TRANSCRIBE_EVENT_END_OF_TRANSCRIPT "google_transcribe::end_of_transcript"
 #define TRANSCRIBE_EVENT_NO_AUDIO_DETECTED "google_transcribe::no_audio_detected"