#ifndef __GENERIC_GOOGLE_GLUE_H__ #define __GENERIC_GOOGLE_GLUE_H__ #include template switch_bool_t google_speech_frame(switch_media_bug_t *bug, void* user_data) { switch_core_session_t *session = switch_core_media_bug_get_session(bug); struct cap_cb *cb = (struct cap_cb *) user_data; if (cb->streamer && (!cb->wants_single_utterance || !cb->got_end_of_utterance)) { Streamer* streamer = (Streamer *) cb->streamer; uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE]; switch_frame_t frame = {}; frame.data = data; frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE; if (switch_mutex_trylock(cb->mutex) == SWITCH_STATUS_SUCCESS) { while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS && !switch_test_flag((&frame), SFF_CNG)) { if (frame.datalen) { if (cb->vad && !streamer->isConnected()) { switch_vad_state_t state = switch_vad_process(cb->vad, (int16_t*) frame.data, frame.samples); if (state == SWITCH_VAD_STATE_START_TALKING) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "detected speech, connect to google speech now\n"); streamer->connect(); cb->responseHandler(session, "vad_detected", cb->bugname); } } if (cb->resampler) { spx_int16_t out[SWITCH_RECOMMENDED_BUFFER_SIZE]; spx_uint32_t out_len = SWITCH_RECOMMENDED_BUFFER_SIZE; spx_uint32_t in_len = frame.samples; size_t written; speex_resampler_process_interleaved_int(cb->resampler, (const spx_int16_t *) frame.data, (spx_uint32_t *) &in_len, &out[0], &out_len); streamer->write( &out[0], sizeof(spx_int16_t) * out_len); } else { streamer->write( frame.data, sizeof(spx_int16_t) * frame.samples); } } } switch_mutex_unlock(cb->mutex); } } return SWITCH_TRUE; } template switch_status_t google_speech_session_init(switch_core_session_t *session, responseHandler_t responseHandler, switch_thread_start_t func, uint32_t to_rate, uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char *bugname, int single_utterance, int separate_recognition, int max_alternatives, int profanity_filter, int word_time_offset, int punctuation, const char* model, int enhanced, const char* hints, char* play_file, void **ppUserData) { switch_channel_t *channel = switch_core_session_get_channel(session); auto read_codec = switch_core_session_get_read_codec(session); uint32_t sampleRate = read_codec->implementation->actual_samples_per_second; struct cap_cb *cb; int err; cb =(struct cap_cb *) switch_core_session_alloc(session, sizeof(*cb)); strncpy(cb->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID); strncpy(cb->bugname, bugname, MAX_BUG_LEN); cb->got_end_of_utterance = 0; cb->wants_single_utterance = single_utterance; if (play_file != NULL){ cb->play_file = 1; } switch_mutex_init(&cb->mutex, SWITCH_MUTEX_NESTED, switch_core_session_get_pool(session)); if (sampleRate != to_rate) { cb->resampler = speex_resampler_init(channels, sampleRate, to_rate, SWITCH_RESAMPLE_QUALITY, &err); if (0 != err) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing resampler: %s.\n", switch_channel_get_name(channel), speex_resampler_strerror(err)); return SWITCH_STATUS_FALSE; } } else { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: no resampling needed for this call\n", switch_channel_get_name(channel)); } cb->responseHandler = responseHandler; // allocate vad if we are delaying connecting to the recognizer until we detect speech if (switch_channel_var_true(channel, "START_RECOGNIZING_ON_VAD")) { cb->vad = switch_vad_init(sampleRate, channels); if (cb->vad) { const char* var; int mode = 2; int silence_ms = 150; int voice_ms = 250; int debug = 0; if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_MODE")) { mode = atoi(var); } if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_SILENCE_MS")) { silence_ms = atoi(var); } if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_VOICE_MS")) { voice_ms = atoi(var); } if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_VOICE_MS")) { voice_ms = atoi(var); } switch_vad_set_mode(cb->vad, mode); switch_vad_set_param(cb->vad, "silence_ms", silence_ms); switch_vad_set_param(cb->vad, "voice_ms", voice_ms); switch_vad_set_param(cb->vad, "debug", debug); } } Streamer *streamer = NULL; try { streamer = new Streamer(session, channels, lang, interim, to_rate, sampleRate, single_utterance, separate_recognition, max_alternatives, profanity_filter, word_time_offset, punctuation, model, enhanced, hints); cb->streamer = streamer; } catch (std::exception& e) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing gstreamer: %s.\n", switch_channel_get_name(channel), e.what()); return SWITCH_STATUS_FALSE; } if (!cb->vad) streamer->connect(); // create the read thread switch_threadattr_t *thd_attr = NULL; switch_memory_pool_t *pool = switch_core_session_get_pool(session); switch_threadattr_create(&thd_attr, pool); switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE); switch_thread_create(&cb->thread, thd_attr, func, cb, pool); *ppUserData = cb; return SWITCH_STATUS_SUCCESS; } template switch_status_t google_speech_session_cleanup(switch_core_session_t *session, int channelIsClosing, switch_media_bug_t *bug) { switch_channel_t *channel = switch_core_session_get_channel(session); if (bug) { struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug); switch_mutex_lock(cb->mutex); if (!switch_channel_get_private(channel, cb->bugname)) { // race condition switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached (race).\n", switch_channel_get_name(channel)); switch_mutex_unlock(cb->mutex); return SWITCH_STATUS_FALSE; } switch_channel_set_private(channel, cb->bugname, NULL); // stop playback if available if (cb->play_file == 1){ if (switch_channel_test_flag(channel, CF_BROADCAST)) { switch_channel_stop_broadcast(channel); } else { switch_channel_set_flag_value(channel, CF_BREAK, 1); } } // close connection and get final responses Streamer* streamer = (Streamer *) cb->streamer; if (streamer) { streamer->writesDone(); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: GStreamer (%p) waiting for read thread to complete\n", (void*)streamer); switch_status_t st; switch_thread_join(&st, cb->thread); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: GStreamer (%p) read thread completed\n", (void*)streamer); delete streamer; cb->streamer = NULL; } if (cb->resampler) { speex_resampler_destroy(cb->resampler); } if (cb->vad) { switch_vad_destroy(&cb->vad); cb->vad = nullptr; } if (!channelIsClosing) { switch_core_media_bug_remove(session, &bug); } switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: Closed stream\n"); switch_mutex_unlock(cb->mutex); return SWITCH_STATUS_SUCCESS; } switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel)); return SWITCH_STATUS_FALSE; } template void google_speech_configure_grammar_hints(switch_core_session_t *session, switch_channel_t *channel, const char* hints, PhraseSet* phrase_set) { float boost = -1; // get boost setting for the phrase set in its entirety if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_HINTS_BOOST"))) { boost = (float) atof(switch_channel_get_variable(channel, "GOOGLE_SPEECH_HINTS_BOOST")); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "boost value: %f\n", boost); phrase_set->set_boost(boost); } // hints are either a simple comma-separated list of phrases, or a json array of objects // containing a phrase and a boost value auto *jHint = cJSON_Parse((char *) hints); if (jHint) { int i = 0; cJSON *jPhrase = NULL; cJSON_ArrayForEach(jPhrase, jHint) { cJSON *jItem = cJSON_GetObjectItem(jPhrase, "phrase"); if (jItem) { auto* phrase = phrase_set->add_phrases(); phrase->set_value(cJSON_GetStringValue(jItem)); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "phrase: %s\n", phrase->value().c_str()); if (cJSON_GetObjectItem(jPhrase, "boost")) { phrase->set_boost((float) cJSON_GetObjectItem(jPhrase, "boost")->valuedouble); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "boost value: %f\n", phrase->boost()); } i++; } } cJSON_Delete(jHint); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "added %d hints\n", i); } else { char *phrases[500] = { 0 }; int argc = switch_separate_string((char *) hints, ',', phrases, 500); for (int i = 0; i < argc; i++) { auto* phrase = phrase_set->add_phrases(); phrase->set_value(phrases[i]); } switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "added %d hints\n", argc); } } #endif