#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mod_jambonz_transcribe.h" #include "simple_buffer.h" #include "parser.hpp" #include "audio_pipe.hpp" #define RTP_PACKETIZATION_PERIOD 20 #define FRAME_SIZE_8000 320 /*which means each 20ms frame as 320 bytes at 8 khz (1 channel only)*/ namespace { static bool hasDefaultCredentials = false; static const char* defaultApiKey = nullptr; static const char *requestedBufferSecs = std::getenv("MOD_AUDIO_FORK_BUFFER_SECS"); static int nAudioBufferSecs = std::max(1, std::min(requestedBufferSecs ? ::atoi(requestedBufferSecs) : 2, 5)); static const char *requestedNumServiceThreads = std::getenv("MOD_AUDIO_FORK_SERVICE_THREADS"); static unsigned int nServiceThreads = std::max(1, std::min(requestedNumServiceThreads ? ::atoi(requestedNumServiceThreads) : 1, 5)); static unsigned int idxCallCount = 0; static uint32_t playCount = 0; static int parse_ws_uri(switch_channel_t *channel, const char* szServerUri, char* host, char *path, unsigned int* pPort, int* pSslFlags) { int i = 0, offset; char server[MAX_WS_URL_LEN + MAX_PATH_LEN]; char *saveptr; int flags = LCCSCF_USE_SSL; if (switch_true(switch_channel_get_variable(channel, "MOD_AUDIO_FORK_ALLOW_SELFSIGNED"))) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "parse_ws_uri - allowing self-signed certs\n"); flags |= LCCSCF_ALLOW_SELFSIGNED; } if (switch_true(switch_channel_get_variable(channel, "MOD_AUDIO_FORK_SKIP_SERVER_CERT_HOSTNAME_CHECK"))) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "parse_ws_uri - skipping hostname check\n"); flags |= LCCSCF_SKIP_SERVER_CERT_HOSTNAME_CHECK; } if (switch_true(switch_channel_get_variable(channel, "MOD_AUDIO_FORK_ALLOW_EXPIRED"))) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "parse_ws_uri - allowing expired certs\n"); flags |= LCCSCF_ALLOW_EXPIRED; } // get the scheme strncpy(server, szServerUri, MAX_WS_URL_LEN + MAX_PATH_LEN); if (0 == strncmp(server, "https://", 8) || 0 == strncmp(server, "HTTPS://", 8)) { *pSslFlags = flags; offset = 8; *pPort = 443; } else if (0 == strncmp(server, "wss://", 6) || 0 == strncmp(server, "WSS://", 6)) { *pSslFlags = flags; offset = 6; *pPort = 443; } else if (0 == strncmp(server, "http://", 7) || 0 == strncmp(server, "HTTP://", 7)) { offset = 7; *pSslFlags = 0; *pPort = 80; } else if (0 == strncmp(server, "ws://", 5) || 0 == strncmp(server, "WS://", 5)) { offset = 5; *pSslFlags = 0; *pPort = 80; } else { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "parse_ws_uri - error parsing uri %s: invalid scheme\n", szServerUri);; return 0; } std::string strHost(server + offset); std::regex re("^(.+?):?(\\d+)?(/.*)?$"); std::smatch matches; if(std::regex_search(strHost, matches, re)) { /* for (int i = 0; i < matches.length(); i++) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "parse_ws_uri - %d: %s\n", i, matches[i].str().c_str()); } */ strncpy(host, matches[1].str().c_str(), MAX_WS_URL_LEN); if (matches[2].str().length() > 0) { *pPort = atoi(matches[2].str().c_str()); } if (matches[3].str().length() > 0) { strncpy(path, matches[3].str().c_str(), MAX_PATH_LEN); } else { strcpy(path, "/"); } } else { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "parse_ws_uri - invalid format %s\n", strHost.c_str()); return 0; } switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "parse_ws_uri - host %s, path %s\n", host, path); return 1; } static void reaper(private_t *tech_pvt) { std::shared_ptr pAp; pAp.reset((jambonz::AudioPipe *)tech_pvt->pAudioPipe); tech_pvt->pAudioPipe = nullptr; std::thread t([pAp, tech_pvt]{ pAp->finish(); pAp->waitForClose(); switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "%s (%u) got remote close\n", tech_pvt->sessionId, tech_pvt->id); }); t.detach(); } static void destroy_tech_pvt(private_t *tech_pvt) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s (%u) destroy_tech_pvt\n", tech_pvt->sessionId, tech_pvt->id); if (tech_pvt) { if (tech_pvt->mutex) { switch_mutex_destroy(tech_pvt->mutex); tech_pvt->mutex = nullptr; } if (tech_pvt->resampler) { speex_resampler_destroy(tech_pvt->resampler); tech_pvt->resampler = NULL; } } } static void sendStartMessage(switch_channel_t *channel, private_t* tech_pvt) { auto *pAudioPipe = static_cast(tech_pvt->pAudioPipe); const char* var; bool hasOptions = false; if (nullptr == pAudioPipe) return; cJSON* json = cJSON_CreateObject(); cJSON_AddStringToObject(json, "type", "start"); cJSON_AddStringToObject(json, "language", tech_pvt->language); cJSON_AddStringToObject(json, "format", "raw"); cJSON_AddStringToObject(json, "encoding", "LINEAR16"); cJSON_AddBoolToObject(json, "interimResults", tech_pvt->interim); cJSON_AddNumberToObject(json, "sampleRateHz", tech_pvt->sampling); if (var = switch_channel_get_variable(channel, "JAMBONZ_STT_OPTIONS")) { cJSON* jOptions = cJSON_Parse(var); if (jOptions) { hasOptions = true; cJSON_AddItemToObject(json, "options", jOptions); } } if (!hasOptions) { cJSON* jOptions = cJSON_CreateObject(); cJSON_AddItemToObject(json, "options", jOptions); } char* jsonString = cJSON_PrintUnformatted(json); /* send */ pAudioPipe->bufferForSending(jsonString); /* cleanup */ free(jsonString); cJSON_Delete(json); } static void eventCallback(const char* sessionId, const char* bugname, jambonz::AudioPipe::NotifyEvent_t event, const char* message, bool finished) { switch_core_session_t* session = switch_core_session_locate(sessionId); if (session) { switch_channel_t *channel = switch_core_session_get_channel(session); switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname); if (bug) { private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug); if (tech_pvt) { switch (event) { case jambonz::AudioPipe::CONNECT_SUCCESS: switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "connection successful\n"); tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_CONNECT_SUCCESS, NULL, tech_pvt->bugname, finished); sendStartMessage(channel, tech_pvt); break; case jambonz::AudioPipe::CONNECT_FAIL: { // first thing: we can no longer access the AudioPipe std::stringstream json; json << "{\"reason\":\"" << message << "\"}"; tech_pvt->pAudioPipe = nullptr; tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_CONNECT_FAIL, (char *) json.str().c_str(), tech_pvt->bugname, finished); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "connection failed: %s\n", message); } break; case jambonz::AudioPipe::CONNECTION_DROPPED: // first thing: we can no longer access the AudioPipe tech_pvt->pAudioPipe = nullptr; tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_DISCONNECT, NULL, tech_pvt->bugname, finished); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection dropped from far end\n"); break; case jambonz::AudioPipe::CONNECTION_CLOSED_GRACEFULLY: // first thing: we can no longer access the AudioPipe tech_pvt->pAudioPipe = nullptr; switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection closed gracefully\n"); break; case jambonz::AudioPipe::MESSAGE: { cJSON* jMessage = cJSON_Parse(message); if (!jMessage) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "discarding incoming message (not json): %s\n", message); break; } const char* type = cJSON_GetStringValue(cJSON_GetObjectItem(jMessage, "type")); if (type && 0 == strcmp(type, "error")) { tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_ERROR, message, tech_pvt->bugname, finished); } else if (type && 0 == strcmp(type, "transcription")) { tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_RESULTS, message, tech_pvt->bugname, finished); } else { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "discarding incoming message: %s\n", message); } cJSON_Delete(jMessage); } break; default: switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "got unexpected msg from jambonz %d:%s\n", event, message); break; } } } switch_core_session_rwunlock(session); } } switch_status_t fork_data_init(private_t *tech_pvt, switch_core_session_t *session, int sampling, int desiredSampling, int channels, char *lang, int interim, char* bugname, responseHandler_t responseHandler) { int err; switch_codec_implementation_t read_impl; switch_channel_t *channel = switch_core_session_get_channel(session); char host[MAX_WS_URL_LEN], path[MAX_PATH_LEN]; unsigned int port; int sslFlags; const char* url = switch_channel_get_variable(channel, "JAMBONZ_STT_URL"); if (!parse_ws_uri(channel, url, &host[0], &path[0], &port, &sslFlags)) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "invalid websocket uri: %s\n", url); return SWITCH_STATUS_FALSE; } switch_core_session_get_read_impl(session, &read_impl); memset(tech_pvt, 0, sizeof(private_t)); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "host: %s, port: %d, path: %s\n", host, port, path); strncpy(tech_pvt->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID); strncpy(tech_pvt->host, host, MAX_WS_URL_LEN); strncpy(tech_pvt->path, path, MAX_PATH_LEN); tech_pvt->port = port; tech_pvt->sslFlags = sslFlags; strncpy(tech_pvt->language, lang, MAX_LANG_LEN); tech_pvt->interim = interim; tech_pvt->sampling = desiredSampling; tech_pvt->responseHandler = responseHandler; tech_pvt->channels = channels; tech_pvt->id = ++idxCallCount; tech_pvt->buffer_overrun_notified = 0; strncpy(tech_pvt->bugname, bugname, MAX_BUG_LEN); size_t buflen = LWS_PRE + (FRAME_SIZE_8000 * desiredSampling / 8000 * channels * 1000 / RTP_PACKETIZATION_PERIOD * nAudioBufferSecs); const char* apiKey = switch_channel_get_variable(channel, "JAMBONZ_STT_API_KEY"); if (!apiKey && defaultApiKey) apiKey = defaultApiKey; else if (!apiKey) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "no jambonz api key provided\n"); return SWITCH_STATUS_FALSE; } jambonz::AudioPipe* ap = new jambonz::AudioPipe(tech_pvt->sessionId, bugname, tech_pvt->host, tech_pvt->port, tech_pvt->path, tech_pvt->sslFlags, buflen, read_impl.decoded_bytes_per_packet, apiKey, eventCallback); if (!ap) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error allocating AudioPipe\n"); return SWITCH_STATUS_FALSE; } tech_pvt->pAudioPipe = static_cast(ap); switch_mutex_init(&tech_pvt->mutex, SWITCH_MUTEX_NESTED, switch_core_session_get_pool(session)); if (desiredSampling != sampling) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) resampling from %u to %u\n", tech_pvt->id, sampling, desiredSampling); tech_pvt->resampler = speex_resampler_init(channels, sampling, desiredSampling, SWITCH_RESAMPLE_QUALITY, &err); if (0 != err) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing resampler: %s.\n", speex_resampler_strerror(err)); return SWITCH_STATUS_FALSE; } } else { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) no resampling needed for this call\n", tech_pvt->id); } switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) fork_data_init\n", tech_pvt->id); return SWITCH_STATUS_SUCCESS; } void lws_logger(int level, const char *line) { switch_log_level_t llevel = SWITCH_LOG_DEBUG; switch (level) { case LLL_ERR: llevel = SWITCH_LOG_ERROR; break; case LLL_WARN: llevel = SWITCH_LOG_WARNING; break; case LLL_NOTICE: llevel = SWITCH_LOG_NOTICE; break; case LLL_INFO: llevel = SWITCH_LOG_INFO; break; break; } switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "%s\n", line); } } extern "C" { switch_status_t jb_transcribe_init() { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_jambonz_transcribe: audio buffer (in secs): %d secs\n", nAudioBufferSecs); int logs = LLL_ERR | LLL_WARN | LLL_NOTICE ; // | LLL_INFO | LLL_PARSER | LLL_HEADER | LLL_EXT | LLL_CLIENT | LLL_LATENCY | LLL_DEBUG ; jambonz::AudioPipe::initialize(logs, lws_logger); switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "AudioPipe::initialize completed\n"); const char* apiKey = std::getenv("JAMBONZ_STT_API_KEY"); if (NULL == apiKey) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "\"JAMBONZ_STT_API_KEY\" env var not set; authentication will expect channel variables of same names to be set\n"); } else { hasDefaultCredentials = true; defaultApiKey = apiKey; } return SWITCH_STATUS_SUCCESS; } switch_status_t jb_transcribe_cleanup() { bool cleanup = false; cleanup = jambonz::AudioPipe::deinitialize(); if (cleanup == true) { return SWITCH_STATUS_SUCCESS; } return SWITCH_STATUS_FALSE; } switch_status_t jb_transcribe_session_init(switch_core_session_t *session, responseHandler_t responseHandler, uint32_t samples_per_second, int desiredSampling, uint32_t channels, char* lang, int interim, char* bugname, void **ppUserData) { int err; // allocate per-session data structure private_t* tech_pvt = (private_t *) switch_core_session_alloc(session, sizeof(private_t)); if (!tech_pvt) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "error allocating memory!\n"); return SWITCH_STATUS_FALSE; } if (SWITCH_STATUS_SUCCESS != fork_data_init(tech_pvt, session, samples_per_second, desiredSampling, channels, lang, interim, bugname, responseHandler)) { destroy_tech_pvt(tech_pvt); return SWITCH_STATUS_FALSE; } *ppUserData = tech_pvt; jambonz::AudioPipe *pAudioPipe = static_cast(tech_pvt->pAudioPipe); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connecting now\n"); pAudioPipe->connect(); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection in progress\n"); return SWITCH_STATUS_SUCCESS; } switch_status_t jb_transcribe_session_stop(switch_core_session_t *session,int channelIsClosing, char* bugname) { switch_channel_t *channel = switch_core_session_get_channel(session); switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname); if (!bug) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "jb_transcribe_session_stop: no bug %s - websocket conection already closed\n", bugname); return SWITCH_STATUS_FALSE; } private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug); if (!tech_pvt) return SWITCH_STATUS_FALSE; // close connection and get final responses switch_mutex_lock(tech_pvt->mutex); { // get the bug again, now that we are under lock switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname); if (bug) { switch_channel_set_private(channel, bugname, NULL); uint32_t id = tech_pvt->id; switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) jb_transcribe_session_stop\n", id); if (!channelIsClosing) { switch_core_media_bug_remove(session, &bug); } jambonz::AudioPipe *pAudioPipe = static_cast(tech_pvt->pAudioPipe); if (pAudioPipe) reaper(tech_pvt); switch_mutex_unlock(tech_pvt->mutex); destroy_tech_pvt(tech_pvt); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) jb_transcribe_session_stop, bug removed\n", id); return SWITCH_STATUS_SUCCESS; } else { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "jb_transcribe_session_stop: race condition, previous close completed\n"); } switch_mutex_unlock(tech_pvt->mutex); } return SWITCH_STATUS_FALSE; } switch_bool_t jb_transcribe_frame(switch_core_session_t *session, switch_media_bug_t *bug) { private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug); size_t inuse = 0; bool dirty = false; char *p = (char *) "{\"msg\": \"buffer overrun\"}"; if (!tech_pvt) return SWITCH_TRUE; if (switch_mutex_trylock(tech_pvt->mutex) == SWITCH_STATUS_SUCCESS) { if (!tech_pvt->pAudioPipe) { switch_mutex_unlock(tech_pvt->mutex); return SWITCH_TRUE; } jambonz::AudioPipe *pAudioPipe = static_cast(tech_pvt->pAudioPipe); if (pAudioPipe->getLwsState() != jambonz::AudioPipe::LWS_CLIENT_CONNECTED) { switch_mutex_unlock(tech_pvt->mutex); return SWITCH_TRUE; } pAudioPipe->lockAudioBuffer(); size_t available = pAudioPipe->binarySpaceAvailable(); if (NULL == tech_pvt->resampler) { switch_frame_t frame = { 0 }; frame.data = pAudioPipe->binaryWritePtr(); frame.buflen = available; while (true) { // check if buffer would be overwritten; dump packets if so if (available < pAudioPipe->binaryMinSpace()) { if (!tech_pvt->buffer_overrun_notified) { tech_pvt->buffer_overrun_notified = 1; tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_BUFFER_OVERRUN, NULL, tech_pvt->bugname, 0); } switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%u) dropping packets!\n", tech_pvt->id); pAudioPipe->binaryWritePtrResetToZero(); frame.data = pAudioPipe->binaryWritePtr(); frame.buflen = available = pAudioPipe->binarySpaceAvailable(); } switch_status_t rv = switch_core_media_bug_read(bug, &frame, SWITCH_TRUE); if (rv != SWITCH_STATUS_SUCCESS) break; if (frame.datalen) { pAudioPipe->binaryWritePtrAdd(frame.datalen); frame.buflen = available = pAudioPipe->binarySpaceAvailable(); frame.data = pAudioPipe->binaryWritePtr(); dirty = true; } } } else { uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE]; switch_frame_t frame = { 0 }; frame.data = data; frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE; while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS) { if (frame.datalen) { spx_uint32_t out_len = available >> 1; // space for samples which are 2 bytes spx_uint32_t in_len = frame.samples; speex_resampler_process_interleaved_int(tech_pvt->resampler, (const spx_int16_t *) frame.data, (spx_uint32_t *) &in_len, (spx_int16_t *) ((char *) pAudioPipe->binaryWritePtr()), &out_len); if (out_len > 0) { // bytes written = num samples * 2 * num channels size_t bytes_written = out_len << tech_pvt->channels; pAudioPipe->binaryWritePtrAdd(bytes_written); available = pAudioPipe->binarySpaceAvailable(); dirty = true; } if (available < pAudioPipe->binaryMinSpace()) { if (!tech_pvt->buffer_overrun_notified) { tech_pvt->buffer_overrun_notified = 1; switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%u) dropping packets!\n", tech_pvt->id); tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_BUFFER_OVERRUN, NULL, tech_pvt->bugname, 0); } break; } } } } pAudioPipe->unlockAudioBuffer(); switch_mutex_unlock(tech_pvt->mutex); } return SWITCH_TRUE; } }