mirror of
https://github.com/jambonz/freeswitch-modules.git
synced 2025-12-19 08:27:44 +00:00
eliminate support for multiple lws threads as part of fixing valgrind errors
Signed-off-by: Dave Horton <daveh@beachdognet.com>
This commit is contained in:
4
README.md
Normal file
4
README.md
Normal file
@@ -0,0 +1,4 @@
|
||||
# freeswitch-modules
|
||||
|
||||
A collection of Freeswitch modules intended for use with [jambonz](https://jambonz.org)
|
||||
|
||||
8
mod_assemblyai_transcribe/LICENSE
Normal file
8
mod_assemblyai_transcribe/LICENSE
Normal file
@@ -0,0 +1,8 @@
|
||||
Copyright 2023, Drachtio Communications Services, LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
9
mod_assemblyai_transcribe/Makefile.am
Normal file
9
mod_assemblyai_transcribe/Makefile.am
Normal file
@@ -0,0 +1,9 @@
|
||||
# Automake fragment for mod_assemblyai_transcribe (FreeSWITCH module).
# Links against libfreeswitch and libwebsockets; C++ sources need c++11.
include $(top_srcdir)/build/modmake.rulesam
MODNAME=mod_assemblyai_transcribe

mod_LTLIBRARIES = mod_assemblyai_transcribe.la
mod_assemblyai_transcribe_la_SOURCES = mod_assemblyai_transcribe.c aai_transcribe_glue.cpp audio_pipe.cpp parser.cpp
mod_assemblyai_transcribe_la_CFLAGS = $(AM_CFLAGS)
mod_assemblyai_transcribe_la_CXXFLAGS = $(AM_CXXFLAGS) -std=c++11
mod_assemblyai_transcribe_la_LIBADD = $(switch_builddir)/libfreeswitch.la
mod_assemblyai_transcribe_la_LDFLAGS = -avoid-version -module -no-undefined -shared `pkg-config --libs libwebsockets`
|
||||
96
mod_assemblyai_transcribe/README.md
Normal file
96
mod_assemblyai_transcribe/README.md
Normal file
@@ -0,0 +1,96 @@
|
||||
# mod_deepgram_transcribe

> NOTE: this README was copied from mod_deepgram_transcribe and does not yet describe mod_assemblyai_transcribe; the commands and channel variables below refer to the Deepgram module.
|
||||
|
||||
A Freeswitch module that generates real-time transcriptions on a Freeswitch channel by using Deepgram's streaming transcription API
|
||||
|
||||
## API
|
||||
|
||||
### Commands
|
||||
The freeswitch module exposes the following API commands:
|
||||
|
||||
```
|
||||
uuid_deepgram_transcribe <uuid> start <lang-code> [interim]
|
||||
```
|
||||
Attaches media bug to channel and performs streaming recognize request.
|
||||
- `uuid` - unique identifier of Freeswitch channel
|
||||
- `lang-code` - a valid Deepgram [language code](https://developers.deepgram.com/documentation/features/language/) that is supported for streaming transcription
|
||||
- `interim` - If the 'interim' keyword is present then both interim and final transcription results will be returned; otherwise only final transcriptions will be returned
|
||||
|
||||
```
|
||||
uuid_deepgram_transcribe <uuid> stop
|
||||
```
|
||||
Stop transcription on the channel.
|
||||
|
||||
### Channel Variables
|
||||
|
||||
| variable | Description |
|
||||
| --- | ----------- |
|
||||
| DEEPGRAM_API_KEY | Deepgram API key used to authenticate |
|
||||
| DEEPGRAM_SPEECH_TIER | https://developers.deepgram.com/documentation/features/tier/ |
|
||||
| DEEPGRAM_SPEECH_CUSTOM_MODEL | custom model id |
|
||||
| DEEPGRAM_SPEECH_MODEL | https://developers.deepgram.com/documentation/features/model/ |
|
||||
| DEEPGRAM_SPEECH_MODEL_VERSION | https://developers.deepgram.com/documentation/features/version/ |
|
||||
| DEEPGRAM_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION | https://developers.deepgram.com/documentation/features/punctuate/ |
|
||||
| DEEPGRAM_SPEECH_PROFANITY_FILTER | https://developers.deepgram.com/documentation/features/profanity-filter/ |
|
||||
| DEEPGRAM_SPEECH_REDACT | https://developers.deepgram.com/documentation/features/redact/ |
|
||||
| DEEPGRAM_SPEECH_DIARIZE | https://developers.deepgram.com/documentation/features/diarize/ |
|
||||
| DEEPGRAM_SPEECH_DIARIZE_VERSION | https://developers.deepgram.com/documentation/features/diarize/ |
|
||||
| DEEPGRAM_SPEECH_NER | https://developers.deepgram.com/documentation/features/named-entity-recognition/ |
|
||||
| DEEPGRAM_SPEECH_ALTERNATIVES | number of alternative hypotheses to return (default: 1) |
|
||||
| DEEPGRAM_SPEECH_NUMERALS | https://developers.deepgram.com/documentation/features/numerals/ |
|
||||
| DEEPGRAM_SPEECH_SEARCH | https://developers.deepgram.com/documentation/features/search/ |
|
||||
| DEEPGRAM_SPEECH_KEYWORDS | https://developers.deepgram.com/documentation/features/keywords/ |
|
||||
| DEEPGRAM_SPEECH_REPLACE | https://developers.deepgram.com/documentation/features/replace/ |
|
||||
| DEEPGRAM_SPEECH_TAG | https://developers.deepgram.com/documentation/features/tag/ |
|
||||
| DEEPGRAM_SPEECH_ENDPOINTING | https://developers.deepgram.com/documentation/features/endpointing/ |
|
||||
| DEEPGRAM_SPEECH_VAD_TURNOFF | https://developers.deepgram.com/documentation/features/voice-activity-detection/ |
|
||||
|
||||
|
||||
### Events
|
||||
`deepgram_transcribe::transcription` - returns an interim or final transcription. The event contains a JSON body describing the transcription result:
|
||||
```js
|
||||
{
|
||||
"channel_index": [0, 1],
|
||||
"duration": 4.59,
|
||||
"start": 0.0,
|
||||
"is_final": true,
|
||||
"speech_final": true,
|
||||
"channel": {
|
||||
"alternatives": [{
|
||||
"transcript": "hello hello hello",
|
||||
"confidence": 0.98583984,
|
||||
"words": [{
|
||||
"word": "hello",
|
||||
"start": 3.0865219,
|
||||
"end": 3.206,
|
||||
"confidence": 0.99902344
|
||||
}, {
|
||||
"word": "hello",
|
||||
"start": 3.5644348,
|
||||
"end": 3.644087,
|
||||
"confidence": 0.9741211
|
||||
}, {
|
||||
"word": "hello",
|
||||
"start": 4.042348,
|
||||
"end": 4.3609567,
|
||||
"confidence": 0.98583984
|
||||
}]
|
||||
}]
|
||||
},
|
||||
"metadata": {
|
||||
"request_id": "37835678-5d3b-4c77-910e-f8914c882cec",
|
||||
"model_info": {
|
||||
"name": "conversationalai",
|
||||
"version": "2021-11-10.1",
|
||||
"tier": "base"
|
||||
},
|
||||
"model_uuid": "6b28e919-8427-4f32-9847-492e2efd7daf"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Usage
|
||||
When using [drachtio-fsrmf](https://www.npmjs.com/package/drachtio-fsmrf), you can access this API command via the api method on the 'endpoint' object.
|
||||
```js
|
||||
ep.api('uuid_deepgram_transcribe', `${ep.uuid} start en-US interim`);
|
||||
```
|
||||
|
||||
432
mod_assemblyai_transcribe/aai_transcribe_glue.cpp
Normal file
432
mod_assemblyai_transcribe/aai_transcribe_glue.cpp
Normal file
@@ -0,0 +1,432 @@
|
||||
#include <switch.h>
#include <switch_json.h>

#include <string.h>

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iomanip>
#include <list>
#include <mutex>
#include <regex>
#include <sstream>
#include <string>
#include <thread>

#include "mod_assemblyai_transcribe.h"
#include "simple_buffer.h"
#include "parser.hpp"
#include "audio_pipe.hpp"
||||
|
||||
#define RTP_PACKETIZATION_PERIOD 20
|
||||
#define FRAME_SIZE_8000 320 /*which means each 20ms frame as 320 bytes at 8 khz (1 channel only)*/
|
||||
|
||||
namespace {
  // Module-wide default credentials, populated from the environment in aai_transcribe_init().
  static bool hasDefaultCredentials = false;
  static const char* defaultApiKey = nullptr;
  // Outgoing audio buffer length in seconds, clamped to [1, 5]; default 2.
  static const char *requestedBufferSecs = std::getenv("MOD_AUDIO_FORK_BUFFER_SECS");
  static int nAudioBufferSecs = std::max(1, std::min(requestedBufferSecs ? ::atoi(requestedBufferSecs) : 2, 5));
  // NOTE(review): read but never used in this file -- multiple lws service
  // threads were removed (see commit message); kept for env compatibility.
  static const char *requestedNumServiceThreads = std::getenv("MOD_AUDIO_FORK_SERVICE_THREADS");
  static unsigned int idxCallCount = 0;  // monotonically increasing per-call index, used in logs
  static uint32_t playCount = 0;         // NOTE(review): unused in the visible code
|
||||
|
||||
static void reaper(private_t *tech_pvt) {
|
||||
std::shared_ptr<assemblyai::AudioPipe> pAp;
|
||||
pAp.reset((assemblyai::AudioPipe *)tech_pvt->pAudioPipe);
|
||||
tech_pvt->pAudioPipe = nullptr;
|
||||
|
||||
std::thread t([pAp, tech_pvt]{
|
||||
pAp->finish();
|
||||
pAp->waitForClose();
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "%s (%u) got remote close\n", tech_pvt->sessionId, tech_pvt->id);
|
||||
});
|
||||
t.detach();
|
||||
}
|
||||
|
||||
static void destroy_tech_pvt(private_t *tech_pvt) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s (%u) destroy_tech_pvt\n", tech_pvt->sessionId, tech_pvt->id);
|
||||
if (tech_pvt) {
|
||||
if (tech_pvt->pAudioPipe) {
|
||||
assemblyai::AudioPipe* p = (assemblyai::AudioPipe *) tech_pvt->pAudioPipe;
|
||||
delete p;
|
||||
tech_pvt->pAudioPipe = nullptr;
|
||||
}
|
||||
if (tech_pvt->resampler) {
|
||||
speex_resampler_destroy(tech_pvt->resampler);
|
||||
tech_pvt->resampler = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
if (tech_pvt->vad) {
|
||||
switch_vad_destroy(&tech_pvt->vad);
|
||||
tech_pvt->vad = nullptr;
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
std::string encodeURIComponent(std::string decoded)
|
||||
{
|
||||
|
||||
std::ostringstream oss;
|
||||
std::regex r("[!'\\(\\)*-.0-9A-Za-z_~:]");
|
||||
|
||||
for (char &c : decoded)
|
||||
{
|
||||
if (std::regex_match((std::string){c}, r))
|
||||
{
|
||||
oss << c;
|
||||
}
|
||||
else
|
||||
{
|
||||
oss << "%" << std::uppercase << std::hex << (0xff & c);
|
||||
}
|
||||
}
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
std::string& constructPath(switch_core_session_t* session, std::string& path,
|
||||
int sampleRate, int channels, const char* language, int interim) {
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
const char *var ;
|
||||
std::ostringstream oss;
|
||||
|
||||
oss << "v2/realtime/ws?sample_rate=8000";
|
||||
|
||||
const char* hints = switch_channel_get_variable(channel, "ASSEMBLYAI_WORD_BOOST");
|
||||
if (hints) {
|
||||
oss << "&word_boost=";
|
||||
oss << encodeURIComponent(hints);
|
||||
}
|
||||
path = oss.str();
|
||||
return path;
|
||||
}
|
||||
|
||||
  // Dispatch callback invoked by the AudioPipe (from the libwebsockets
  // service thread) for connection state changes and incoming text frames.
  // Locates the session by uuid, finds the media bug's private data, and
  // forwards each event to the module's responseHandler. If the session or
  // bug has already gone away the event is silently dropped.
  static void eventCallback(const char* sessionId, assemblyai::AudioPipe::NotifyEvent_t event, const char* message, bool finished) {
    switch_core_session_t* session = switch_core_session_locate(sessionId);
    if (session) {
      switch_channel_t *channel = switch_core_session_get_channel(session);
      switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, MY_BUG_NAME);
      if (bug) {
        private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
        if (tech_pvt) {
          switch (event) {
            case assemblyai::AudioPipe::CONNECT_SUCCESS:
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "connection successful\n");
              tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_CONNECT_SUCCESS, NULL, tech_pvt->bugname, finished);
              break;
            case assemblyai::AudioPipe::CONNECT_FAIL:
            {
              // first thing: we can no longer access the AudioPipe
              std::stringstream json;
              json << "{\"reason\":\"" << message << "\"}";
              tech_pvt->pAudioPipe = nullptr;
              tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_CONNECT_FAIL, (char *) json.str().c_str(), tech_pvt->bugname, finished);
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "connection failed: %s\n", message);
            }
            break;
            case assemblyai::AudioPipe::CONNECTION_DROPPED:
              // first thing: we can no longer access the AudioPipe
              tech_pvt->pAudioPipe = nullptr;
              tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_DISCONNECT, NULL, tech_pvt->bugname, finished);
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection dropped from far end\n");
              break;
            case assemblyai::AudioPipe::CONNECTION_CLOSED_GRACEFULLY:
              // first thing: we can no longer access the AudioPipe
              tech_pvt->pAudioPipe = nullptr;
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection closed gracefully\n");
              break;
            case assemblyai::AudioPipe::MESSAGE:
            {
              // Classify the json payload by substring match on message_type.
              // NOTE(review): the error check is NOT chained with the checks
              // below, so an error payload could in principle also match one
              // of the later branches -- confirm whether that is intended.
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "assemblyai message: %s\n", message);
              if (strstr(message, "\"error\":")) {
                tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_ERROR, message, tech_pvt->bugname, finished);
              }
              if (strstr(message, "\"message_type\":\"SessionBegins\"")) {
                tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_SESSION_BEGINS, message, tech_pvt->bugname, finished);
              }
              if (strstr(message, "\"message_type\":\"SessionTerminated\"")) {
                tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_SESSION_TERMINATED, message, tech_pvt->bugname, finished);
              }
              else if (strstr(message, "\"message_type\":\"FinalTranscript\"") || strstr(message, "\"message_type\":\"PartialTranscript\"")) {
                /* discard empty partials */
                if (strstr(message, "\"message_type\":\"PartialTranscript\"") &&
                  strstr(message, "\"text\":\"\"") && strstr(message, "\"confidence\":0")) {
                  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "discarding empty partial transcript from assemblyai\n");
                  break;
                }
                tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_RESULTS, message, tech_pvt->bugname, finished);
              }
            }
            break;

            default:
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "got unexpected msg from assemblyai %d:%s\n", event, message);
              break;
          }
        }
      }
      // release the read lock taken by switch_core_session_locate
      switch_core_session_rwunlock(session);
    }
  }
|
||||
  // One-time initialization of the per-session private data: zeroes
  // tech_pvt, creates the AudioPipe (websocket client) toward AssemblyAI,
  // and sets up a speex resampler when the channel rate differs from the
  // desired (8 kHz) rate. Returns SWITCH_STATUS_FALSE if no api key is
  // available or the resampler cannot be created; the caller cleans up
  // via destroy_tech_pvt.
  switch_status_t fork_data_init(private_t *tech_pvt, switch_core_session_t *session,
    int sampling, int desiredSampling, int channels, char *lang, int interim,
    char* bugname, responseHandler_t responseHandler) {

    int err;
    switch_codec_implementation_t read_impl;
    switch_channel_t *channel = switch_core_session_get_channel(session);

    switch_core_session_get_read_impl(session, &read_impl);

    memset(tech_pvt, 0, sizeof(private_t));

    std::string path;
    constructPath(session, path, desiredSampling, channels, lang, interim);
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "path: %s\n", path.c_str());

    // NOTE(review): strncpy does not NUL-terminate when the source reaches
    // the limit; safe only because tech_pvt was just zeroed and the sources
    // are expected to be shorter than MAX_* -- confirm the constants leave
    // room for the terminator.
    strncpy(tech_pvt->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
    strncpy(tech_pvt->host, "api.assemblyai.com", MAX_WS_URL_LEN);
    tech_pvt->port = 443;
    strncpy(tech_pvt->path, path.c_str(), MAX_PATH_LEN);
    tech_pvt->sampling = desiredSampling;
    tech_pvt->responseHandler = responseHandler;
    tech_pvt->channels = channels;
    tech_pvt->id = ++idxCallCount;          // per-call index, used in log lines
    tech_pvt->buffer_overrun_notified = 0;

    // websocket send buffer: LWS_PRE bytes of libwebsockets headroom plus
    // nAudioBufferSecs seconds of audio at the target rate/channel count
    size_t buflen = LWS_PRE + (FRAME_SIZE_8000 * desiredSampling / 8000 * channels * 1000 / RTP_PACKETIZATION_PERIOD * nAudioBufferSecs);

    // per-channel api key wins over the module default from the environment
    const char* apiKey = switch_channel_get_variable(channel, "ASSEMBLYAI_API_KEY");
    if (!apiKey && defaultApiKey) apiKey = defaultApiKey;
    else if (!apiKey) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "no assemblyai api key provided\n");
      return SWITCH_STATUS_FALSE;
    }

    assemblyai::AudioPipe* ap = new assemblyai::AudioPipe(tech_pvt->sessionId, tech_pvt->host, tech_pvt->port, tech_pvt->path,
      buflen, read_impl.decoded_bytes_per_packet, apiKey, eventCallback);
    // NOTE(review): operator new throws on failure, so this branch is dead code
    if (!ap) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error allocating AudioPipe\n");
      return SWITCH_STATUS_FALSE;
    }

    tech_pvt->pAudioPipe = static_cast<void *>(ap);

    // mutex lives in the session pool, so it is released with the session
    switch_mutex_init(&tech_pvt->mutex, SWITCH_MUTEX_NESTED, switch_core_session_get_pool(session));

    if (desiredSampling != sampling) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) resampling from %u to %u\n", tech_pvt->id, sampling, desiredSampling);
      tech_pvt->resampler = speex_resampler_init(channels, sampling, desiredSampling, SWITCH_RESAMPLE_QUALITY, &err);
      if (0 != err) {
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing resampler: %s.\n", speex_resampler_strerror(err));
        return SWITCH_STATUS_FALSE;
      }
    }
    else {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) no resampling needed for this call\n", tech_pvt->id);
    }

    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) fork_data_init\n", tech_pvt->id);

    return SWITCH_STATUS_SUCCESS;
  }
|
||||
|
||||
void lws_logger(int level, const char *line) {
|
||||
switch_log_level_t llevel = SWITCH_LOG_DEBUG;
|
||||
|
||||
switch (level) {
|
||||
case LLL_ERR: llevel = SWITCH_LOG_ERROR; break;
|
||||
case LLL_WARN: llevel = SWITCH_LOG_WARNING; break;
|
||||
case LLL_NOTICE: llevel = SWITCH_LOG_NOTICE; break;
|
||||
case LLL_INFO: llevel = SWITCH_LOG_INFO; break;
|
||||
break;
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "%s\n", line);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern "C" {
|
||||
switch_status_t aai_transcribe_init() {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_assemblyai_transcribe: audio buffer (in secs): %d secs\n", nAudioBufferSecs);
|
||||
|
||||
int logs = LLL_ERR | LLL_WARN | LLL_NOTICE || LLL_INFO | LLL_PARSER | LLL_HEADER | LLL_EXT | LLL_CLIENT | LLL_LATENCY | LLL_DEBUG ;
|
||||
|
||||
assemblyai::AudioPipe::initialize(logs, lws_logger);
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "AudioPipe::initialize completed\n");
|
||||
|
||||
const char* apiKey = std::getenv("DEEPGRAM_API_KEY");
|
||||
if (NULL == apiKey) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE,
|
||||
"\"DEEPGRAM_API_KEY\" env var not set; authentication will expect channel variables of same names to be set\n");
|
||||
}
|
||||
else {
|
||||
hasDefaultCredentials = true;
|
||||
defaultApiKey = apiKey;
|
||||
}
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
switch_status_t aai_transcribe_cleanup() {
|
||||
bool cleanup = false;
|
||||
cleanup = assemblyai::AudioPipe::deinitialize();
|
||||
if (cleanup == true) {
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
return SWITCH_STATUS_FALSE;
|
||||
}
|
||||
|
||||
switch_status_t aai_transcribe_session_init(switch_core_session_t *session,
|
||||
responseHandler_t responseHandler, uint32_t samples_per_second, uint32_t channels,
|
||||
char* lang, int interim, char* bugname, void **ppUserData)
|
||||
{
|
||||
int err;
|
||||
|
||||
// allocate per-session data structure
|
||||
private_t* tech_pvt = (private_t *) switch_core_session_alloc(session, sizeof(private_t));
|
||||
if (!tech_pvt) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "error allocating memory!\n");
|
||||
return SWITCH_STATUS_FALSE;
|
||||
}
|
||||
|
||||
if (SWITCH_STATUS_SUCCESS != fork_data_init(tech_pvt, session, samples_per_second, 8000, channels, lang, interim, bugname, responseHandler)) {
|
||||
destroy_tech_pvt(tech_pvt);
|
||||
return SWITCH_STATUS_FALSE;
|
||||
}
|
||||
|
||||
*ppUserData = tech_pvt;
|
||||
|
||||
assemblyai::AudioPipe *pAudioPipe = static_cast<assemblyai::AudioPipe *>(tech_pvt->pAudioPipe);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connecting now\n");
|
||||
pAudioPipe->connect();
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection in progress\n");
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
switch_status_t aai_transcribe_session_stop(switch_core_session_t *session,int channelIsClosing, char* bugname) {
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, MY_BUG_NAME);
|
||||
if (!bug) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aai_transcribe_session_stop: no bug - websocket conection already closed\n");
|
||||
return SWITCH_STATUS_FALSE;
|
||||
}
|
||||
private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
|
||||
uint32_t id = tech_pvt->id;
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) aai_transcribe_session_stop\n", id);
|
||||
|
||||
if (!tech_pvt) return SWITCH_STATUS_FALSE;
|
||||
|
||||
// close connection and get final responses
|
||||
switch_mutex_lock(tech_pvt->mutex);
|
||||
switch_channel_set_private(channel, bugname, NULL);
|
||||
if (!channelIsClosing) switch_core_media_bug_remove(session, &bug);
|
||||
|
||||
assemblyai::AudioPipe *pAudioPipe = static_cast<assemblyai::AudioPipe *>(tech_pvt->pAudioPipe);
|
||||
if (pAudioPipe) {
|
||||
//TODO: I think here we should call a method on pAudioPipe to send a terminate session message to assemblyai
|
||||
//see: https://www.assemblyai.com/docs/guides/real-time-streaming-transcription#terminating-a-session
|
||||
reaper(tech_pvt);
|
||||
}
|
||||
destroy_tech_pvt(tech_pvt);
|
||||
switch_mutex_unlock(tech_pvt->mutex);
|
||||
switch_mutex_destroy(tech_pvt->mutex);
|
||||
tech_pvt->mutex = nullptr;
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) aai_transcribe_session_stop\n", id);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
  // Media-bug read callback: drains queued audio frames from the bug and
  // appends them to the AudioPipe's binary send buffer, resampling when a
  // resampler exists. Uses a trylock so the RTP/media thread never blocks;
  // if the lock is busy or the websocket is not yet connected the frames
  // are picked up on a later callback. Always returns SWITCH_TRUE so the
  // bug stays attached.
  switch_bool_t aai_transcribe_frame(switch_core_session_t *session, switch_media_bug_t *bug) {
    private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
    size_t inuse = 0;                                   // NOTE(review): unused
    bool dirty = false;                                 // NOTE(review): set but never read
    char *p = (char *) "{\"msg\": \"buffer overrun\"}"; // NOTE(review): unused

    if (!tech_pvt) return SWITCH_TRUE;

    if (switch_mutex_trylock(tech_pvt->mutex) == SWITCH_STATUS_SUCCESS) {
      // pipe may have been handed to the reaper already
      if (!tech_pvt->pAudioPipe) {
        switch_mutex_unlock(tech_pvt->mutex);
        return SWITCH_TRUE;
      }
      assemblyai::AudioPipe *pAudioPipe = static_cast<assemblyai::AudioPipe *>(tech_pvt->pAudioPipe);
      if (pAudioPipe->getLwsState() != assemblyai::AudioPipe::LWS_CLIENT_CONNECTED) {
        switch_mutex_unlock(tech_pvt->mutex);
        return SWITCH_TRUE;
      }

      pAudioPipe->lockAudioBuffer();
      size_t available = pAudioPipe->binarySpaceAvailable();
      if (NULL == tech_pvt->resampler) {
        // no resampling: read frames straight into the pipe's write pointer
        switch_frame_t frame = { 0 };
        frame.data = pAudioPipe->binaryWritePtr();
        frame.buflen = available;
        while (true) {

          // check if buffer would be overwritten; dump packets if so
          if (available < pAudioPipe->binaryMinSpace()) {
            if (!tech_pvt->buffer_overrun_notified) {
              // notify the application only once per overrun episode
              tech_pvt->buffer_overrun_notified = 1;
              tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_BUFFER_OVERRUN, NULL, tech_pvt->bugname, 0);
            }
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%u) dropping packets!\n",
              tech_pvt->id);
            pAudioPipe->binaryWritePtrResetToZero();

            frame.data = pAudioPipe->binaryWritePtr();
            frame.buflen = available = pAudioPipe->binarySpaceAvailable();
          }

          switch_status_t rv = switch_core_media_bug_read(bug, &frame, SWITCH_TRUE);
          if (rv != SWITCH_STATUS_SUCCESS) break;  // no more queued frames
          if (frame.datalen) {
            pAudioPipe->binaryWritePtrAdd(frame.datalen);
            frame.buflen = available = pAudioPipe->binarySpaceAvailable();
            frame.data = pAudioPipe->binaryWritePtr();
            dirty = true;
          }
        }
      }
      else {
        // resampling path: read into a stack bounce buffer, resample into the pipe
        uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
        switch_frame_t frame = { 0 };
        frame.data = data;
        frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;
        while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS) {
          if (frame.datalen) {
            spx_uint32_t out_len = available >> 1; // space for samples which are 2 bytes
            spx_uint32_t in_len = frame.samples;

            speex_resampler_process_interleaved_int(tech_pvt->resampler,
              (const spx_int16_t *) frame.data,
              (spx_uint32_t *) &in_len,
              (spx_int16_t *) ((char *) pAudioPipe->binaryWritePtr()),
              &out_len);

            if (out_len > 0) {
              // bytes written = num samples * 2 * num channels
              // NOTE(review): '<<' multiplies by 2^channels, which equals
              // 2*channels only for 1 or 2 channels -- confirm intent if
              // more channels are ever supported.
              size_t bytes_written = out_len << tech_pvt->channels;
              pAudioPipe->binaryWritePtrAdd(bytes_written);
              available = pAudioPipe->binarySpaceAvailable();
              dirty = true;
            }
            if (available < pAudioPipe->binaryMinSpace()) {
              if (!tech_pvt->buffer_overrun_notified) {
                tech_pvt->buffer_overrun_notified = 1;
                switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%u) dropping packets!\n",
                  tech_pvt->id);
                tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_BUFFER_OVERRUN, NULL, tech_pvt->bugname, 0);
              }
              break;
            }
          }
        }
      }

      pAudioPipe->unlockAudioBuffer();
      switch_mutex_unlock(tech_pvt->mutex);
    }
    return SWITCH_TRUE;
  }
|
||||
}
|
||||
11
mod_assemblyai_transcribe/aai_transcribe_glue.h
Normal file
11
mod_assemblyai_transcribe/aai_transcribe_glue.h
Normal file
@@ -0,0 +1,11 @@
|
||||
#ifndef __DG_GLUE_H__
|
||||
#define __DG_GLUE_H__
|
||||
|
||||
switch_status_t aai_transcribe_init();
|
||||
switch_status_t aai_transcribe_cleanup();
|
||||
switch_status_t aai_transcribe_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
|
||||
uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char* bugname, void **ppUserData);
|
||||
switch_status_t aai_transcribe_session_stop(switch_core_session_t *session, int channelIsClosing, char* bugname);
|
||||
switch_bool_t aai_transcribe_frame(switch_core_session_t *session, switch_media_bug_t *bug);
|
||||
|
||||
#endif
|
||||
521
mod_assemblyai_transcribe/audio_pipe.cpp
Normal file
521
mod_assemblyai_transcribe/audio_pipe.cpp
Normal file
@@ -0,0 +1,521 @@
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include "audio_pipe.hpp"
|
||||
#include "base64.hpp"
|
||||
|
||||
/* discard incoming text messages over the socket that are longer than this */
|
||||
#define MAX_RECV_BUF_SIZE (65 * 1024 * 10)
|
||||
#define RECV_BUF_REALLOC_SIZE (8 * 1024)
|
||||
|
||||
using namespace assemblyai;
|
||||
|
||||
namespace {
  // TCP keepalive interval (seconds) applied to the websocket connection;
  // overridable via MOD_AUDIO_FORK_TCP_KEEPALIVE_SECS, default 55.
  static const char *requestedTcpKeepaliveSecs = std::getenv("MOD_AUDIO_FORK_TCP_KEEPALIVE_SECS");
  static int nTcpKeepaliveSecs = requestedTcpKeepaliveSecs ? ::atoi(requestedTcpKeepaliveSecs) : 55;
}
|
||||
|
||||
// Writes "Token <apiKey>" into buf for use as an Authorization header
// value. Returns 0 on success, 1 when buf (of capacity len) cannot hold
// the prefixed key plus its NUL terminator; buf is untouched on failure.
static int dch_lws_http_basic_auth_gen(const char *apiKey, char *buf, size_t len) {
  static const char prefix[] = "Token ";
  size_t needed = strlen(apiKey) + sizeof(prefix); /* sizeof includes the NUL */

  if (len < needed)
    return 1;

  memcpy(buf, prefix, sizeof(prefix) - 1);
  strcpy(buf + sizeof(prefix) - 1, apiKey);
  return 0;
}
|
||||
|
||||
int AudioPipe::lws_callback(struct lws *wsi,
|
||||
enum lws_callback_reasons reason,
|
||||
void *user, void *in, size_t len) {
|
||||
|
||||
struct AudioPipe::lws_per_vhost_data *vhd =
|
||||
(struct AudioPipe::lws_per_vhost_data *) lws_protocol_vh_priv_get(lws_get_vhost(wsi), lws_get_protocol(wsi));
|
||||
|
||||
struct lws_vhost* vhost = lws_get_vhost(wsi);
|
||||
AudioPipe ** ppAp = (AudioPipe **) user;
|
||||
|
||||
switch (reason) {
|
||||
case LWS_CALLBACK_PROTOCOL_INIT:
|
||||
vhd = (struct AudioPipe::lws_per_vhost_data *) lws_protocol_vh_priv_zalloc(lws_get_vhost(wsi), lws_get_protocol(wsi), sizeof(struct AudioPipe::lws_per_vhost_data));
|
||||
vhd->context = lws_get_context(wsi);
|
||||
vhd->protocol = lws_get_protocol(wsi);
|
||||
vhd->vhost = lws_get_vhost(wsi);
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_APPEND_HANDSHAKE_HEADER:
|
||||
{
|
||||
AudioPipe* ap = findPendingConnect(wsi);
|
||||
if (ap) {
|
||||
std::string apiKey = ap->getApiKey();
|
||||
unsigned char **p = (unsigned char **)in, *end = (*p) + len;
|
||||
char b[256];
|
||||
memset(b, 0, sizeof(b));
|
||||
strcpy(b, apiKey.c_str());
|
||||
|
||||
if (lws_add_http_header_by_token(wsi, WSI_TOKEN_HTTP_AUTHORIZATION, (unsigned char *)b, strlen(b), p, end)) return -1;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_EVENT_WAIT_CANCELLED:
|
||||
processPendingConnects(vhd);
|
||||
processPendingDisconnects(vhd);
|
||||
processPendingWrites();
|
||||
break;
|
||||
case LWS_CALLBACK_CLIENT_CONNECTION_ERROR:
|
||||
{
|
||||
AudioPipe* ap = findAndRemovePendingConnect(wsi);
|
||||
int rc = lws_http_client_http_response(wsi);
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CONNECTION_ERROR: %s, response status %d\n", in ? (char *)in : "(null)", rc);
|
||||
if (ap) {
|
||||
ap->m_state = LWS_CLIENT_FAILED;
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECT_FAIL, (char *) in, ap->isFinished());
|
||||
}
|
||||
else {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CONNECTION_ERROR unable to find wsi %p..\n", wsi);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_ESTABLISHED:
|
||||
{
|
||||
AudioPipe* ap = findAndRemovePendingConnect(wsi);
|
||||
if (ap) {
|
||||
*ppAp = ap;
|
||||
ap->m_vhd = vhd;
|
||||
ap->m_state = LWS_CLIENT_CONNECTED;
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECT_SUCCESS, NULL, ap->isFinished());
|
||||
}
|
||||
else {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_ESTABLISHED %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case LWS_CALLBACK_CLIENT_CLOSED:
|
||||
{
|
||||
AudioPipe* ap = *ppAp;
|
||||
if (!ap) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CLOSED %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
return 0;
|
||||
}
|
||||
if (ap->m_state == LWS_CLIENT_DISCONNECTING) {
|
||||
// closed by us
|
||||
|
||||
lwsl_debug("%s socket closed by us\n", ap->m_uuid.c_str());
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECTION_CLOSED_GRACEFULLY, NULL, ap->isFinished());
|
||||
}
|
||||
else if (ap->m_state == LWS_CLIENT_CONNECTED) {
|
||||
// closed by far end
|
||||
lwsl_info("%s socket closed by far end\n", ap->m_uuid.c_str());
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECTION_DROPPED, NULL, ap->isFinished());
|
||||
}
|
||||
ap->m_state = LWS_CLIENT_DISCONNECTED;
|
||||
ap->setClosed();
|
||||
|
||||
//NB: after receiving any of the events above, any holder of a
|
||||
//pointer or reference to this object must treat is as no longer valid
|
||||
|
||||
//*ppAp = NULL;
|
||||
//delete ap;
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_RECEIVE:
|
||||
{
|
||||
AudioPipe* ap = *ppAp;
|
||||
if (!ap) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (lws_frame_is_binary(wsi)) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE received binary frame, discarding.\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (lws_is_first_fragment(wsi)) {
|
||||
// allocate a buffer for the entire chunk of memory needed
|
||||
assert(nullptr == ap->m_recv_buf);
|
||||
ap->m_recv_buf_len = len + lws_remaining_packet_payload(wsi);
|
||||
ap->m_recv_buf = (uint8_t*) malloc(ap->m_recv_buf_len);
|
||||
ap->m_recv_buf_ptr = ap->m_recv_buf;
|
||||
}
|
||||
|
||||
size_t write_offset = ap->m_recv_buf_ptr - ap->m_recv_buf;
|
||||
size_t remaining_space = ap->m_recv_buf_len - write_offset;
|
||||
if (remaining_space < len) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE buffer realloc needed.\n");
|
||||
size_t newlen = ap->m_recv_buf_len + RECV_BUF_REALLOC_SIZE;
|
||||
if (newlen > MAX_RECV_BUF_SIZE) {
|
||||
free(ap->m_recv_buf);
|
||||
ap->m_recv_buf = ap->m_recv_buf_ptr = nullptr;
|
||||
ap->m_recv_buf_len = 0;
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE max buffer exceeded, truncating message.\n");
|
||||
}
|
||||
else {
|
||||
ap->m_recv_buf = (uint8_t*) realloc(ap->m_recv_buf, newlen);
|
||||
if (nullptr != ap->m_recv_buf) {
|
||||
ap->m_recv_buf_len = newlen;
|
||||
ap->m_recv_buf_ptr = ap->m_recv_buf + write_offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nullptr != ap->m_recv_buf) {
|
||||
if (len > 0) {
|
||||
memcpy(ap->m_recv_buf_ptr, in, len);
|
||||
ap->m_recv_buf_ptr += len;
|
||||
}
|
||||
if (lws_is_final_fragment(wsi)) {
|
||||
if (nullptr != ap->m_recv_buf) {
|
||||
std::string msg((char *)ap->m_recv_buf, ap->m_recv_buf_ptr - ap->m_recv_buf);
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::MESSAGE, msg.c_str(), ap->isFinished());
|
||||
if (nullptr != ap->m_recv_buf) free(ap->m_recv_buf);
|
||||
}
|
||||
ap->m_recv_buf = ap->m_recv_buf_ptr = nullptr;
|
||||
ap->m_recv_buf_len = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_WRITEABLE:
|
||||
{
|
||||
AudioPipe* ap = *ppAp;
|
||||
if (!ap) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_WRITEABLE %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// check for text frames to send
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(ap->m_text_mutex);
|
||||
if (ap->m_metadata.length() > 0) {
|
||||
uint8_t buf[ap->m_metadata.length() + LWS_PRE];
|
||||
memcpy(buf + LWS_PRE, ap->m_metadata.c_str(), ap->m_metadata.length());
|
||||
int n = ap->m_metadata.length();
|
||||
int m = lws_write(wsi, buf + LWS_PRE, n, LWS_WRITE_TEXT);
|
||||
ap->m_metadata.clear();
|
||||
if (m < n) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// there may be audio data, but only one write per writeable event
|
||||
// get it next time
|
||||
lws_callback_on_writable(wsi);
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (ap->m_state == LWS_CLIENT_DISCONNECTING) {
|
||||
lws_close_reason(wsi, LWS_CLOSE_STATUS_NORMAL, NULL, 0);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// check for audio packets
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(ap->m_audio_mutex);
|
||||
//TODO: we need to have at least 100ms buffered which is 5 packets at 320 bytes per packet
|
||||
if (ap->m_audio_buffer_write_offset > LWS_PRE) {
|
||||
size_t datalen = ap->m_audio_buffer_write_offset - LWS_PRE;
|
||||
if (datalen >= 1600) {
|
||||
std::ostringstream oss;
|
||||
oss << "{\"audio_data\":\"" << drachtio::base64_encode((unsigned char const *) ap->m_audio_buffer + LWS_PRE, datalen) << "\"}";
|
||||
std::string result = oss.str();
|
||||
uint8_t buf[result.length() + LWS_PRE];
|
||||
memcpy(buf + LWS_PRE,result.c_str(), result.length());
|
||||
int n = result.length();
|
||||
int m = lws_write(wsi, buf + LWS_PRE, n, LWS_WRITE_TEXT);
|
||||
if (m < n) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_WRITEABLE attemped to send %lu bytes only sent %d wsi %p..\n",
|
||||
n, m, wsi);
|
||||
}
|
||||
ap->m_audio_buffer_write_offset = LWS_PRE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return lws_callback_http_dummy(wsi, reason, user, in, len);
|
||||
}
|
||||
|
||||
|
||||
// static members
|
||||
// Retry/keepalive policy handed to lws: no automatic reconnect table and the
// ping/hangup validity timers are effectively disabled (UINT16_MAX seconds).
static const lws_retry_bo_t retry = {
  nullptr,     // retry_ms_table
  0,           // retry_ms_table_count
  0,           // conceal_count
  UINT16_MAX,  // secs_since_valid_ping
  UINT16_MAX,  // secs_since_valid_hangup
  0            // jitter_percent
};
|
||||
|
||||
// Static member definitions.  A single lws context is shared by the one
// service thread (support for multiple lws threads was removed).
struct lws_context *AudioPipe::context = nullptr;
std::string AudioPipe::protocolName;
std::mutex AudioPipe::mutex_connects;     // guards pendingConnects
std::mutex AudioPipe::mutex_disconnects;  // guards pendingDisconnects
std::mutex AudioPipe::mutex_writes;       // guards pendingWrites
std::list<AudioPipe*> AudioPipe::pendingConnects;
std::list<AudioPipe*> AudioPipe::pendingDisconnects;
std::list<AudioPipe*> AudioPipe::pendingWrites;
AudioPipe::log_emit_function AudioPipe::logger;
std::mutex AudioPipe::mapMutex;           // guards initialize/deinitialize
// Set by deinitialize() to stop the service loop.
// NOTE(review): plain bool read/written across threads -- consider std::atomic<bool>.
bool AudioPipe::stopFlag;
|
||||
|
||||
// Called on the lws service thread: pick up every pipe still in IDLE state,
// mark it CONNECTING under the lock, then start the actual client connects
// outside the lock.  Entries stay on pendingConnects until the connection
// resolves (see findAndRemovePendingConnect).
void AudioPipe::processPendingConnects(lws_per_vhost_data *vhd) {
  std::list<AudioPipe*> ready;
  {
    std::lock_guard<std::mutex> guard(mutex_connects);
    for (AudioPipe* candidate : pendingConnects) {
      if (LWS_CLIENT_IDLE == candidate->m_state) {
        candidate->m_state = LWS_CLIENT_CONNECTING;
        ready.push_back(candidate);
      }
    }
  }
  for (AudioPipe* candidate : ready) {
    candidate->connect_client(vhd);
  }
}
|
||||
|
||||
// Called on the lws service thread: collect pipes the application asked to
// close (state DISCONNECTING), clear the queue, and request a writeable
// callback on each wsi so the close handshake runs in CLIENT_WRITEABLE.
void AudioPipe::processPendingDisconnects(lws_per_vhost_data *vhd) {
  std::list<AudioPipe*> ready;
  {
    std::lock_guard<std::mutex> guard(mutex_disconnects);
    for (AudioPipe* candidate : pendingDisconnects) {
      if (LWS_CLIENT_DISCONNECTING == candidate->m_state) {
        ready.push_back(candidate);
      }
    }
    pendingDisconnects.clear();
  }
  for (AudioPipe* candidate : ready) {
    lws_callback_on_writable(candidate->m_wsi);
  }
}
|
||||
|
||||
void AudioPipe::processPendingWrites() {
|
||||
std::list<AudioPipe*> writes;
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(mutex_writes);
|
||||
for (auto it = pendingWrites.begin(); it != pendingWrites.end(); ++it) {
|
||||
if ((*it)->m_state == LWS_CLIENT_CONNECTED) writes.push_back(*it);
|
||||
}
|
||||
pendingWrites.clear();
|
||||
}
|
||||
for (auto it = writes.begin(); it != writes.end(); ++it) {
|
||||
AudioPipe* ap = *it;
|
||||
lws_callback_on_writable(ap->m_wsi);
|
||||
}
|
||||
}
|
||||
|
||||
// Locate the pipe whose connect attempt belongs to this wsi, remove it from
// the pending list, and return it (nullptr if not found).  As a side effect,
// entries whose wsi never got assigned (failed immediately) are pruned.
AudioPipe* AudioPipe::findAndRemovePendingConnect(struct lws *wsi) {
  std::lock_guard<std::mutex> guard(mutex_connects);
  AudioPipe* found = nullptr;
  std::list<AudioPipe*> stale;

  for (auto it = pendingConnects.begin(); !found && it != pendingConnects.end(); ++it) {
    AudioPipe* candidate = *it;
    if (nullptr == candidate->m_wsi) {
      stale.push_back(candidate);
    }
    if (LWS_CLIENT_CONNECTING == candidate->m_state && candidate->m_wsi == wsi) {
      found = candidate;
    }
  }

  for (AudioPipe* dead : stale) {
    pendingConnects.remove(dead);
  }
  if (found) {
    pendingConnects.remove(found);
  }
  return found;
}
|
||||
|
||||
// Non-destructive lookup: return the pipe currently connecting on this wsi,
// or nullptr if none matches.
AudioPipe* AudioPipe::findPendingConnect(struct lws *wsi) {
  std::lock_guard<std::mutex> guard(mutex_connects);
  for (AudioPipe* candidate : pendingConnects) {
    if (LWS_CLIENT_CONNECTING == candidate->m_state && candidate->m_wsi == wsi) {
      return candidate;
    }
  }
  return nullptr;
}
|
||||
|
||||
// Queue a pipe for connection and wake the service thread so the connect is
// picked up promptly by processPendingConnects().
void AudioPipe::addPendingConnect(AudioPipe* ap) {
  {
    std::lock_guard<std::mutex> guard(mutex_connects);
    pendingConnects.push_back(ap);
    lwsl_debug("%s after adding connect there are %lu pending connects\n",
      ap->m_uuid.c_str(), pendingConnects.size());
  }
  // interrupt lws_service() so the new entry is processed immediately
  lws_cancel_service(context);
}
|
||||
// Mark a pipe as closing, queue it for the service thread, and wake the
// lws event loop so processPendingDisconnects() runs promptly.
void AudioPipe::addPendingDisconnect(AudioPipe* ap) {
  ap->m_state = LWS_CLIENT_DISCONNECTING;
  {
    std::lock_guard<std::mutex> guard(mutex_disconnects);
    pendingDisconnects.push_back(ap);
    lwsl_debug("%s after adding disconnect there are %lu pending disconnects\n",
      ap->m_uuid.c_str(), pendingDisconnects.size());
  }
  lws_cancel_service(ap->m_vhd->context);
}
|
||||
// Queue a pipe that has data ready to send and wake the lws event loop so
// processPendingWrites() schedules a writeable callback.
void AudioPipe::addPendingWrite(AudioPipe* ap) {
  {
    std::lock_guard<std::mutex> guard(mutex_writes);
    pendingWrites.push_back(ap);
  }
  lws_cancel_service(ap->m_vhd->context);
}
|
||||
|
||||
// Body of the single lws service thread (launched detached by initialize()):
// creates the lws context, runs the service loop until stopFlag is set or
// lws_service fails, then destroys the context.  Returns false only if
// context creation fails.
bool AudioPipe::lws_service_thread() {
  struct lws_context_creation_info info;

  // single protocol entry; per-wsi user data is one AudioPipe* back-pointer
  const struct lws_protocols protocols[] = {
    {
      "",
      AudioPipe::lws_callback,
      sizeof(void *),
      1024,
    },
    { NULL, NULL, 0, 0 }
  };

  memset(&info, 0, sizeof info);
  info.port = CONTEXT_PORT_NO_LISTEN;  // client-only context, no listening socket
  info.options = LWS_SERVER_OPTION_DO_SSL_GLOBAL_INIT;
  info.protocols = protocols;
  info.ka_time = nTcpKeepaliveSecs; // tcp keep-alive timer
  info.ka_probes = 4; // number of times to try ka before closing connection
  info.ka_interval = 5; // time between ka's
  info.timeout_secs = 10; // doc says timeout for "various processes involving network roundtrips"
  info.keepalive_timeout = 5; // seconds to allow remote client to hold on to an idle HTTP/1.1 connection
  info.timeout_secs_ah_idle = 10; // secs to allow a client to hold an ah without using it
  info.retry_and_idle_policy = &retry;

  lwsl_notice("AudioPipe::lws_service_thread creating context\n");

  context = lws_create_context(&info);
  if (!context) {
    lwsl_err("AudioPipe::lws_service_thread failed creating context\n");
    return false;
  }

  // Service loop; lws_cancel_service() from the addPending* methods wakes it
  // so queued connects/disconnects/writes are handled promptly.
  int n;
  do {
    n = lws_service(context, 0);
  } while (n >= 0 && !stopFlag);

  lwsl_notice("AudioPipe::lws_service_thread ending\n");
  lws_context_destroy(context);

  return true;
}
|
||||
|
||||
void AudioPipe::initialize(int loglevel, log_emit_function logger) {
|
||||
|
||||
lws_set_log_level(loglevel, logger);
|
||||
|
||||
lwsl_notice("AudioPipe::initialize starting\n");
|
||||
std::lock_guard<std::mutex> lock(mapMutex);
|
||||
std::thread t(&AudioPipe::lws_service_thread);
|
||||
stopFlag = false;
|
||||
t.detach();
|
||||
}
|
||||
|
||||
bool AudioPipe::deinitialize() {
|
||||
lwsl_notice("AudioPipe::deinitialize\n");
|
||||
std::lock_guard<std::mutex> lock(mapMutex);
|
||||
stopFlag = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
// instance members
|
||||
// Construct a pipe for one transcription session.  The audio staging buffer
// is allocated immediately; the websocket is connected later via connect().
//
// Fixes vs. original: m_recv_buf_len and m_sslFlags were never initialized
// (indeterminate values), and the init list is now in declaration order to
// match the header (silences -Wreorder; actual init order is unchanged since
// members always initialize in declaration order).
AudioPipe::AudioPipe(const char* uuid, const char* host, unsigned int port, const char* path,
  size_t bufLen, size_t minFreespace, const char* apiKey, notifyHandler_t callback) :
  m_state(LWS_CLIENT_IDLE), m_uuid(uuid), m_host(host), m_port(port), m_path(path),
  m_sslFlags(0), m_wsi(nullptr),
  m_audio_buffer_max_len(bufLen), m_audio_buffer_write_offset(LWS_PRE),
  m_audio_buffer_min_freespace(minFreespace),
  m_recv_buf(nullptr), m_recv_buf_ptr(nullptr), m_recv_buf_len(0),
  m_vhd(nullptr), m_callback(callback), m_apiKey(apiKey),
  m_gracefulShutdown(false), m_finished(false) {

  // staging buffer for outbound audio; first LWS_PRE bytes reserved for lws
  m_audio_buffer = new uint8_t[m_audio_buffer_max_len];
}
|
||||
AudioPipe::~AudioPipe() {
  // allocated with new[] in the constructor; delete[] on nullptr is a no-op
  delete [] m_audio_buffer;
  // m_recv_buf is allocated with malloc()/realloc() in the receive path
  // (LWS_CALLBACK_CLIENT_RECEIVE), so it must be released with free();
  // the previous 'delete [] m_recv_buf' mismatched the allocator (UB).
  free(m_recv_buf);  // free(NULL) is a no-op
}
|
||||
|
||||
// Queue this pipe for connection; the actual websocket connect happens on
// the lws service thread (see processPendingConnects / connect_client).
void AudioPipe::connect(void) {
  addPendingConnect(this);
}
|
||||
|
||||
// Initiate the TLS websocket client connection for this pipe; runs on the
// lws service thread.  Returns true if lws accepted the connect attempt --
// completion is reported asynchronously via LWS_CALLBACK_CLIENT_ESTABLISHED
// or CLIENT_CONNECTION_ERROR.
bool AudioPipe::connect_client(struct lws_per_vhost_data *vhd) {
  assert(m_audio_buffer != nullptr);
  assert(m_vhd == nullptr);
  struct lws_client_connect_info i;

  memset(&i, 0, sizeof(i));
  i.context = vhd->context;
  i.port = m_port;
  i.address = m_host.c_str();
  i.path = m_path.c_str();
  i.host = i.address;     // SNI / Host header
  i.origin = i.address;
  i.ssl_connection = LCCSCF_USE_SSL;
  //i.protocol = protocolName.c_str();
  i.pwsi = &(m_wsi);

  m_state = LWS_CLIENT_CONNECTING;
  m_vhd = vhd;

  // NOTE(review): on a null return m_state stays LWS_CLIENT_CONNECTING and
  // m_vhd remains set -- confirm callers handle a false return correctly.
  m_wsi = lws_client_connect_via_info(&i);
  lwsl_debug("%s attempting connection, wsi is %p\n", m_uuid.c_str(), m_wsi);

  return nullptr != m_wsi;
}
|
||||
|
||||
void AudioPipe::bufferForSending(const char* text) {
|
||||
if (m_state != LWS_CLIENT_CONNECTED) return;
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(m_text_mutex);
|
||||
m_metadata.append(text);
|
||||
}
|
||||
addPendingWrite(this);
|
||||
}
|
||||
|
||||
// Release the audio-buffer lock taken via lockAudioBuffer(); if audio was
// staged (write offset advanced past the LWS_PRE preamble) schedule a write.
// NOTE: addPendingWrite is deliberately invoked while the mutex is still held.
void AudioPipe::unlockAudioBuffer() {
  if (m_audio_buffer_write_offset > LWS_PRE) addPendingWrite(this);
  m_audio_mutex.unlock();
}
|
||||
|
||||
// Request a graceful close; no-op unless currently connected.  The actual
// websocket close happens on the lws service thread.
void AudioPipe::close() {
  if (m_state != LWS_CLIENT_CONNECTED) return;
  addPendingDisconnect(this);
}
|
||||
|
||||
// Tell AssemblyAI to end the transcription session; idempotent, and only
// valid while connected.  Subsequent callbacks report finished == true.
void AudioPipe::finish() {
  if (m_finished || m_state != LWS_CLIENT_CONNECTED) return;
  m_finished = true;
  bufferForSending("{\"terminate_session\": true}");
}
|
||||
|
||||
void AudioPipe::waitForClose() {
|
||||
std::shared_future<void> sf(m_promise.get_future());
|
||||
sf.wait();
|
||||
return;
|
||||
}
|
||||
143
mod_assemblyai_transcribe/audio_pipe.hpp
Normal file
143
mod_assemblyai_transcribe/audio_pipe.hpp
Normal file
@@ -0,0 +1,143 @@
|
||||
// NOTE(review): identifiers beginning with a double underscore are reserved
// for the implementation; consider renaming the guard (e.g. AAI_AUDIO_PIPE_HPP).
#ifndef __AAI_AUDIO_PIPE_HPP__
#define __AAI_AUDIO_PIPE_HPP__

#include <string>
#include <list>
#include <mutex>
#include <future>
#include <queue>
#include <unordered_map>
#include <thread>

#include <libwebsockets.h>

namespace assemblyai {

// AudioPipe: one client websocket connection used to stream audio to the
// AssemblyAI realtime transcription service.  All lws activity runs on a
// single static service thread; application threads queue work through the
// static addPending* helpers and receive results via the notify callback.
class AudioPipe {
public:
  // Connection lifecycle states (driven by the lws callbacks).
  enum LwsState_t {
    LWS_CLIENT_IDLE,
    LWS_CLIENT_CONNECTING,
    LWS_CLIENT_CONNECTED,
    LWS_CLIENT_FAILED,
    LWS_CLIENT_DISCONNECTING,
    LWS_CLIENT_DISCONNECTED
  };
  // Events delivered to the notifyHandler_t callback.
  enum NotifyEvent_t {
    CONNECT_SUCCESS,
    CONNECT_FAIL,
    CONNECTION_DROPPED,            // closed by the far end
    CONNECTION_CLOSED_GRACEFULLY,  // closed by us
    MESSAGE                        // complete text frame received
  };
  typedef void (*log_emit_function)(int level, const char *line);
  typedef void (*notifyHandler_t)(const char *sessionId, NotifyEvent_t event, const char* message, bool finished);

  // Per-vhost data shared with the lws callbacks.
  struct lws_per_vhost_data {
    struct lws_context *context;
    struct lws_vhost *vhost;
    const struct lws_protocols *protocol;
  };

  // Process-wide setup/teardown of the single lws service thread.
  static void initialize(int loglevel, log_emit_function logger);
  static bool deinitialize();
  static bool lws_service_thread();

  // constructor
  AudioPipe(const char* uuid, const char* host, unsigned int port, const char* path,
    size_t bufLen, size_t minFreespace, const char* apiKey, notifyHandler_t callback);
  ~AudioPipe();

  LwsState_t getLwsState(void) { return m_state; }
  std::string& getApiKey(void) {
    return m_apiKey;
  }
  // Queue this pipe for (asynchronous) connection.
  void connect(void);
  // Queue a text frame for sending (no-op unless connected).
  void bufferForSending(const char* text);
  // Accessors for the staged-audio buffer; callers must bracket use with
  // lockAudioBuffer()/unlockAudioBuffer().
  size_t binarySpaceAvailable(void) {
    return m_audio_buffer_max_len - m_audio_buffer_write_offset;
  }
  size_t binaryMinSpace(void) {
    return m_audio_buffer_min_freespace;
  }
  char * binaryWritePtr(void) {
    return (char *) m_audio_buffer + m_audio_buffer_write_offset;
  }
  void binaryWritePtrAdd(size_t len) {
    m_audio_buffer_write_offset += len;
  }
  void binaryWritePtrResetToZero(void) {
    m_audio_buffer_write_offset = 0;
  }
  void lockAudioBuffer(void) {
    m_audio_mutex.lock();
  }
  void unlockAudioBuffer(void) ;

  void close() ;
  // Ask the service to terminate the transcription session.
  void finish();
  // Block until the socket is fully closed (setClosed called).
  void waitForClose();
  void setClosed() { m_promise.set_value(); }
  bool isFinished() { return m_finished;}

  // no default constructor or copying
  AudioPipe() = delete;
  AudioPipe(const AudioPipe&) = delete;
  void operator=(const AudioPipe&) = delete;

private:

  static int lws_callback(struct lws *wsi, enum lws_callback_reasons reason, void *user, void *in, size_t len);
  static struct lws_context *context;
  static std::string protocolName;
  // Each mutex guards its matching pending* list below.
  static std::mutex mutex_connects;
  static std::mutex mutex_disconnects;
  static std::mutex mutex_writes;
  static std::list<AudioPipe*> pendingConnects;
  static std::list<AudioPipe*> pendingDisconnects;
  static std::list<AudioPipe*> pendingWrites;
  static log_emit_function logger;

  static std::mutex mapMutex;
  static bool stopFlag;  // set by deinitialize() to end the service loop

  static AudioPipe* findAndRemovePendingConnect(struct lws *wsi);
  static AudioPipe* findPendingConnect(struct lws *wsi);
  static void addPendingConnect(AudioPipe* ap);
  static void addPendingDisconnect(AudioPipe* ap);
  static void addPendingWrite(AudioPipe* ap);
  static void processPendingConnects(lws_per_vhost_data *vhd);
  static void processPendingDisconnects(lws_per_vhost_data *vhd);
  static void processPendingWrites(void);

  bool connect_client(struct lws_per_vhost_data *vhd);

  LwsState_t m_state;
  std::string m_uuid;          // freeswitch session uuid
  std::string m_host;
  unsigned int m_port;
  std::string m_path;
  std::string m_metadata;      // queued outbound text, guarded by m_text_mutex
  std::mutex m_text_mutex;
  std::mutex m_audio_mutex;    // guards the staged-audio buffer below
  int m_sslFlags;
  struct lws *m_wsi;
  // Outbound audio staging buffer; first LWS_PRE bytes reserved for lws.
  uint8_t *m_audio_buffer;
  size_t m_audio_buffer_max_len;
  size_t m_audio_buffer_write_offset;
  size_t m_audio_buffer_min_freespace;
  // Inbound message reassembly buffer (malloc/realloc'd in the receive path).
  uint8_t* m_recv_buf;
  uint8_t* m_recv_buf_ptr;
  size_t m_recv_buf_len;
  struct lws_per_vhost_data* m_vhd;
  notifyHandler_t m_callback;
  log_emit_function m_logger;
  std::string m_apiKey;
  bool m_gracefulShutdown;
  bool m_finished;
  std::string m_bugname;
  std::promise<void> m_promise;  // fulfilled once on close; see waitForClose()
};

} // namespace assemblyai
#endif
|
||||
178
mod_assemblyai_transcribe/base64.hpp
Normal file
178
mod_assemblyai_transcribe/base64.hpp
Normal file
@@ -0,0 +1,178 @@
|
||||
/*
|
||||
******
|
||||
base64.hpp is a repackaging of the base64.cpp and base64.h files into a
|
||||
single header suitable for use as a header only library. This conversion was
|
||||
done by Peter Thorson (webmaster@zaphoyd.com) in 2012. All modifications to
|
||||
the code are redistributed under the same license as the original, which is
|
||||
listed below.
|
||||
******
|
||||
|
||||
base64.cpp and base64.h
|
||||
|
||||
Copyright (C) 2004-2008 René Nyffenegger
|
||||
|
||||
This source code is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the author be held liable for any damages
|
||||
arising from the use of this software.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
|
||||
1. The origin of this source code must not be misrepresented; you must not
|
||||
claim that you wrote the original source code. If you use this source code
|
||||
in a product, an acknowledgment in the product documentation would be
|
||||
appreciated but is not required.
|
||||
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original source code.
|
||||
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
|
||||
René Nyffenegger rene.nyffenegger@adp-gmbh.ch
|
||||
|
||||
*/
|
||||
|
||||
#ifndef _BASE64_HPP_
|
||||
#define _BASE64_HPP_
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace drachtio {
|
||||
|
||||
// Standard base64 alphabet (RFC 4648): index i maps the 6-bit value i to
// its encoded character.
static std::string const base64_chars =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  "abcdefghijklmnopqrstuvwxyz"
  "0123456789+/";
|
||||
|
||||
/// Test whether a character is a valid base64 character
/**
 * @param c The character to test
 * @return true if c is a valid base64 character ('+', '/', 0-9, A-Z, a-z)
 */
static inline bool is_base64(unsigned char c) {
  if (c == '+') return true;
  if (c >= '/' && c <= '9') return true;  // '/' (47) plus digits 0-9
  if (c >= 'A' && c <= 'Z') return true;
  return (c >= 'a' && c <= 'z');
}
|
||||
|
||||
/// Encode a char buffer into a base64 string
|
||||
/**
|
||||
* @param input The input data
|
||||
* @param len The length of input in bytes
|
||||
* @return A base64 encoded string representing input
|
||||
*/
|
||||
inline std::string base64_encode(unsigned char const * input, size_t len) {
|
||||
std::string ret;
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
unsigned char char_array_3[3];
|
||||
unsigned char char_array_4[4];
|
||||
|
||||
while (len--) {
|
||||
char_array_3[i++] = *(input++);
|
||||
if (i == 3) {
|
||||
char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
|
||||
char_array_4[1] = ((char_array_3[0] & 0x03) << 4) +
|
||||
((char_array_3[1] & 0xf0) >> 4);
|
||||
char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) +
|
||||
((char_array_3[2] & 0xc0) >> 6);
|
||||
char_array_4[3] = char_array_3[2] & 0x3f;
|
||||
|
||||
for(i = 0; (i <4) ; i++) {
|
||||
ret += base64_chars[char_array_4[i]];
|
||||
}
|
||||
i = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (i) {
|
||||
for(j = i; j < 3; j++) {
|
||||
char_array_3[j] = '\0';
|
||||
}
|
||||
|
||||
char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
|
||||
char_array_4[1] = ((char_array_3[0] & 0x03) << 4) +
|
||||
((char_array_3[1] & 0xf0) >> 4);
|
||||
char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) +
|
||||
((char_array_3[2] & 0xc0) >> 6);
|
||||
char_array_4[3] = char_array_3[2] & 0x3f;
|
||||
|
||||
for (j = 0; (j < i + 1); j++) {
|
||||
ret += base64_chars[char_array_4[j]];
|
||||
}
|
||||
|
||||
while((i++ < 3)) {
|
||||
ret += '=';
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/// Encode a string into a base64 string
|
||||
/**
|
||||
* @param input The input data
|
||||
* @return A base64 encoded string representing input
|
||||
*/
|
||||
inline std::string base64_encode(std::string const & input) {
|
||||
return base64_encode(
|
||||
reinterpret_cast<const unsigned char *>(input.data()),
|
||||
input.size()
|
||||
);
|
||||
}
|
||||
|
||||
/// Decode a base64 encoded string into a string of raw bytes
/**
 * @param input The base64 encoded input data
 * @return A string representing the decoded raw bytes
 *
 * Decoding stops silently at the first '=' (padding) or any character
 * outside the base64 alphabet.
 */
inline std::string base64_decode(std::string const & input) {
  size_t in_len = input.size();
  int i = 0;
  int j = 0;
  int in_ = 0;
  unsigned char char_array_4[4], char_array_3[3];
  std::string ret;

  // Consume complete 4-character groups.
  while (in_len-- && ( input[in_] != '=') && is_base64(input[in_])) {
    char_array_4[i++] = input[in_]; in_++;
    if (i ==4) {
      // map each alphabet character back to its 6-bit value
      for (i = 0; i <4; i++) {
        char_array_4[i] = static_cast<unsigned char>(base64_chars.find(char_array_4[i]));
      }

      // repack four 6-bit values into three raw bytes
      char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
      char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
      char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];

      for (i = 0; (i < 3); i++) {
        ret += char_array_3[i];
      }
      i = 0;
    }
  }

  // Final partial group (a padded input ends with 2 or 3 valid characters).
  if (i) {
    for (j = i; j <4; j++)
      char_array_4[j] = 0;

    for (j = 0; j <4; j++)
      char_array_4[j] = static_cast<unsigned char>(base64_chars.find(char_array_4[j]));

    char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
    char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
    char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];

    // i valid input characters decode to i-1 raw bytes
    for (j = 0; (j < i - 1); j++) {
      ret += static_cast<std::string::value_type>(char_array_3[j]);
    }
  }

  return ret;
}
|
||||
|
||||
} // namespace drachtio
|
||||
|
||||
#endif // _BASE64_HPP_
|
||||
211
mod_assemblyai_transcribe/mod_assemblyai_transcribe.c
Normal file
211
mod_assemblyai_transcribe/mod_assemblyai_transcribe.c
Normal file
@@ -0,0 +1,211 @@
|
||||
/*
|
||||
*
|
||||
* mod_assemblyai_transcribe.c -- Freeswitch module for using assemblyai streaming transcribe api
|
||||
*
|
||||
*/
|
||||
#include "mod_assemblyai_transcribe.h"
|
||||
#include "aai_transcribe_glue.h"
|
||||
|
||||
/* Prototypes */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_assemblyai_transcribe_shutdown);
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_assemblyai_transcribe_load);
|
||||
|
||||
SWITCH_MODULE_DEFINITION(mod_assemblyai_transcribe, mod_assemblyai_transcribe_load, mod_assemblyai_transcribe_shutdown, NULL);
|
||||
|
||||
static switch_status_t do_stop(switch_core_session_t *session, char* bugname);
|
||||
|
||||
/* Fire a custom FreeSWITCH event carrying a transcription result (or other
 * notification) for this session.
 *
 * session   the channel the transcription belongs to
 * eventName custom event subclass name
 * json      event body (may be NULL)
 * bugname   media bug name, added as a header when non-NULL
 * finished  non-zero once the recognition session has ended
 */
static void responseHandler(switch_core_session_t* session,
  const char* eventName, const char * json, const char* bugname, int finished) {
  switch_event_t *event;
  switch_channel_t *channel = switch_core_session_get_channel(session);

  /* Fix: the return value was previously ignored; on allocation failure a
   * NULL event would have been dereferenced below. */
  if (switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, eventName) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "responseHandler failed to create event %s\n", eventName);
    return;
  }
  switch_channel_event_set_data(channel, event);
  switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "assemblyai");
  switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-session-finished", finished ? "true" : "false");
  if (finished) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "responseHandler returning event %s, from finished recognition session\n", eventName);
  }
  if (json) switch_event_add_body(event, "%s", json);
  if (bugname) switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "media-bugname", bugname);
  switch_event_fire(&event);
}
|
||||
|
||||
|
||||
/* Media-bug callback: forwards read-side audio frames to the AssemblyAI
 * session and tears the session down when the bug closes.  Returning
 * SWITCH_FALSE from the READ case removes the bug. */
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
{
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);

  switch (type) {
  case SWITCH_ABC_TYPE_INIT:
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_INIT.\n");
    break;

  case SWITCH_ABC_TYPE_CLOSE:
    {
      private_t *tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_CLOSE.\n");

      /* channel is closing: end the transcription session (channelIsClosing=1) */
      aai_transcribe_session_stop(session, 1, tech_pvt->bugname);
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Finished SWITCH_ABC_TYPE_CLOSE.\n");
    }
    break;

  case SWITCH_ABC_TYPE_READ:
    /* stream the captured frame to the transcription session */
    return aai_transcribe_frame(session, bug);

  case SWITCH_ABC_TYPE_WRITE:
  default:
    break;
  }

  return SWITCH_TRUE;
}
|
||||
|
||||
/* Start transcription on a session: tear down any previous bug, initialize
 * the AssemblyAI session, and attach a media bug that feeds audio through
 * capture_callback.  Returns SWITCH_STATUS_SUCCESS on success. */
static switch_status_t start_capture(switch_core_session_t *session, switch_media_bug_flag_t flags,
  char* lang, int interim, char* bugname)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug;
  switch_status_t status;
  switch_codec_implementation_t read_impl = { 0 };
  void *pUserData;
  uint32_t samples_per_second;

  /* only one transcribe bug per channel; replace an existing one */
  if (switch_channel_get_private(channel, MY_BUG_NAME)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing bug from previous transcribe\n");
    do_stop(session, bugname);
  }

  switch_core_session_get_read_impl(session, &read_impl);

  if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
    return SWITCH_STATUS_FALSE;
  }

  /* g722 reports 16kHz actual vs 8kHz nominal; use the actual rate there */
  samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;

  if (SWITCH_STATUS_FALSE == aai_transcribe_session_init(session, responseHandler, samples_per_second, flags & SMBF_STEREO ? 2 : 1, lang, interim, bugname, &pUserData)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing assemblyai speech session.\n");
    return SWITCH_STATUS_FALSE;
  }
  /* NOTE(review): if bug_add fails, the session initialized above is not torn
   * down here -- confirm whether aai_transcribe_session_stop should be called
   * on this error path to avoid leaking the transcription session. */
  if ((status = switch_core_media_bug_add(session, "aai_transcribe", NULL, capture_callback, pUserData, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
    return status;
  }
  switch_channel_set_private(channel, MY_BUG_NAME, bug);
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "added media bug for assemblyai transcribe\n");

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/* Stop an active transcription on this session, if any.  Returns the status
 * from aai_transcribe_session_stop, or SWITCH_STATUS_SUCCESS when no bug
 * is attached. */
static switch_status_t do_stop(switch_core_session_t *session, char* bugname)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = switch_channel_get_private(channel, MY_BUG_NAME);
  switch_status_t status = SWITCH_STATUS_SUCCESS;

  if (bug) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Received user command command to stop transcribe.\n");
    status = aai_transcribe_session_stop(session, 0, bugname);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "stopped transcribe.\n");
  }

  return status;
}
|
||||
|
||||
#define TRANSCRIBE_API_SYNTAX "<uuid> [start|stop] lang-code [interim] [stereo|mono]"
|
||||
SWITCH_STANDARD_API(aai_transcribe_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[6] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
switch_media_bug_flag_t flags = SMBF_READ_STREAM /* | SMBF_WRITE_STREAM | SMBF_READ_PING */;
|
||||
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) ||
|
||||
(!strcasecmp(argv[1], "stop") && argc < 2) ||
|
||||
(!strcasecmp(argv[1], "start") && argc < 3) ||
|
||||
zstr(argv[0])) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
if (!strcasecmp(argv[1], "stop")) {
|
||||
char *bugname = argc > 2 ? argv[2] : MY_BUG_NAME;
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "stop transcribing\n");
|
||||
status = do_stop(lsession, bugname);
|
||||
} else if (!strcasecmp(argv[1], "start")) {
|
||||
char* lang = argv[2];
|
||||
int interim = argc > 3 && !strcmp(argv[3], "interim");
|
||||
char *bugname = argc > 5 ? argv[5] : MY_BUG_NAME;
|
||||
if (argc > 4 && !strcmp(argv[4], "stereo")) {
|
||||
flags |= SMBF_WRITE_STREAM ;
|
||||
flags |= SMBF_STEREO;
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "start transcribing %s %s\n", lang, interim ? "interim": "complete");
|
||||
status = start_capture(lsession, flags, lang, interim, bugname);
|
||||
}
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_assemblyai_transcribe_load)
|
||||
{
|
||||
switch_api_interface_t *api_interface;
|
||||
|
||||
/* create/register custom event message type */
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_RESULTS) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_RESULTS);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
|
||||
/* connect my internal structure to the blank pointer passed to me */
|
||||
*module_interface = switch_loadable_module_create_module_interface(pool, modname);
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Deepgram Speech Transcription API loading..\n");
|
||||
|
||||
if (SWITCH_STATUS_FALSE == aai_transcribe_init()) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Failed initializing dg speech interface\n");
|
||||
}
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Deepgram Speech Transcription API successfully loaded\n");
|
||||
|
||||
SWITCH_ADD_API(api_interface, "uuid_assemblyai_transcribe", "Deepgram Speech Transcription API", aai_transcribe_function, TRANSCRIBE_API_SYNTAX);
|
||||
switch_console_set_complete("add uuid_assemblyai_transcribe start lang-code [interim|final] [stereo|mono]");
|
||||
switch_console_set_complete("add uuid_assemblyai_transcribe stop ");
|
||||
|
||||
/* indicate that the module should continue to be loaded */
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
  Called when the system shuts down
  Macro expands to: switch_status_t mod_assemblyai_transcribe_shutdown() */
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_assemblyai_transcribe_shutdown)
{
	/* tear down the glue layer first (stops the lws service loop and any
	 * in-flight sessions) before releasing the event subclass it fires on */
	aai_transcribe_cleanup();
	switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS);
	return SWITCH_STATUS_SUCCESS;
}
|
||||
50
mod_assemblyai_transcribe/mod_assemblyai_transcribe.h
Normal file
50
mod_assemblyai_transcribe/mod_assemblyai_transcribe.h
Normal file
@@ -0,0 +1,50 @@
|
||||
|
||||
/* Shared definitions for mod_assemblyai_transcribe: event names, size
 * limits, and the per-bug private data carried on the media bug. */
#ifndef MOD_ASSEMBLYAI_TRANSCRIBE_H
/* Guard renamed from __MOD_AWS_TRANSCRIBE_H__: that name was copied from
 * the AWS module (risking a guard collision if both headers are included)
 * and double-underscore identifiers are reserved to the implementation. */
#define MOD_ASSEMBLYAI_TRANSCRIBE_H

#include <switch.h>
#include <speex/speex_resampler.h>

#include <unistd.h>

#define MY_BUG_NAME "assemblyai_transcribe"

/* Custom event subclasses fired toward the application. */
#define TRANSCRIBE_EVENT_RESULTS "assemblyai_transcribe::transcription"
#define TRANSCRIBE_EVENT_SESSION_BEGINS "assemblyai_transcribe::session_begins"
/* NOTE(review): "termanated" is misspelled, but consumers may already match
 * this exact string — confirm with jambonz before correcting it. */
#define TRANSCRIBE_EVENT_SESSION_TERMINATED "assemblyai_transcribe::session_termanated"
#define TRANSCRIBE_EVENT_ERROR "assemblyai_transcribe::error"
#define TRANSCRIBE_EVENT_NO_AUDIO_DETECTED "assemblyai_transcribe::no_audio_detected"
#define TRANSCRIBE_EVENT_VAD_DETECTED "assemblyai_transcribe::vad_detected"
#define TRANSCRIBE_EVENT_CONNECT_SUCCESS "assemblyai_transcribe::connect"
#define TRANSCRIBE_EVENT_CONNECT_FAIL "assemblyai_transcribe::connect_failed"
#define TRANSCRIBE_EVENT_BUFFER_OVERRUN "assemblyai_transcribe::buffer_overrun"
#define TRANSCRIBE_EVENT_DISCONNECT "assemblyai_transcribe::disconnect"

/* Fixed buffer sizes for the private_data string fields below. */
#define MAX_LANG (12)
#define MAX_SESSION_ID (256)
#define MAX_WS_URL_LEN (512)
#define MAX_PATH_LEN (4096)
#define MAX_BUG_LEN (64)

/* Callback invoked by the glue layer to deliver transcription events
 * (eventName is one of the TRANSCRIBE_EVENT_* strings above). */
typedef void (*responseHandler_t)(switch_core_session_t* session, const char* eventName, const char* json, const char* bugname, int finished);

/* Per-media-bug state attached to the channel while transcribing. */
struct private_data {
	switch_mutex_t *mutex;
	char sessionId[MAX_SESSION_ID];       /* freeswitch session uuid */
	SpeexResamplerState *resampler;       /* converts channel rate to the ws stream rate */
	responseHandler_t responseHandler;
	void *pAudioPipe;                     /* opaque AudioPipe* owned by the glue layer */
	int ws_state;
	char host[MAX_WS_URL_LEN];
	unsigned int port;
	char path[MAX_PATH_LEN];
	char bugname[MAX_BUG_LEN+1];
	int sampling;                         /* outbound sample rate (Hz) */
	int channels;
	unsigned int id;
	int buffer_overrun_notified:1;        /* set once so the overrun event fires only once */
	int is_finished:1;
};

typedef struct private_data private_t;

#endif
|
||||
21
mod_assemblyai_transcribe/parser.cpp
Normal file
21
mod_assemblyai_transcribe/parser.cpp
Normal file
@@ -0,0 +1,21 @@
|
||||
#include "parser.hpp"
|
||||
#include <switch.h>
|
||||
|
||||
/**
 * Parse an incoming websocket text frame as JSON.
 *
 * On success returns a cJSON object (caller owns it and must cJSON_Delete it)
 * and sets `type` to the frame's "type" attribute, or to "json" when no type
 * attribute is present.  Returns NULL (and logs on the session) when the
 * payload is not valid JSON.
 */
cJSON* parse_json(switch_core_session_t* session, const std::string& data, std::string& type) {
  cJSON* doc = cJSON_Parse(data.c_str());
  if (!doc) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "parse - failed parsing incoming msg as JSON: %s\n", data.c_str());
    return NULL;
  }

  const char* typeAttr = cJSON_GetObjectCstr(doc, "type");
  type.assign(typeAttr ? typeAttr : "json");
  return doc;
}
|
||||
9
mod_assemblyai_transcribe/parser.hpp
Normal file
9
mod_assemblyai_transcribe/parser.hpp
Normal file
@@ -0,0 +1,9 @@
|
||||
/* Declaration of the JSON frame parser shared by the module sources. */
#ifndef MOD_ASSEMBLYAI_TRANSCRIBE_PARSER_HPP
/* Guard renamed from __PARSER_H__: double-underscore identifiers are
 * reserved to the implementation, and such a generic name is prone to
 * collisions with other headers in the build. */
#define MOD_ASSEMBLYAI_TRANSCRIBE_PARSER_HPP

#include <string>
#include <switch_json.h>

/* Parse a websocket text frame; see parser.cpp for contract details.
 * Caller owns the returned cJSON object. */
cJSON* parse_json(switch_core_session_t* session, const std::string& data, std::string& type) ;

#endif
|
||||
51
mod_assemblyai_transcribe/simple_buffer.h
Normal file
51
mod_assemblyai_transcribe/simple_buffer.h
Normal file
@@ -0,0 +1,51 @@
|
||||
/**
 * (very) simple and limited circular buffer,
 * supporting only the use case of doing all of the adds
 * and then subsequently retrieves.
 *
 * Not thread-safe; caller provides any synchronization needed.
 */
class SimpleBuffer {
public:
  SimpleBuffer(uint32_t chunkSize, uint32_t numChunks) : numItems(0),
    m_numChunks(numChunks), m_chunkSize(chunkSize) {
    m_pData = new char[chunkSize * numChunks];
    m_pNextWrite = m_pData;
  }
  ~SimpleBuffer() {
    delete [] m_pData;
  }

  /* Append data, which must be a whole number of chunks; anything else is
   * silently dropped.  Once the buffer is full, older chunks are overwritten
   * and numItems saturates at m_numChunks. */
  void add(void *data, uint32_t datalen) {
    if (datalen % m_chunkSize != 0) return;
    int numChunks = datalen / m_chunkSize;
    for (int i = 0; i < numChunks; i++) {
      memcpy(m_pNextWrite, data, m_chunkSize);
      data = static_cast<char*>(data) + m_chunkSize;
      if (numItems < m_numChunks) numItems++;

      uint32_t offset = (m_pNextWrite - m_pData) / m_chunkSize;
      if (offset >= m_numChunks - 1) m_pNextWrite = m_pData;
      else m_pNextWrite += m_chunkSize;
    }
  }

  /* Retrieve the next chunk, or nullptr when empty.
   *
   * Fix vs. original: `if (numItems--)` post-decremented the unsigned count
   * even when it was already 0, wrapping it to UINT32_MAX so getNumItems()
   * reported a huge count and later calls returned stale data. */
  char* getNextChunk() {
    if (numItems == 0) return nullptr;
    numItems--;
    char *p = m_pNextWrite;
    uint32_t offset = (m_pNextWrite - m_pData) / m_chunkSize;
    if (offset >= m_numChunks - 1) m_pNextWrite = m_pData;
    else m_pNextWrite += m_chunkSize;
    return p;
  }

  uint32_t getNumItems() { return numItems;}

private:
  char *m_pData;        /* backing store: m_chunkSize * m_numChunks bytes */
  uint32_t numItems;    /* chunks currently stored (saturates at m_numChunks) */
  uint32_t m_chunkSize;
  uint32_t m_numChunks;
  char* m_pNextWrite;   /* cursor shared by add() and getNextChunk() */
};
|
||||
8
mod_audio_fork/LICENSE
Normal file
8
mod_audio_fork/LICENSE
Normal file
@@ -0,0 +1,8 @@
|
||||
Copyright 2023, Drachtio Communications Services, LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
10
mod_audio_fork/Makefile.am
Normal file
10
mod_audio_fork/Makefile.am
Normal file
@@ -0,0 +1,10 @@
|
||||
include $(top_srcdir)/build/modmake.rulesam
|
||||
MODNAME=mod_audio_fork
|
||||
|
||||
mod_LTLIBRARIES = mod_audio_fork.la
|
||||
mod_audio_fork_la_SOURCES = mod_audio_fork.c lws_glue.cpp parser.cpp audio_pipe.cpp
|
||||
mod_audio_fork_la_CFLAGS = $(AM_CFLAGS)
|
||||
mod_audio_fork_la_CXXFLAGS = $(AM_CXXFLAGS) -std=c++11
|
||||
|
||||
mod_audio_fork_la_LIBADD = $(switch_builddir)/libfreeswitch.la
|
||||
mod_audio_fork_la_LDFLAGS = -avoid-version -module -no-undefined -shared `pkg-config --libs libwebsockets`
|
||||
187
mod_audio_fork/README.md
Normal file
187
mod_audio_fork/README.md
Normal file
@@ -0,0 +1,187 @@
|
||||
# mod_audio_fork
|
||||
|
||||
A Freeswitch module that attaches a bug to a media server endpoint and streams L16 audio via websockets to a remote server. This module also supports receiving media from the server to play back to the caller, enabling the creation of full-fledged IVR or dialog-type applications.
|
||||
|
||||
#### Environment variables
|
||||
- MOD_AUDIO_FORK_SUBPROTOCOL_NAME - optional, name of the [websocket sub-protocol](https://tools.ietf.org/html/rfc6455#section-1.9) to advertise; defaults to "audio.drachtio.org"
|
||||
- MOD_AUDIO_FORK_SERVICE_THREADS - optional, number of libwebsocket service threads to create; these threads handling sending all messages for all sessions. Defaults to 1, but can be set to as many as 5.
|
||||
|
||||
## API
|
||||
|
||||
### Commands
|
||||
The freeswitch module exposes the following API commands:
|
||||
|
||||
```
|
||||
uuid_audio_fork <uuid> start <wss-url> <mix-type> <sampling-rate> <metadata>
|
||||
```
|
||||
Attaches media bug and starts streaming audio stream to the back-end server. Audio is streamed in linear 16 format (16-bit PCM encoding) with either one or two channels depending on the mix-type requested.
|
||||
- `uuid` - unique identifier of Freeswitch channel
|
||||
- `wss-url` - websocket url to connect and stream audio to
|
||||
- `mix-type` - choice of
|
||||
- "mono" - single channel containing caller's audio
|
||||
- "mixed" - single channel containing both caller and callee audio
|
||||
- "stereo" - two channels with caller audio in one and callee audio in the other.
|
||||
- `sampling-rate` - choice of
|
||||
- "8k" = 8000 Hz sample rate will be generated
|
||||
- "16k" = 16000 Hz sample rate will be generated
|
||||
- `metadata` - a text frame of arbitrary data to send to the back-end server immediately upon connecting. Once this text frame has been sent, the incoming audio will be sent in binary frames to the server.
|
||||
|
||||
```
|
||||
uuid_audio_fork <uuid> send_text <metadata>
|
||||
```
|
||||
Send a text frame of arbitrary data to the remote server (e.g. this can be used to notify of DTMF events).
|
||||
|
||||
```
|
||||
uuid_audio_fork <uuid> stop <metadata>
|
||||
```
|
||||
Closes websocket connection and detaches media bug, optionally sending a final text frame over the websocket connection before closing.
|
||||
|
||||
### Events
|
||||
An optional feature of this module is that it can receive JSON text frames from the server and generate associated events to an application. The format of the JSON text frames and the associated events are described below.
|
||||
|
||||
#### audio
|
||||
##### server JSON message
|
||||
The server can provide audio content to be played back to the caller by sending a JSON text frame like this:
|
||||
```json
|
||||
{
|
||||
"type": "playAudio",
|
||||
"data": {
|
||||
"audioContentType": "raw",
|
||||
"sampleRate": 8000,
|
||||
"audioContent": "base64 encoded raw audio..",
|
||||
"textContent": "Hi there! How can we help?"
|
||||
}
|
||||
}
|
||||
```
|
||||
The `audioContentType` value can be either `wave` or `raw`. If the latter, then `sampleRate` must be specified. The audio content itself is supplied as a base64 encoded string. The `textContent` attribute can optionally contain the text of the prompt. This allows an application to choose whether to play the raw audio or to use its own text-to-speech to play the text prompt.
|
||||
|
||||
Note that the module does _not_ directly play out the raw audio. Instead, it writes it to a temporary file and provides the path to the file in the event generated. It is left to the application to play out this file if it wishes to do so.
|
||||
##### Freeswitch event generated
|
||||
**Name**: mod_audio_fork::play_audio
|
||||
**Body**: JSON string
|
||||
```
|
||||
{
|
||||
"audioContentType": "raw",
|
||||
"sampleRate": 8000,
|
||||
"textContent": "Hi there! How can we help?",
|
||||
"file": "/tmp/7dd5e34e-5db4-4edb-a166-757e5d29b941_2.tmp.r8"
|
||||
}
|
||||
```
|
||||
Note the audioContent attribute has been replaced with the path to the file containing the audio. This temporary file will be removed when the Freeswitch session ends.
|
||||
#### killAudio
|
||||
##### server JSON message
|
||||
The server can provide a request to kill the current audio playback:
|
||||
```json
|
||||
{
  "type": "killAudio"
}
|
||||
```
|
||||
Any current audio being played to the caller will be immediately stopped. The event sent to the application is for information purposes only.
|
||||
|
||||
##### Freeswitch event generated
|
||||
**Name**: mod_audio_fork::kill_audio
|
||||
**Body**: JSON string - the data attribute from the server message
|
||||
|
||||
|
||||
#### transcription
|
||||
##### server JSON message
|
||||
The server can optionally provide transcriptions to the application in real-time:
|
||||
```json
|
||||
{
|
||||
"type": "transcription",
|
||||
"data": {
|
||||
|
||||
}
|
||||
}
|
||||
```
|
||||
The transcription data can be any JSON object; for instance, a server may choose to return a transcript and an associated confidence level. Whatever is provided as the `data` attribute will be attached to the generated event.
|
||||
|
||||
##### Freeswitch event generated
|
||||
**Name**: mod_audio_fork::transcription
|
||||
**Body**: JSON string - the data attribute from the server message
|
||||
|
||||
#### transfer
|
||||
##### server JSON message
|
||||
The server can optionally provide a request to transfer the call:
|
||||
```json
|
||||
{
|
||||
"type": "transfer",
|
||||
"data": {
|
||||
|
||||
}
|
||||
}
|
||||
```
|
||||
The transfer data can be any JSON object and is left for the application to determine how to handle it and accomplish the call transfer. Whatever is provided as the `data` attribute will be attached to the generated event.
|
||||
|
||||
##### Freeswitch event generated
|
||||
**Name**: mod_audio_fork::transfer
|
||||
**Body**: JSON string - the data attribute from the server message
|
||||
|
||||
#### disconnect
|
||||
##### server JSON message
|
||||
The server can optionally request to disconnect the caller:
|
||||
```json
|
||||
{
|
||||
"type": "disconnect"
|
||||
}
|
||||
```
|
||||
Note that the module _does not_ close the Freeswitch channel when a disconnect request is received. It is left for the application to determine whether to tear down the call.
|
||||
|
||||
##### Freeswitch event generated
|
||||
**Name**: mod_audio_fork::disconnect
|
||||
**Body**: none
|
||||
|
||||
#### error
|
||||
##### server JSON message
|
||||
The server can optionally report an error of some kind.
|
||||
```json
|
||||
{
|
||||
"type": "error",
|
||||
"data": {
|
||||
|
||||
}
|
||||
}
|
||||
```
|
||||
The error data can be any JSON object; it is left to the application to determine what, if any, action should be taken in response to an error. Whatever is provided as the `data` attribute will be attached to the generated event.
|
||||
|
||||
##### Freeswitch event generated
|
||||
**Name**: mod_audio_fork::error
|
||||
**Body**: JSON string - the data attribute from the server message
|
||||
|
||||
## Usage
|
||||
When using [drachtio-fsrmf](https://www.npmjs.com/package/drachtio-fsmrf), you can access this API command via the api method on the 'endpoint' object.
|
||||
```js
|
||||
const url = 'https://70f21a76.ngrok.io';
|
||||
const callerData = {to: '6173333456', from: '2061236666', callid: req.get('Call-Id')};
|
||||
ep.api('uuid_audio_fork', `${ep.uuid} start ${url} mono 8k ${JSON.stringify(callerData)}`);
|
||||
```
|
||||
or, from version 1.4.1 on, by using the Endpoint convenience methods:
|
||||
```js
|
||||
await ep.forkAudioStart({
|
||||
wsUrl,
|
||||
mixType: 'stereo',
|
||||
sampling: '16k',
|
||||
metadata
|
||||
});
|
||||
..
|
||||
ep.forkAudioSendText(moremetadata);
|
||||
..
|
||||
ep.forkAudioStop(evenmoremetadata);
|
||||
```
|
||||
Each of the methods above returns a promise that resolves when the api command has been executed, or throws an error.
|
||||
## Examples
|
||||
[audio_fork.js](../../examples/audio_fork.js) provides an example of an application that connects an incoming call to Freeswitch and then forks the audio to a remote websocket server.
|
||||
|
||||
To run this app, you can run [the simple websocket server provided](../../examples/ws_server.js) in a separate terminal. It will listen on port 3001 and will simply write the incoming raw audio to `/tmp/audio.raw` in linear16 format with no header or file container.
|
||||
|
||||
So in the first terminal window run:
|
||||
```
|
||||
node ws_server.js
|
||||
```
|
||||
And in the second window run:
|
||||
```
|
||||
node audio_fork.js http://localhost:3001
|
||||
```
|
||||
The app uses text-to-speech to play prompts, so you will need mod_google_tts loaded as well, and configured to use your GCS cloud credentials to access Google Cloud Text-to-Speech. (If you don't want to run mod_google_tts you can of course simply modify the application to remove the prompt; just be aware that you will hear silence when you connect, and should simply begin speaking after the call connects.)
|
||||
|
||||
|
||||
527
mod_audio_fork/audio_pipe.cpp
Normal file
527
mod_audio_fork/audio_pipe.cpp
Normal file
@@ -0,0 +1,527 @@
|
||||
#include "audio_pipe.hpp"
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
/* discard incoming text messages over the socket that are longer than this */
#define MAX_RECV_BUF_SIZE (65 * 1024 * 10)
#define RECV_BUF_REALLOC_SIZE (8 * 1024)

/* Configuration read once from the environment at load time.  basicAuthUser
 * and basicAuthPassword enable HTTP basic auth on the ws handshake when both
 * are set; nTcpKeepaliveSecs defaults to 55 seconds when the override env
 * var is absent. */
namespace {
  static const char* basicAuthUser = std::getenv("MOD_AUDIO_FORK_HTTP_AUTH_USER");
  static const char* basicAuthPassword = std::getenv("MOD_AUDIO_FORK_HTTP_AUTH_PASSWORD");

  static const char *requestedTcpKeepaliveSecs = std::getenv("MOD_AUDIO_FORK_TCP_KEEPALIVE_SECS");
  static int nTcpKeepaliveSecs = requestedTcpKeepaliveSecs ? ::atoi(requestedTcpKeepaliveSecs) : 55;
}
|
||||
|
||||
// remove once we update to lws with this helper
/* Build an HTTP Basic Authorization header value ("Basic <base64(user:pw)>")
 * into buf (capacity len, always NUL-terminated on success).
 * Returns 0 on success, 1 if buf is too small for the encoded value,
 * 2 if "user:pw" does not fit the 128-byte scratch buffer. */
static int dch_lws_http_basic_auth_gen(const char *user, const char *pw, char *buf, size_t len) {
  size_t n = strlen(user), m = strlen(pw);
  char b[128];

  /* need 6 bytes for "Basic ", base64 expansion (4/3) of "user:pw", plus NUL */
  if (len < 6 + ((4 * (n + m + 1)) / 3) + 1)
    return 1;

  memcpy(buf, "Basic ", 6);

  n = lws_snprintf(b, sizeof(b), "%s:%s", user, pw);
  if (n >= sizeof(b) - 2)
    return 2;

  lws_b64_encode_string(b, n, buf + 6, len - 6);
  buf[len - 1] = '\0';

  return 0;
}
|
||||
|
||||
int AudioPipe::lws_callback(struct lws *wsi,
|
||||
enum lws_callback_reasons reason,
|
||||
void *user, void *in, size_t len) {
|
||||
|
||||
struct AudioPipe::lws_per_vhost_data *vhd =
|
||||
(struct AudioPipe::lws_per_vhost_data *) lws_protocol_vh_priv_get(lws_get_vhost(wsi), lws_get_protocol(wsi));
|
||||
|
||||
struct lws_vhost* vhost = lws_get_vhost(wsi);
|
||||
AudioPipe ** ppAp = (AudioPipe **) user;
|
||||
|
||||
switch (reason) {
|
||||
case LWS_CALLBACK_PROTOCOL_INIT:
|
||||
vhd = (struct AudioPipe::lws_per_vhost_data *) lws_protocol_vh_priv_zalloc(lws_get_vhost(wsi), lws_get_protocol(wsi), sizeof(struct AudioPipe::lws_per_vhost_data));
|
||||
vhd->context = lws_get_context(wsi);
|
||||
vhd->protocol = lws_get_protocol(wsi);
|
||||
vhd->vhost = lws_get_vhost(wsi);
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_APPEND_HANDSHAKE_HEADER:
|
||||
{
|
||||
AudioPipe* ap = findPendingConnect(wsi);
|
||||
if (ap && ap->hasBasicAuth()) {
|
||||
unsigned char **p = (unsigned char **)in, *end = (*p) + len;
|
||||
char b[128];
|
||||
std::string username, password;
|
||||
|
||||
ap->getBasicAuth(username, password);
|
||||
lwsl_notice("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_APPEND_HANDSHAKE_HEADER username: %s, password: xxxxxx\n", username.c_str());
|
||||
if (dch_lws_http_basic_auth_gen(username.c_str(), password.c_str(), b, sizeof(b))) break;
|
||||
if (lws_add_http_header_by_token(wsi, WSI_TOKEN_HTTP_AUTHORIZATION, (unsigned char *)b, strlen(b), p, end)) return -1;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_EVENT_WAIT_CANCELLED:
|
||||
processPendingConnects(vhd);
|
||||
processPendingDisconnects(vhd);
|
||||
processPendingWrites();
|
||||
break;
|
||||
case LWS_CALLBACK_CLIENT_CONNECTION_ERROR:
|
||||
{
|
||||
AudioPipe* ap = findAndRemovePendingConnect(wsi);
|
||||
int rc = lws_http_client_http_response(wsi);
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CONNECTION_ERROR: %s, response status %d\n", in ? (char *)in : "(null)", rc);
|
||||
if (ap) {
|
||||
ap->m_state = LWS_CLIENT_FAILED;
|
||||
ap->m_callback(ap->m_uuid.c_str(), ap->m_bugname.c_str(), AudioPipe::CONNECT_FAIL, (char *) in);
|
||||
}
|
||||
else {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CONNECTION_ERROR unable to find wsi %p..\n", wsi);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_ESTABLISHED:
|
||||
{
|
||||
AudioPipe* ap = findAndRemovePendingConnect(wsi);
|
||||
if (ap) {
|
||||
*ppAp = ap;
|
||||
ap->m_vhd = vhd;
|
||||
ap->m_state = LWS_CLIENT_CONNECTED;
|
||||
ap->m_callback(ap->m_uuid.c_str(), ap->m_bugname.c_str(), AudioPipe::CONNECT_SUCCESS, NULL);
|
||||
}
|
||||
else {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_ESTABLISHED %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case LWS_CALLBACK_CLIENT_CLOSED:
|
||||
{
|
||||
AudioPipe* ap = *ppAp;
|
||||
if (!ap) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CLOSED %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
return 0;
|
||||
}
|
||||
if (ap->m_state == LWS_CLIENT_DISCONNECTING) {
|
||||
// closed by us
|
||||
ap->m_callback(ap->m_uuid.c_str(), ap->m_bugname.c_str(), AudioPipe::CONNECTION_CLOSED_GRACEFULLY, NULL);
|
||||
}
|
||||
else if (ap->m_state == LWS_CLIENT_CONNECTED) {
|
||||
// closed by far end
|
||||
lwsl_notice("%s socket closed by far end\n", ap->m_uuid.c_str());
|
||||
ap->m_callback(ap->m_uuid.c_str(), ap->m_bugname.c_str(), AudioPipe::CONNECTION_DROPPED, NULL);
|
||||
}
|
||||
ap->m_state = LWS_CLIENT_DISCONNECTED;
|
||||
|
||||
//NB: after receiving any of the events above, any holder of a
|
||||
//pointer or reference to this object must treat is as no longer valid
|
||||
|
||||
*ppAp = NULL;
|
||||
delete ap;
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_RECEIVE:
|
||||
{
|
||||
AudioPipe* ap = *ppAp;
|
||||
if (!ap) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (lws_frame_is_binary(wsi)) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE received binary frame, discarding.\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (lws_is_first_fragment(wsi)) {
|
||||
// allocate a buffer for the entire chunk of memory needed
|
||||
assert(nullptr == ap->m_recv_buf);
|
||||
ap->m_recv_buf_len = len + lws_remaining_packet_payload(wsi);
|
||||
ap->m_recv_buf = (uint8_t*) malloc(ap->m_recv_buf_len);
|
||||
ap->m_recv_buf_ptr = ap->m_recv_buf;
|
||||
}
|
||||
|
||||
size_t write_offset = ap->m_recv_buf_ptr - ap->m_recv_buf;
|
||||
size_t remaining_space = ap->m_recv_buf_len - write_offset;
|
||||
if (remaining_space < len) {
|
||||
lwsl_notice("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE buffer realloc needed.\n");
|
||||
size_t newlen = ap->m_recv_buf_len + RECV_BUF_REALLOC_SIZE;
|
||||
if (newlen > MAX_RECV_BUF_SIZE) {
|
||||
free(ap->m_recv_buf);
|
||||
ap->m_recv_buf = ap->m_recv_buf_ptr = nullptr;
|
||||
ap->m_recv_buf_len = 0;
|
||||
lwsl_notice("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE max buffer exceeded, truncating message.\n");
|
||||
}
|
||||
else {
|
||||
ap->m_recv_buf = (uint8_t*) realloc(ap->m_recv_buf, newlen);
|
||||
if (nullptr != ap->m_recv_buf) {
|
||||
ap->m_recv_buf_len = newlen;
|
||||
ap->m_recv_buf_ptr = ap->m_recv_buf + write_offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nullptr != ap->m_recv_buf) {
|
||||
if (len > 0) {
|
||||
memcpy(ap->m_recv_buf_ptr, in, len);
|
||||
ap->m_recv_buf_ptr += len;
|
||||
}
|
||||
if (lws_is_final_fragment(wsi)) {
|
||||
if (nullptr != ap->m_recv_buf) {
|
||||
std::string msg((char *)ap->m_recv_buf, ap->m_recv_buf_ptr - ap->m_recv_buf);
|
||||
ap->m_callback(ap->m_uuid.c_str(), ap->m_bugname.c_str(), AudioPipe::MESSAGE, msg.c_str());
|
||||
if (nullptr != ap->m_recv_buf) free(ap->m_recv_buf);
|
||||
}
|
||||
ap->m_recv_buf = ap->m_recv_buf_ptr = nullptr;
|
||||
ap->m_recv_buf_len = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_WRITEABLE:
|
||||
{
|
||||
AudioPipe* ap = *ppAp;
|
||||
if (!ap) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_WRITEABLE %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// check for graceful close - send a zero length binary frame
|
||||
if (ap->isGracefulShutdown()) {
|
||||
lwsl_notice("%s graceful shutdown - sending zero length binary frame to flush any final responses\n", ap->m_uuid.c_str());
|
||||
std::lock_guard<std::mutex> lk(ap->m_audio_mutex);
|
||||
int sent = lws_write(wsi, (unsigned char *) ap->m_audio_buffer + LWS_PRE, 0, LWS_WRITE_BINARY);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// check for text frames to send
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(ap->m_text_mutex);
|
||||
if (ap->m_metadata.length() > 0) {
|
||||
uint8_t buf[ap->m_metadata.length() + LWS_PRE];
|
||||
memcpy(buf + LWS_PRE, ap->m_metadata.c_str(), ap->m_metadata.length());
|
||||
int n = ap->m_metadata.length();
|
||||
int m = lws_write(wsi, buf + LWS_PRE, n, LWS_WRITE_TEXT);
|
||||
ap->m_metadata.clear();
|
||||
if (m < n) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// there may be audio data, but only one write per writeable event
|
||||
// get it next time
|
||||
lws_callback_on_writable(wsi);
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (ap->m_state == LWS_CLIENT_DISCONNECTING) {
|
||||
lws_close_reason(wsi, LWS_CLOSE_STATUS_NORMAL, NULL, 0);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// check for audio packets
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(ap->m_audio_mutex);
|
||||
if (ap->m_audio_buffer_write_offset > LWS_PRE) {
|
||||
size_t datalen = ap->m_audio_buffer_write_offset - LWS_PRE;
|
||||
int sent = lws_write(wsi, (unsigned char *) ap->m_audio_buffer + LWS_PRE, datalen, LWS_WRITE_BINARY);
|
||||
if (sent < datalen) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_WRITEABLE %s attemped to send %lu only sent %d wsi %p..\n",
|
||||
ap->m_uuid.c_str(), datalen, sent, wsi);
|
||||
}
|
||||
ap->m_audio_buffer_write_offset = LWS_PRE;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return lws_callback_http_dummy(wsi, reason, user, in, len);
|
||||
}
|
||||
|
||||
|
||||
// static members

/* lws retry/validity policy: no automatic reconnects, validity pings
 * effectively disabled (UINT16_MAX). */
static const lws_retry_bo_t retry = {
  nullptr, // retry_ms_table
  0, // retry_ms_table_count
  0, // conceal_count
  UINT16_MAX, // secs_since_valid_ping
  UINT16_MAX, // secs_since_valid_hangup
  0 // jitter_percent
};

/* Process-wide lws state shared by all AudioPipe instances.  The pending*
 * lists are filled by freeswitch threads and drained on the lws service
 * thread (see LWS_CALLBACK_EVENT_WAIT_CANCELLED); each list is guarded by
 * its corresponding mutex. */
struct lws_context *AudioPipe::context = nullptr;
std::string AudioPipe::protocolName;
std::mutex AudioPipe::mutex_connects;
std::mutex AudioPipe::mutex_disconnects;
std::mutex AudioPipe::mutex_writes;
std::list<AudioPipe*> AudioPipe::pendingConnects;
std::list<AudioPipe*> AudioPipe::pendingDisconnects;
std::list<AudioPipe*> AudioPipe::pendingWrites;
AudioPipe::log_emit_function AudioPipe::logger;
std::mutex AudioPipe::mapMutex;
bool AudioPipe::stopFlag;
|
||||
|
||||
/* Runs on the lws service thread: promote every IDLE entry on the pending
 * connect list to CONNECTING, then initiate the client connections outside
 * the lock.  Entries stay on pendingConnects until the connect resolves. */
void AudioPipe::processPendingConnects(lws_per_vhost_data *vhd) {
  std::list<AudioPipe*> ready;
  {
    std::lock_guard<std::mutex> guard(mutex_connects);
    for (AudioPipe* ap : pendingConnects) {
      if (ap->m_state == LWS_CLIENT_IDLE) {
        ap->m_state = LWS_CLIENT_CONNECTING;
        ready.push_back(ap);
      }
    }
  }
  for (AudioPipe* ap : ready) {
    ap->connect_client(vhd);
  }
}
|
||||
|
||||
/* Runs on the lws service thread: collect entries marked DISCONNECTING,
 * clear the whole pending list, then request a writeable callback for each
 * (the actual close happens in LWS_CALLBACK_CLIENT_WRITEABLE). */
void AudioPipe::processPendingDisconnects(lws_per_vhost_data *vhd) {
  std::list<AudioPipe*> closing;
  {
    std::lock_guard<std::mutex> guard(mutex_disconnects);
    for (AudioPipe* ap : pendingDisconnects) {
      if (ap->m_state == LWS_CLIENT_DISCONNECTING) {
        closing.push_back(ap);
      }
    }
    pendingDisconnects.clear();
  }
  for (AudioPipe* ap : closing) {
    lws_callback_on_writable(ap->m_wsi);
  }
}
|
||||
|
||||
void AudioPipe::processPendingWrites() {
|
||||
std::list<AudioPipe*> writes;
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(mutex_writes);
|
||||
for (auto it = pendingWrites.begin(); it != pendingWrites.end(); ++it) {
|
||||
if ((*it)->m_state == LWS_CLIENT_CONNECTED) writes.push_back(*it);
|
||||
}
|
||||
pendingWrites.clear();
|
||||
}
|
||||
for (auto it = writes.begin(); it != writes.end(); ++it) {
|
||||
AudioPipe* ap = *it;
|
||||
lws_callback_on_writable(ap->m_wsi);
|
||||
}
|
||||
}
|
||||
|
||||
/* Find the CONNECTING pipe matching wsi, removing it from the pending list;
 * any orphaned entries (null wsi) encountered before the match are also
 * pruned.  Returns nullptr when no entry matches. */
AudioPipe* AudioPipe::findAndRemovePendingConnect(struct lws *wsi) {
  AudioPipe* found = NULL;
  std::lock_guard<std::mutex> guard(mutex_connects);
  std::list<AudioPipe* > orphans;

  auto it = pendingConnects.begin();
  while (it != pendingConnects.end() && !found) {
    AudioPipe* candidate = *it;

    if (candidate->m_wsi == nullptr) {
      orphans.push_back(candidate);
    }
    if (candidate->m_state == LWS_CLIENT_CONNECTING && candidate->m_wsi == wsi) {
      found = candidate;
    }
    ++it;
  }

  for (AudioPipe* dead : orphans) {
    pendingConnects.remove(dead);
  }
  if (found) {
    pendingConnects.remove(found);
  }

  return found;
}
|
||||
|
||||
/* Return the first CONNECTING pipe whose wsi matches, without removing it;
 * nullptr when none matches. */
AudioPipe* AudioPipe::findPendingConnect(struct lws *wsi) {
  std::lock_guard<std::mutex> guard(mutex_connects);

  for (AudioPipe* candidate : pendingConnects) {
    if (candidate->m_state == LWS_CLIENT_CONNECTING && candidate->m_wsi == wsi) {
      return candidate;
    }
  }
  return NULL;
}
|
||||
|
||||
// Queue a pipe for connection, then wake the lws event loop so the service
// thread picks up the request promptly.
void AudioPipe::addPendingConnect(AudioPipe* ap) {
  {
    std::lock_guard<std::mutex> lock(mutex_connects);
    pendingConnects.push_back(ap);
    size_t count = pendingConnects.size();
    lwsl_notice("%s after adding connect there are %lu pending connects\n",
      ap->m_uuid.c_str(), count);
  }
  lws_cancel_service(context);
}
|
||||
// Mark the pipe as disconnecting, queue it, and wake the event loop so the
// service thread can drive the close.
void AudioPipe::addPendingDisconnect(AudioPipe* ap) {
  ap->m_state = LWS_CLIENT_DISCONNECTING;
  {
    std::lock_guard<std::mutex> lock(mutex_disconnects);
    pendingDisconnects.push_back(ap);
    size_t count = pendingDisconnects.size();
    lwsl_notice("%s after adding disconnect there are %lu pending disconnects\n",
      ap->m_uuid.c_str(), count);
  }
  lws_cancel_service(ap->m_vhd->context);
}
|
||||
// Queue a pipe that has data (audio or metadata) to send and wake the
// event loop so the service thread schedules a writable callback.
void AudioPipe::addPendingWrite(AudioPipe* ap) {
  {
    std::lock_guard<std::mutex> lock(mutex_writes);
    pendingWrites.push_back(ap);
  }
  lws_cancel_service(ap->m_vhd->context);
}
|
||||
|
||||
// Body of the single lws service thread (see initialize()).  Creates the
// libwebsockets context, runs the event loop until stopFlag is set or
// lws_service fails, then destroys the context.  Returns false only if
// context creation fails.
bool AudioPipe::lws_service_thread() {
  struct lws_context_creation_info info;

  // single protocol table; lws requires this to remain valid for the
  // context's lifetime — it lives on this thread's stack, which outlives
  // the context since the context is destroyed before we return
  const struct lws_protocols protocols[] = {
    {
      protocolName.c_str(),
      AudioPipe::lws_callback,
      sizeof(void *),
      1024,
    },
    { NULL, NULL, 0, 0 }
  };

  memset(&info, 0, sizeof info);
  info.port = CONTEXT_PORT_NO_LISTEN;  // client-only context, no listening socket
  info.protocols = protocols;
  info.options = LWS_SERVER_OPTION_DO_SSL_GLOBAL_INIT;

  info.ka_time = nTcpKeepaliveSecs;          // tcp keep-alive timer
  info.ka_probes = 4;                        // number of times to try ka before closing connection
  info.ka_interval = 5;                      // time between ka's
  info.timeout_secs = 10;                    // doc says timeout for "various processes involving network roundtrips"
  info.keepalive_timeout = 5;                // seconds to allow remote client to hold on to an idle HTTP/1.1 connection
  info.timeout_secs_ah_idle = 10;            // secs to allow a client to hold an ah without using it
  info.retry_and_idle_policy = &retry;       // NOTE: 'retry' is a file-scope policy defined elsewhere

  lwsl_notice("AudioPipe::lws_service_thread creating context\n");

  context = lws_create_context(&info);
  if (!context) {
    lwsl_err("AudioPipe::lws_service_thread failed creating context\n");
    return false;
  }

  // event loop; lws_cancel_service() from other threads wakes it so
  // pending connects/disconnects/writes and stopFlag are observed
  int n;
  do {
    n = lws_service(context, 0);
  } while (n >= 0 && !stopFlag);

  lwsl_notice("AudioPipe::lws_service_thread ending\n");
  lws_context_destroy(context);

  return true;
}
|
||||
|
||||
void AudioPipe::initialize(const char* protocol, int loglevel, log_emit_function logger) {
|
||||
protocolName = protocol;
|
||||
lws_set_log_level(loglevel, logger);
|
||||
|
||||
lwsl_notice("AudioPipe::initialize starting\n");
|
||||
std::lock_guard<std::mutex> lock(mapMutex);
|
||||
std::thread t(&AudioPipe::lws_service_thread);
|
||||
stopFlag = false;
|
||||
t.detach();
|
||||
}
|
||||
|
||||
// Request shutdown of the lws service thread by raising stopFlag.
// NOTE(review): stopFlag is only checked after lws_service() returns, so the
// thread may not exit until the next event or service timeout wakes it —
// consider lws_cancel_service(context) here to wake it promptly (verify
// context lifetime vs. the detached thread first).
bool AudioPipe::deinitialize() {
  lwsl_notice("AudioPipe::deinitialize\n");
  std::lock_guard<std::mutex> lock(mapMutex);
  stopFlag = true;
  return true;
}
|
||||
|
||||
// instance members
|
||||
// Construct a client pipe for a websocket connection to host:port/path.
//   bufLen        - total audio staging buffer size (includes LWS_PRE headroom)
//   minFreespace  - minimum free bytes callers expect before buffering audio
//   username/password - optional HTTP basic-auth credentials (both required)
//   bugname       - media-bug name used to route notifications back to the session
//   callback      - notification handler invoked from the lws service thread
//
// Fixes: m_recv_buf_len and m_logger were previously left uninitialized
// (uninitialized-read risk); init list reordered to match the declaration
// order in audio_pipe.hpp; guard against a NULL bugname (std::string(NULL)
// is undefined behavior).
AudioPipe::AudioPipe(const char* uuid, const char* host, unsigned int port, const char* path,
  int sslFlags, size_t bufLen, size_t minFreespace, const char* username, const char* password, char* bugname, notifyHandler_t callback) :
  m_state(LWS_CLIENT_IDLE), m_uuid(uuid), m_host(host), m_bugname(bugname ? bugname : ""),
  m_port(port), m_path(path), m_sslFlags(sslFlags), m_wsi(nullptr),
  m_audio_buffer(nullptr), m_audio_buffer_max_len(bufLen), m_audio_buffer_write_offset(LWS_PRE),
  m_audio_buffer_min_freespace(minFreespace),
  m_recv_buf(nullptr), m_recv_buf_ptr(nullptr), m_recv_buf_len(0),
  m_vhd(nullptr), m_callback(callback), m_logger(nullptr), m_gracefulShutdown(false) {

  // basic auth is only enabled when both credentials are supplied
  if (username && password) {
    m_username.assign(username);
    m_password.assign(password);
  }

  m_audio_buffer = new uint8_t[m_audio_buffer_max_len];
}
|
||||
// Release the audio staging buffer and any receive reassembly buffer.
// delete[] on a null pointer is a no-op, so the previous `if (ptr)` guards
// were redundant and have been removed.
AudioPipe::~AudioPipe() {
  delete [] m_audio_buffer;
  delete [] m_recv_buf;
}
|
||||
|
||||
// Request an asynchronous connection: the pipe is queued and the actual
// lws connect (connect_client) happens later on the service thread.
void AudioPipe::connect(void) {
  addPendingConnect(this);
}
|
||||
|
||||
// Start the actual libwebsockets client connection for this pipe.
// Runs on the lws service thread; vhd is the per-vhost data created when
// the protocol initialized.  Returns true if lws accepted the connect
// attempt (the result arrives later via lws_callback).
bool AudioPipe::connect_client(struct lws_per_vhost_data *vhd) {
  assert(m_audio_buffer != nullptr);
  assert(m_vhd == nullptr);

  struct lws_client_connect_info i;

  memset(&i, 0, sizeof(i));
  i.context = vhd->context;
  i.port = m_port;
  i.address = m_host.c_str();
  i.path = m_path.c_str();
  i.host = i.address;
  i.origin = i.address;
  i.ssl_connection = m_sslFlags;
  i.protocol = protocolName.c_str();
  i.pwsi = &(m_wsi);  // lws also stores the wsi through this pointer

  // state must be set before the connect call: callbacks can fire
  // immediately and they match pipes by (state, wsi)
  m_state = LWS_CLIENT_CONNECTING;
  m_vhd = vhd;

  m_wsi = lws_client_connect_via_info(&i);
  lwsl_notice("%s attempting connection, wsi is %p\n", m_uuid.c_str(), m_wsi);

  return nullptr != m_wsi;
}
|
||||
|
||||
void AudioPipe::bufferForSending(const char* text) {
|
||||
if (m_state != LWS_CLIENT_CONNECTED) return;
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(m_text_mutex);
|
||||
m_metadata.append(text);
|
||||
}
|
||||
addPendingWrite(this);
|
||||
}
|
||||
|
||||
// Release the audio buffer lock; if the caller wrote audio past the LWS_PRE
// headroom while holding it, schedule a websocket write first (order matters:
// the pending-write must be queued before another thread can re-lock and
// reset the buffer).
void AudioPipe::unlockAudioBuffer() {
  if (m_audio_buffer_write_offset > LWS_PRE) addPendingWrite(this);
  m_audio_mutex.unlock();
}
|
||||
|
||||
// Request an asynchronous close; no-op unless currently connected.
// The actual teardown happens on the lws service thread.
void AudioPipe::close() {
  if (m_state != LWS_CLIENT_CONNECTED) return;
  addPendingDisconnect(this);
}
|
||||
|
||||
// Begin a graceful shutdown: flag the pipe and schedule a write so the
// service thread can flush remaining data and close cleanly (the flag is
// checked in the writable callback — see lws_callback).
void AudioPipe::do_graceful_shutdown() {
  m_gracefulShutdown = true;
  addPendingWrite(this);
}
|
||||
144
mod_audio_fork/audio_pipe.hpp
Normal file
144
mod_audio_fork/audio_pipe.hpp
Normal file
@@ -0,0 +1,144 @@
|
||||
#ifndef __AUDIO_PIPE_HPP__
|
||||
#define __AUDIO_PIPE_HPP__
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
#include <mutex>
|
||||
#include <queue>
|
||||
#include <unordered_map>
|
||||
#include <thread>
|
||||
|
||||
#include <libwebsockets.h>
|
||||
|
||||
// A client websocket connection that streams audio (binary frames) and
// JSON metadata (text frames) to a remote server.  All network I/O is
// driven by a single shared lws service thread (see lws_service_thread);
// producer threads hand work to it via the pending connect/disconnect/write
// queues and lws_cancel_service wakeups.
class AudioPipe {
public:
  // lifecycle of the client connection
  enum LwsState_t {
    LWS_CLIENT_IDLE,
    LWS_CLIENT_CONNECTING,
    LWS_CLIENT_CONNECTED,
    LWS_CLIENT_FAILED,
    LWS_CLIENT_DISCONNECTING,
    LWS_CLIENT_DISCONNECTED
  };
  // events delivered to the notifyHandler_t callback
  enum NotifyEvent_t {
    CONNECT_SUCCESS,
    CONNECT_FAIL,
    CONNECTION_DROPPED,
    CONNECTION_CLOSED_GRACEFULLY,
    MESSAGE
  };
  typedef void (*log_emit_function)(int level, const char *line);
  // callback invoked from the lws service thread with connection events
  // and incoming messages for the given session/bug
  typedef void (*notifyHandler_t)(const char *sessionId, const char* bugname, NotifyEvent_t event, const char* message);

  // per-vhost data handed to connect_client by the lws protocol callback
  struct lws_per_vhost_data {
    struct lws_context *context;
    struct lws_vhost *vhost;
    const struct lws_protocols *protocol;
  };

  // start the single lws service thread / request its shutdown
  static void initialize(const char* protocolName, int loglevel, log_emit_function logger);
  static bool deinitialize();
  static bool lws_service_thread();

  // constructor
  AudioPipe(const char* uuid, const char* host, unsigned int port, const char* path, int sslFlags,
    size_t bufLen, size_t minFreespace, const char* username, const char* password, char* bugname, notifyHandler_t callback);
  ~AudioPipe();

  LwsState_t getLwsState(void) { return m_state; }
  // queue an async connect (completes on the service thread)
  void connect(void);
  // queue a text/metadata payload for sending (connected pipes only)
  void bufferForSending(const char* text);
  // ---- audio staging buffer accessors; callers must hold lockAudioBuffer()
  size_t binarySpaceAvailable(void) {
    return m_audio_buffer_max_len - m_audio_buffer_write_offset;
  }
  size_t binaryMinSpace(void) {
    return m_audio_buffer_min_freespace;
  }
  char * binaryWritePtr(void) {
    return (char *) m_audio_buffer + m_audio_buffer_write_offset;
  }
  void binaryWritePtrAdd(size_t len) {
    m_audio_buffer_write_offset += len;
  }
  // NOTE: resets the offset to 0, i.e. before the LWS_PRE headroom;
  // the write path re-establishes the offset as needed
  void binaryWritePtrResetToZero(void) {
    m_audio_buffer_write_offset = 0;
  }
  void lockAudioBuffer(void) {
    m_audio_mutex.lock();
  }
  // releases the lock and schedules a send if audio was written
  void unlockAudioBuffer(void) ;
  bool hasBasicAuth(void) {
    return !m_username.empty() && !m_password.empty();
  }

  void getBasicAuth(std::string& username, std::string& password) {
    username = m_username;
    password = m_password;
  }

  // flush pending data then close cleanly
  void do_graceful_shutdown();
  bool isGracefulShutdown(void) {
    return m_gracefulShutdown;
  }

  // queue an async close (connected pipes only)
  void close() ;

  // no default constructor or copying
  AudioPipe() = delete;
  AudioPipe(const AudioPipe&) = delete;
  void operator=(const AudioPipe&) = delete;

private:

  static int lws_callback(struct lws *wsi, enum lws_callback_reasons reason, void *user, void *in, size_t len);
  static struct lws_context *context;
  static std::string protocolName;
  // each pending queue has its own lock; never hold two at once
  static std::mutex mutex_connects;
  static std::mutex mutex_disconnects;
  static std::mutex mutex_writes;
  static std::list<AudioPipe*> pendingConnects;
  static std::list<AudioPipe*> pendingDisconnects;
  static std::list<AudioPipe*> pendingWrites;
  static log_emit_function logger;

  static std::mutex mapMutex;
  // set by deinitialize(); checked by the service loop
  static bool stopFlag;

  static AudioPipe* findAndRemovePendingConnect(struct lws *wsi);
  static AudioPipe* findPendingConnect(struct lws *wsi);
  static void addPendingConnect(AudioPipe* ap);
  static void addPendingDisconnect(AudioPipe* ap);
  static void addPendingWrite(AudioPipe* ap);
  static void processPendingConnects(lws_per_vhost_data *vhd);
  static void processPendingDisconnects(lws_per_vhost_data *vhd);
  static void processPendingWrites(void);

  bool connect_client(struct lws_per_vhost_data *vhd);

  LwsState_t m_state;
  std::string m_uuid;                      // freeswitch session uuid
  std::string m_host;
  std::string m_bugname;                   // media-bug name for callback routing
  unsigned int m_port;
  std::string m_path;
  std::string m_metadata;                  // buffered outbound text, guarded by m_text_mutex
  std::mutex m_text_mutex;
  std::mutex m_audio_mutex;                // guards the audio staging buffer
  int m_sslFlags;
  struct lws *m_wsi;
  uint8_t *m_audio_buffer;                 // staging buffer; first LWS_PRE bytes are lws headroom
  size_t m_audio_buffer_max_len;
  size_t m_audio_buffer_write_offset;
  size_t m_audio_buffer_min_freespace;
  uint8_t* m_recv_buf;                     // reassembly buffer for fragmented incoming messages
  uint8_t* m_recv_buf_ptr;
  size_t m_recv_buf_len;
  struct lws_per_vhost_data* m_vhd;
  notifyHandler_t m_callback;
  log_emit_function m_logger;
  std::string m_username;                  // basic-auth credentials (optional)
  std::string m_password;
  bool m_gracefulShutdown;
};
|
||||
|
||||
#endif
|
||||
178
mod_audio_fork/base64.hpp
Normal file
178
mod_audio_fork/base64.hpp
Normal file
@@ -0,0 +1,178 @@
|
||||
/*
|
||||
******
|
||||
base64.hpp is a repackaging of the base64.cpp and base64.h files into a
|
||||
single header suitable for use as a header only library. This conversion was
|
||||
done by Peter Thorson (webmaster@zaphoyd.com) in 2012. All modifications to
|
||||
the code are redistributed under the same license as the original, which is
|
||||
listed below.
|
||||
******
|
||||
|
||||
base64.cpp and base64.h
|
||||
|
||||
Copyright (C) 2004-2008 René Nyffenegger
|
||||
|
||||
This source code is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the author be held liable for any damages
|
||||
arising from the use of this software.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
|
||||
1. The origin of this source code must not be misrepresented; you must not
|
||||
claim that you wrote the original source code. If you use this source code
|
||||
in a product, an acknowledgment in the product documentation would be
|
||||
appreciated but is not required.
|
||||
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original source code.
|
||||
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
|
||||
René Nyffenegger rene.nyffenegger@adp-gmbh.ch
|
||||
|
||||
*/
|
||||
|
||||
#ifndef _BASE64_HPP_
|
||||
#define _BASE64_HPP_
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace drachtio {
|
||||
|
||||
/// The 64-character alphabet of standard (non-URL-safe) base64.
static std::string const base64_chars =
             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
             "abcdefghijklmnopqrstuvwxyz"
             "0123456789+/";

/// Test whether a character is a valid base64 character.
/**
 * @param c The character to test
 * @return true if c is a valid base64 character
 */
static inline bool is_base64(unsigned char c) {
    return (c == 43 ||              // +
        (c >= 47 && c <= 57) ||     // /-9
        (c >= 65 && c <= 90) ||     // A-Z
        (c >= 97 && c <= 122));     // a-z
}

/// Encode a char buffer into a base64 string.
/**
 * @param input The input data
 * @param len The length of input in bytes
 * @return A base64 encoded string representing input
 */
inline std::string base64_encode(unsigned char const * input, size_t len) {
    std::string encoded;
    unsigned char triple[3];
    unsigned char quad[4];
    int pending = 0;

    // consume full 3-byte groups, emitting 4 output characters each
    while (len--) {
        triple[pending++] = *(input++);
        if (pending == 3) {
            quad[0] = (triple[0] & 0xfc) >> 2;
            quad[1] = ((triple[0] & 0x03) << 4) + ((triple[1] & 0xf0) >> 4);
            quad[2] = ((triple[1] & 0x0f) << 2) + ((triple[2] & 0xc0) >> 6);
            quad[3] = triple[2] & 0x3f;
            for (int k = 0; k < 4; k++) {
                encoded += base64_chars[quad[k]];
            }
            pending = 0;
        }
    }

    // handle the 1- or 2-byte tail: zero-pad, encode, then add '=' padding
    if (pending) {
        for (int k = pending; k < 3; k++) {
            triple[k] = '\0';
        }

        quad[0] = (triple[0] & 0xfc) >> 2;
        quad[1] = ((triple[0] & 0x03) << 4) + ((triple[1] & 0xf0) >> 4);
        quad[2] = ((triple[1] & 0x0f) << 2) + ((triple[2] & 0xc0) >> 6);
        quad[3] = triple[2] & 0x3f;

        for (int k = 0; k < pending + 1; k++) {
            encoded += base64_chars[quad[k]];
        }

        while (pending++ < 3) {
            encoded += '=';
        }
    }

    return encoded;
}

/// Encode a string into a base64 string.
/**
 * @param input The input data
 * @return A base64 encoded string representing input
 */
inline std::string base64_encode(std::string const & input) {
    return base64_encode(
        reinterpret_cast<const unsigned char *>(input.data()),
        input.size()
    );
}

/// Decode a base64 encoded string into a string of raw bytes.
/**
 * Decoding stops at the first '=' or at the first character that is not a
 * valid base64 character.
 * @param input The base64 encoded input data
 * @return A string representing the decoded raw bytes
 */
inline std::string base64_decode(std::string const & input) {
    size_t remaining = input.size();
    size_t pos = 0;
    int pending = 0;
    unsigned char quad[4], triple[3];
    std::string decoded;

    // consume full 4-character groups, emitting 3 raw bytes each
    while (remaining-- && input[pos] != '=' && is_base64(input[pos])) {
        quad[pending++] = input[pos]; pos++;
        if (pending == 4) {
            for (int k = 0; k < 4; k++) {
                quad[k] = static_cast<unsigned char>(base64_chars.find(quad[k]));
            }

            triple[0] = (quad[0] << 2) + ((quad[1] & 0x30) >> 4);
            triple[1] = ((quad[1] & 0xf) << 4) + ((quad[2] & 0x3c) >> 2);
            triple[2] = ((quad[2] & 0x3) << 6) + quad[3];

            for (int k = 0; k < 3; k++) {
                decoded += triple[k];
            }
            pending = 0;
        }
    }

    // handle a partial trailing group (padded input)
    if (pending) {
        int k;
        for (k = pending; k < 4; k++) {
            quad[k] = 0;
        }

        for (k = 0; k < 4; k++) {
            quad[k] = static_cast<unsigned char>(base64_chars.find(quad[k]));
        }

        triple[0] = (quad[0] << 2) + ((quad[1] & 0x30) >> 4);
        triple[1] = ((quad[1] & 0xf) << 4) + ((quad[2] & 0x3c) >> 2);
        triple[2] = ((quad[2] & 0x3) << 6) + quad[3];

        for (k = 0; k < pending - 1; k++) {
            decoded += static_cast<std::string::value_type>(triple[k]);
        }
    }

    return decoded;
}
|
||||
|
||||
} // namespace drachtio
|
||||
|
||||
#endif // _BASE64_HPP_
|
||||
619
mod_audio_fork/lws_glue.cpp
Normal file
619
mod_audio_fork/lws_glue.cpp
Normal file
@@ -0,0 +1,619 @@
|
||||
#include <switch.h>
|
||||
#include <switch_json.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <list>
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <regex>
|
||||
|
||||
#include "base64.hpp"
|
||||
#include "parser.hpp"
|
||||
#include "mod_audio_fork.h"
|
||||
#include "audio_pipe.hpp"
|
||||
|
||||
#define RTP_PACKETIZATION_PERIOD 20
|
||||
#define FRAME_SIZE_8000 320 /*which means each 20ms frame as 320 bytes at 8 khz (1 channel only)*/
|
||||
|
||||
namespace {
|
||||
// Audio buffer size in seconds, from MOD_AUDIO_FORK_BUFFER_SECS, clamped to [1,5]; default 2.
static const char *requestedBufferSecs = std::getenv("MOD_AUDIO_FORK_BUFFER_SECS");
static int nAudioBufferSecs = std::max(1, std::min(requestedBufferSecs ? ::atoi(requestedBufferSecs) : 2, 5));
// NOTE(review): service-thread count looks vestigial now that a single lws
// service thread is used — confirm nServiceThreads has no remaining readers.
static const char *requestedNumServiceThreads = std::getenv("MOD_AUDIO_FORK_SERVICE_THREADS");
// Websocket sub-protocol name; getenv is intentionally called twice (the
// returned pointer is stable for the process lifetime).
static const char* mySubProtocolName = std::getenv("MOD_AUDIO_FORK_SUBPROTOCOL_NAME") ?
  std::getenv("MOD_AUDIO_FORK_SUBPROTOCOL_NAME") : "audio.drachtio.org";
static unsigned int nServiceThreads = std::max(1, std::min(requestedNumServiceThreads ? ::atoi(requestedNumServiceThreads) : 1, 5));
// Monotonic per-module counters: call index for log correlation, and a
// suffix for temp audio files written by playAudio.
static unsigned int idxCallCount = 0;
static uint32_t playCount = 0;
|
||||
|
||||
// Handle a JSON message received from the far-end websocket server and
// dispatch it to the session's response handler based on its "type" field.
// Recognized types: playAudio, killAudio, transcription, transfer,
// disconnect, error, json.  For playAudio, the base64 audio content is
// written to a temp file (tracked for cleanup at session close) and the
// event carries the file path instead of the raw bytes.
//
// Fix: cJSON_GetObjectCstr returns NULL when "audioContentType" is absent;
// the original passed that NULL straight into strcmp (undefined behavior /
// crash).  Also: malloc results for the playout tracking node are now checked.
void processIncomingMessage(private_t* tech_pvt, switch_core_session_t* session, const char* message) {
  std::string msg = message;
  std::string type;
  cJSON* json = parse_json(session, msg, type) ;
  if (json) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%u) processIncomingMessage - received %s message\n", tech_pvt->id, type.c_str());
    cJSON* jsonData = cJSON_GetObjectItem(json, "data");
    if (0 == type.compare("playAudio")) {
      if (jsonData) {
        // dont send actual audio bytes in event message
        cJSON* jsonFile = NULL;
        cJSON* jsonAudio = cJSON_DetachItemFromObject(jsonData, "audioContent");
        int validAudio = (jsonAudio && NULL != jsonAudio->valuestring);

        const char* szAudioContentType = cJSON_GetObjectCstr(jsonData, "audioContentType");
        char fileType[6];
        int sampleRate = 16000;
        if (NULL == szAudioContentType) {
          // guard: property may be absent; strcmp(NULL, ...) would crash
          validAudio = 0;
          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%u) processIncomingMessage - missing audioContentType\n", tech_pvt->id);
        }
        else if (0 == strcmp(szAudioContentType, "raw")) {
          cJSON* jsonSR = cJSON_GetObjectItem(jsonData, "sampleRate");
          sampleRate = jsonSR && jsonSR->valueint ? jsonSR->valueint : 0;

          // map the raw sample rate to the freeswitch raw-file extension
          switch(sampleRate) {
            case 8000:
              strcpy(fileType, ".r8");
              break;
            case 16000:
              strcpy(fileType, ".r16");
              break;
            case 24000:
              strcpy(fileType, ".r24");
              break;
            case 32000:
              strcpy(fileType, ".r32");
              break;
            case 48000:
              strcpy(fileType, ".r48");
              break;
            case 64000:
              strcpy(fileType, ".r64");
              break;
            default:
              strcpy(fileType, ".r16");
              break;
          }
        }
        else if (0 == strcmp(szAudioContentType, "wave") || 0 == strcmp(szAudioContentType, "wav")) {
          strcpy(fileType, ".wav");
        }
        else {
          validAudio = 0;
          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%u) processIncomingMessage - unsupported audioContentType: %s\n", tech_pvt->id, szAudioContentType);
        }

        if (validAudio) {
          char szFilePath[256];

          std::string rawAudio = drachtio::base64_decode(jsonAudio->valuestring);
          switch_snprintf(szFilePath, 256, "%s%s%s_%d.tmp%s", SWITCH_GLOBAL_dirs.temp_dir,
            SWITCH_PATH_SEPARATOR, tech_pvt->sessionId, playCount++, fileType);
          std::ofstream f(szFilePath, std::ofstream::binary);
          f << rawAudio;
          f.close();

          // add the file to the list of files played for this session, we'll delete when session closes
          struct playout* playout = (struct playout *) malloc(sizeof(struct playout));
          if (playout) {
            playout->file = (char *) malloc(strlen(szFilePath) + 1);
            if (playout->file) {
              strcpy(playout->file, szFilePath);
              playout->next = tech_pvt->playout;
              tech_pvt->playout = playout;
            }
            else {
              free(playout);  // OOM: drop tracking; temp file will not be reaped
            }
          }

          jsonFile = cJSON_CreateString(szFilePath);
          cJSON_AddItemToObject(jsonData, "file", jsonFile);
        }

        char* jsonString = cJSON_PrintUnformatted(jsonData);
        tech_pvt->responseHandler(session, EVENT_PLAY_AUDIO, jsonString);
        free(jsonString);
        if (jsonAudio) cJSON_Delete(jsonAudio);
      }
      else {
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "(%u) processIncomingMessage - missing data payload in playAudio request\n", tech_pvt->id);
      }
    }
    else if (0 == type.compare("killAudio")) {
      tech_pvt->responseHandler(session, EVENT_KILL_AUDIO, NULL);

      // kill any current playback on the channel
      switch_channel_t *channel = switch_core_session_get_channel(session);
      switch_channel_set_flag_value(channel, CF_BREAK, 2);
    }
    else if (0 == type.compare("transcription")) {
      char* jsonString = cJSON_PrintUnformatted(jsonData);
      tech_pvt->responseHandler(session, EVENT_TRANSCRIPTION, jsonString);
      free(jsonString);
    }
    else if (0 == type.compare("transfer")) {
      char* jsonString = cJSON_PrintUnformatted(jsonData);
      tech_pvt->responseHandler(session, EVENT_TRANSFER, jsonString);
      free(jsonString);
    }
    else if (0 == type.compare("disconnect")) {
      char* jsonString = cJSON_PrintUnformatted(jsonData);
      tech_pvt->responseHandler(session, EVENT_DISCONNECT, jsonString);
      free(jsonString);
    }
    else if (0 == type.compare("error")) {
      char* jsonString = cJSON_PrintUnformatted(jsonData);
      tech_pvt->responseHandler(session, EVENT_ERROR, jsonString);
      free(jsonString);
    }
    else if (0 == type.compare("json")) {
      char* jsonString = cJSON_PrintUnformatted(json);
      tech_pvt->responseHandler(session, EVENT_JSON, jsonString);
      free(jsonString);
    }
    else {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "(%u) processIncomingMessage - unsupported msg type %s\n", tech_pvt->id, type.c_str());
    }
    cJSON_Delete(json);
  }
  else {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%u) processIncomingMessage - could not parse message: %s\n", tech_pvt->id, message);
  }
}
|
||||
|
||||
// AudioPipe notification handler, invoked from the lws service thread.
// Locates the freeswitch session and the media bug for this bugname, then
// translates AudioPipe events into module events for the response handler.
// Important: on CONNECT_FAIL / CONNECTION_DROPPED / CONNECTION_CLOSED_GRACEFULLY
// the AudioPipe is being (or has been) destroyed by the service thread, so
// tech_pvt->pAudioPipe must be cleared before anything else can touch it.
static void eventCallback(const char* sessionId, const char* bugname, AudioPipe::NotifyEvent_t event, const char* message) {
  switch_core_session_t* session = switch_core_session_locate(sessionId);
  if (session) {
    switch_channel_t *channel = switch_core_session_get_channel(session);
    switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);
    if (bug) {
      private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
      if (tech_pvt) {
        switch (event) {
          case AudioPipe::CONNECT_SUCCESS:
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "connection successful\n");
            tech_pvt->responseHandler(session, EVENT_CONNECT_SUCCESS, NULL);
            // send any metadata that was provided when the fork was started
            if (strlen(tech_pvt->initialMetadata) > 0) {
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "sending initial metadata %s\n", tech_pvt->initialMetadata);
              AudioPipe *pAudioPipe = static_cast<AudioPipe *>(tech_pvt->pAudioPipe);
              pAudioPipe->bufferForSending(tech_pvt->initialMetadata);
            }
            break;
          case AudioPipe::CONNECT_FAIL:
          {
            // first thing: we can no longer access the AudioPipe
            std::stringstream json;
            json << "{\"reason\":\"" << message << "\"}";
            tech_pvt->pAudioPipe = nullptr;
            tech_pvt->responseHandler(session, EVENT_CONNECT_FAIL, (char *) json.str().c_str());
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "connection failed: %s\n", message);
          }
          break;
          case AudioPipe::CONNECTION_DROPPED:
            // first thing: we can no longer access the AudioPipe
            tech_pvt->pAudioPipe = nullptr;
            tech_pvt->responseHandler(session, EVENT_DISCONNECT, NULL);
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "connection dropped from far end\n");
            break;
          case AudioPipe::CONNECTION_CLOSED_GRACEFULLY:
            // first thing: we can no longer access the AudioPipe
            tech_pvt->pAudioPipe = nullptr;
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection closed gracefully\n");
            break;
          case AudioPipe::MESSAGE:
            // incoming JSON from the far end; parsed and dispatched there
            processIncomingMessage(tech_pvt, session, message);
            break;
        }
      }
    }
    switch_core_session_rwunlock(session);
  }
}
|
||||
// Initialize the per-bug private data for an audio fork: zero the struct,
// capture connection parameters, create the AudioPipe, and set up a speex
// resampler if the channel's native rate differs from the requested rate.
// Returns SWITCH_STATUS_FALSE on allocation/resampler failure.
//
// NOTE(review): the strncpy calls below do not guarantee NUL-termination when
// the source is as long as the limit — whether that is safe depends on the
// (unseen) sizes of the private_t arrays vs. MAX_SESSION_ID/MAX_WS_URL_LEN/
// etc.; verify against mod_audio_fork.h.
// NOTE(review): if resampler init fails, the freshly created AudioPipe is not
// deleted here — confirm the caller reaps tech_pvt->pAudioPipe on failure.
switch_status_t fork_data_init(private_t *tech_pvt, switch_core_session_t *session, char * host,
  unsigned int port, char* path, int sslFlags, int sampling, int desiredSampling, int channels,
  char *bugname, char* metadata, responseHandler_t responseHandler) {

  const char* username = nullptr;
  const char* password = nullptr;
  int err;
  switch_codec_implementation_t read_impl;
  switch_channel_t *channel = switch_core_session_get_channel(session);

  switch_core_session_get_read_impl(session, &read_impl);

  // intentional assignment-in-condition: password is only looked up
  // when a username is configured on the channel
  if (username = switch_channel_get_variable(channel, "MOD_AUDIO_BASIC_AUTH_USERNAME")) {
    password = switch_channel_get_variable(channel, "MOD_AUDIO_BASIC_AUTH_PASSWORD");
  }

  memset(tech_pvt, 0, sizeof(private_t));

  strncpy(tech_pvt->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
  strncpy(tech_pvt->host, host, MAX_WS_URL_LEN);
  tech_pvt->port = port;
  strncpy(tech_pvt->path, path, MAX_PATH_LEN);
  tech_pvt->sampling = desiredSampling;
  tech_pvt->responseHandler = responseHandler;
  tech_pvt->playout = NULL;
  tech_pvt->channels = channels;
  tech_pvt->id = ++idxCallCount;
  tech_pvt->buffer_overrun_notified = 0;
  tech_pvt->audio_paused = 0;
  tech_pvt->graceful_shutdown = 0;
  strncpy(tech_pvt->bugname, bugname, MAX_BUG_LEN);
  if (metadata) strncpy(tech_pvt->initialMetadata, metadata, MAX_METADATA_LEN);

  // buffer sized for nAudioBufferSecs of audio at the desired rate,
  // plus LWS_PRE headroom required by libwebsockets
  size_t buflen = LWS_PRE + (FRAME_SIZE_8000 * desiredSampling / 8000 * channels * 1000 / RTP_PACKETIZATION_PERIOD * nAudioBufferSecs);

  AudioPipe* ap = new AudioPipe(tech_pvt->sessionId, host, port, path, sslFlags,
    buflen, read_impl.decoded_bytes_per_packet, username, password, bugname, eventCallback);
  if (!ap) {
    // NOTE(review): plain operator new throws rather than returning NULL,
    // so this branch is effectively dead — kept for safety
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error allocating AudioPipe\n");
    return SWITCH_STATUS_FALSE;
  }

  tech_pvt->pAudioPipe = static_cast<void *>(ap);

  switch_mutex_init(&tech_pvt->mutex, SWITCH_MUTEX_NESTED, switch_core_session_get_pool(session));

  if (desiredSampling != sampling) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) resampling from %u to %u\n", tech_pvt->id, sampling, desiredSampling);
    tech_pvt->resampler = speex_resampler_init(channels, sampling, desiredSampling, SWITCH_RESAMPLE_QUALITY, &err);
    if (0 != err) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing resampler: %s.\n", speex_resampler_strerror(err));
      return SWITCH_STATUS_FALSE;
    }
  }
  else {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) no resampling needed for this call\n", tech_pvt->id);
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) fork_data_init\n", tech_pvt->id);

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// Tear down the resources owned by the per-bug private data: the speex
// resampler and the session mutex.  Pointers are nulled after release so a
// double call is harmless.  (The AudioPipe itself is reaped elsewhere.)
void destroy_tech_pvt(private_t* tech_pvt) {
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s (%u) destroy_tech_pvt\n", tech_pvt->sessionId, tech_pvt->id);

  if (tech_pvt->mutex) {
    switch_mutex_destroy(tech_pvt->mutex);
    tech_pvt->mutex = nullptr;
  }

  if (tech_pvt->resampler) {
    speex_resampler_destroy(tech_pvt->resampler);
    tech_pvt->resampler = nullptr;
  }
}
|
||||
|
||||
void lws_logger(int level, const char *line) {
|
||||
switch_log_level_t llevel = SWITCH_LOG_DEBUG;
|
||||
|
||||
switch (level) {
|
||||
case LLL_ERR: llevel = SWITCH_LOG_ERROR; break;
|
||||
case LLL_WARN: llevel = SWITCH_LOG_WARNING; break;
|
||||
case LLL_NOTICE: llevel = SWITCH_LOG_NOTICE; break;
|
||||
case LLL_INFO: llevel = SWITCH_LOG_INFO; break;
|
||||
break;
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, llevel, "%s\n", line);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
// Parse a ws/wss/http/https URI into host, path, port and lws SSL flags,
// honoring the MOD_AUDIO_FORK_* channel variables that relax certificate
// checks.  Returns 1 on success, 0 on a malformed URI.
//
// Fixes: the local copy of the URI is now guaranteed NUL-terminated
// (strncpy with exactly sizeof left it unterminated for very long URIs,
// making the std::string/regex below read past the buffer); removed the
// unused locals `i` and `saveptr`; removed a stray double semicolon.
int parse_ws_uri(switch_channel_t *channel, const char* szServerUri, char* host, char *path, unsigned int* pPort, int* pSslFlags) {
  int offset;
  char server[MAX_WS_URL_LEN + MAX_PATH_LEN];
  int flags = LCCSCF_USE_SSL;

  if (switch_true(switch_channel_get_variable(channel, "MOD_AUDIO_FORK_ALLOW_SELFSIGNED"))) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "parse_ws_uri - allowing self-signed certs\n");
    flags |= LCCSCF_ALLOW_SELFSIGNED;
  }
  if (switch_true(switch_channel_get_variable(channel, "MOD_AUDIO_FORK_SKIP_SERVER_CERT_HOSTNAME_CHECK"))) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "parse_ws_uri - skipping hostname check\n");
    flags |= LCCSCF_SKIP_SERVER_CERT_HOSTNAME_CHECK;
  }
  if (switch_true(switch_channel_get_variable(channel, "MOD_AUDIO_FORK_ALLOW_EXPIRED"))) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "parse_ws_uri - allowing expired certs\n");
    flags |= LCCSCF_ALLOW_EXPIRED;
  }

  // get the scheme; force NUL-termination since strncpy does not guarantee it
  strncpy(server, szServerUri, sizeof(server) - 1);
  server[sizeof(server) - 1] = '\0';
  if (0 == strncmp(server, "https://", 8) || 0 == strncmp(server, "HTTPS://", 8)) {
    *pSslFlags = flags;
    offset = 8;
    *pPort = 443;
  }
  else if (0 == strncmp(server, "wss://", 6) || 0 == strncmp(server, "WSS://", 6)) {
    *pSslFlags = flags;
    offset = 6;
    *pPort = 443;
  }
  else if (0 == strncmp(server, "http://", 7) || 0 == strncmp(server, "HTTP://", 7)) {
    offset = 7;
    *pSslFlags = 0;
    *pPort = 80;
  }
  else if (0 == strncmp(server, "ws://", 5) || 0 == strncmp(server, "WS://", 5)) {
    offset = 5;
    *pSslFlags = 0;
    *pPort = 80;
  }
  else {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "parse_ws_uri - error parsing uri %s: invalid scheme\n", szServerUri);
    return 0;
  }

  // split the remainder into host, optional port, optional path
  std::string strHost(server + offset);
  std::regex re("^(.+?):?(\\d+)?(/.*)?$");
  std::smatch matches;
  if(std::regex_search(strHost, matches, re)) {
    strncpy(host, matches[1].str().c_str(), MAX_WS_URL_LEN);
    if (matches[2].str().length() > 0) {
      *pPort = atoi(matches[2].str().c_str());
    }
    if (matches[3].str().length() > 0) {
      strncpy(path, matches[3].str().c_str(), MAX_PATH_LEN);
    }
    else {
      strcpy(path, "/");
    }
  } else {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "parse_ws_uri - invalid format %s\n", strHost.c_str());
    return 0;
  }
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "parse_ws_uri - host %s, path %s\n", host, path);

  return 1;
}
|
||||
|
||||
// One-time module initialization: log the configured buffering and
// sub-protocol, then initialize the AudioPipe websocket layer.
switch_status_t fork_init() {
  int logs = LLL_ERR | LLL_WARN | LLL_NOTICE ;
  //LLL_INFO | LLL_PARSER | LLL_HEADER | LLL_EXT | LLL_CLIENT | LLL_LATENCY | LLL_DEBUG ;

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_audio_fork: audio buffer (in secs): %d secs\n", nAudioBufferSecs);
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_audio_fork: sub-protocol: %s\n", mySubProtocolName);

  AudioPipe::initialize(mySubProtocolName, logs, lws_logger);
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// Tear down the AudioPipe websocket layer.  Returns SUCCESS only when
// AudioPipe::deinitialize reports a clean shutdown.
switch_status_t fork_cleanup() {
  return AudioPipe::deinitialize() ? SWITCH_STATUS_SUCCESS : SWITCH_STATUS_FALSE;
}
|
||||
|
||||
// Allocate and initialize the per-session private data from the session's
// memory pool.  On success *ppUserData receives the private_t*; on failure
// any partially-initialized resources are released via destroy_tech_pvt.
switch_status_t fork_session_init(switch_core_session_t *session,
              responseHandler_t responseHandler,
              uint32_t samples_per_second,
              char *host,
              unsigned int port,
              char *path,
              int sampling,
              int sslFlags,
              int channels,
              char *bugname,
              char* metadata,
              void **ppUserData)
{
  private_t* tech_pvt = (private_t *) switch_core_session_alloc(session, sizeof(private_t));
  if (nullptr == tech_pvt) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "error allocating memory!\n");
    return SWITCH_STATUS_FALSE;
  }

  switch_status_t rc = fork_data_init(tech_pvt, session, host, port, path, sslFlags,
    samples_per_second, sampling, channels, bugname, metadata, responseHandler);
  if (SWITCH_STATUS_SUCCESS != rc) {
    destroy_tech_pvt(tech_pvt);
    return SWITCH_STATUS_FALSE;
  }

  *ppUserData = tech_pvt;
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// Kick off the (asynchronous) websocket connect for an initialized session.
// Fixes: previously dereferenced tech_pvt/pAudioPipe without a NULL check,
// crashing if called before fork_session_init succeeded.
switch_status_t fork_session_connect(void **ppUserData) {
  private_t *tech_pvt = static_cast<private_t *>(*ppUserData);
  if (nullptr == tech_pvt || nullptr == tech_pvt->pAudioPipe) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "fork_session_connect: no audio pipe\n");
    return SWITCH_STATUS_FALSE;
  }
  AudioPipe *pAudioPipe = static_cast<AudioPipe*>(tech_pvt->pAudioPipe);
  pAudioPipe->connect();
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/**
 * Detach the media bug, flush queued playout files, optionally send a final
 * text frame, close the websocket, and destroy the per-session data.
 *
 * Fixes:
 *  - tech_pvt->id was read before the NULL check on tech_pvt (NULL deref).
 *  - the mutex was destroyed (inside destroy_tech_pvt) while still locked;
 *    destroying a locked mutex is undefined behavior in pthreads — unlock first.
 *  - tech_pvt->playout left dangling after the list was freed.
 */
switch_status_t fork_session_cleanup(switch_core_session_t *session, char *bugname, char* text, int channelIsClosing) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);
  if (!bug) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "fork_session_cleanup: no bug %s - websocket conection already closed\n", bugname);
    return SWITCH_STATUS_FALSE;
  }
  private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
  if (!tech_pvt) return SWITCH_STATUS_FALSE;

  uint32_t id = tech_pvt->id;
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) fork_session_cleanup\n", id);

  AudioPipe *pAudioPipe = static_cast<AudioPipe *>(tech_pvt->pAudioPipe);

  switch_mutex_lock(tech_pvt->mutex);

  // re-fetch the bug now that we are under lock; another thread may have
  // already detached it
  {
    switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);
    if (bug) {
      switch_channel_set_private(channel, bugname, NULL);
      if (!channelIsClosing) {
        switch_core_media_bug_remove(session, &bug);
      }
    }
  }

  // delete any temp playout files and free the list
  struct playout* playout = tech_pvt->playout;
  while (playout) {
    std::remove(playout->file);
    free(playout->file);
    struct playout *tmp = playout;
    playout = playout->next;
    free(tmp);
  }
  tech_pvt->playout = nullptr;

  if (pAudioPipe && text) pAudioPipe->bufferForSending(text);
  if (pAudioPipe) pAudioPipe->close();

  // release the mutex before destroy_tech_pvt destroys it
  switch_mutex_unlock(tech_pvt->mutex);

  destroy_tech_pvt(tech_pvt);
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "(%u) fork_session_cleanup: connection closed\n", id);
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// Queue a text frame for transmission over the session's websocket.
// Returns FALSE when the bug (and thus the connection) is already gone.
switch_status_t fork_session_send_text(switch_core_session_t *session, char *bugname, char* text) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);

  if (nullptr == bug) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "fork_session_send_text failed because no bug\n");
    return SWITCH_STATUS_FALSE;
  }

  private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
  if (nullptr == tech_pvt) return SWITCH_STATUS_FALSE;

  AudioPipe *pAudioPipe = static_cast<AudioPipe *>(tech_pvt->pAudioPipe);
  if (pAudioPipe && text) pAudioPipe->bufferForSending(text);

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// Pause (pause != 0) or resume (pause == 0) streaming of audio frames.
// Flushes the bug's buffered audio so stale frames are not sent on resume.
switch_status_t fork_session_pauseresume(switch_core_session_t *session, char *bugname, int pause) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);

  if (nullptr == bug) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "fork_session_pauseresume failed because no bug\n");
    return SWITCH_STATUS_FALSE;
  }

  private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
  if (nullptr == tech_pvt) return SWITCH_STATUS_FALSE;

  switch_core_media_bug_flush(bug);
  tech_pvt->audio_paused = pause;
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// Flag the session for graceful shutdown (fork_frame stops sending audio)
// and ask the AudioPipe to drain and close cleanly.
switch_status_t fork_session_graceful_shutdown(switch_core_session_t *session, char *bugname) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);

  if (nullptr == bug) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "fork_session_graceful_shutdown failed because no bug\n");
    return SWITCH_STATUS_FALSE;
  }

  private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
  if (nullptr == tech_pvt) return SWITCH_STATUS_FALSE;

  tech_pvt->graceful_shutdown = 1;

  AudioPipe *pAudioPipe = static_cast<AudioPipe *>(tech_pvt->pAudioPipe);
  if (pAudioPipe) pAudioPipe->do_graceful_shutdown();

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/**
 * Media-bug read callback: copy (and, if needed, resample) incoming audio
 * frames into the AudioPipe's binary send buffer.
 *
 * Locking order: tech_pvt->mutex (trylock, so the RTP thread never blocks)
 * then the AudioPipe audio-buffer lock.  If the trylock fails the frame is
 * simply skipped.  Always returns SWITCH_TRUE so the bug stays attached.
 */
switch_bool_t fork_frame(switch_core_session_t *session, switch_media_bug_t *bug) {
  private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
  size_t inuse = 0;   // NOTE(review): unused
  bool dirty = false; // NOTE(review): set but never read
  char *p = (char *) "{\"msg\": \"buffer overrun\"}"; // NOTE(review): unused

  // drop frames while paused or draining for shutdown
  if (!tech_pvt || tech_pvt->audio_paused || tech_pvt->graceful_shutdown) return SWITCH_TRUE;

  if (switch_mutex_trylock(tech_pvt->mutex) == SWITCH_STATUS_SUCCESS) {
    if (!tech_pvt->pAudioPipe) {
      switch_mutex_unlock(tech_pvt->mutex);
      return SWITCH_TRUE;
    }
    AudioPipe *pAudioPipe = static_cast<AudioPipe *>(tech_pvt->pAudioPipe);
    // only buffer audio once the websocket is fully connected
    if (pAudioPipe->getLwsState() != AudioPipe::LWS_CLIENT_CONNECTED) {
      switch_mutex_unlock(tech_pvt->mutex);
      return SWITCH_TRUE;
    }

    pAudioPipe->lockAudioBuffer();
    size_t available = pAudioPipe->binarySpaceAvailable();
    if (NULL == tech_pvt->resampler) {
      // no resampling needed: read frames directly into the pipe's buffer
      switch_frame_t frame = { 0 };
      frame.data = pAudioPipe->binaryWritePtr();
      frame.buflen = available;
      while (true) {

        // check if buffer would be overwritten; dump packets if so
        if (available < pAudioPipe->binaryMinSpace()) {
          if (!tech_pvt->buffer_overrun_notified) {
            tech_pvt->buffer_overrun_notified = 1;
            tech_pvt->responseHandler(session, EVENT_BUFFER_OVERRUN, NULL);
          }
          switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%u) dropping packets!\n",
            tech_pvt->id);
          pAudioPipe->binaryWritePtrResetToZero();

          frame.data = pAudioPipe->binaryWritePtr();
          frame.buflen = available = pAudioPipe->binarySpaceAvailable();
        }

        switch_status_t rv = switch_core_media_bug_read(bug, &frame, SWITCH_TRUE);
        if (rv != SWITCH_STATUS_SUCCESS) break;
        if (frame.datalen) {
          // advance the pipe's write pointer past the bytes just read
          pAudioPipe->binaryWritePtrAdd(frame.datalen);
          frame.buflen = available = pAudioPipe->binarySpaceAvailable();
          frame.data = pAudioPipe->binaryWritePtr();
          dirty = true;
        }
      }
    }
    else {
      // resampling path: read into a scratch buffer, resample into the pipe
      uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
      switch_frame_t frame = { 0 };
      frame.data = data;
      frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;
      while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS) {
        if (frame.datalen) {
          spx_uint32_t out_len = available >> 1;  // space for samples which are 2 bytes
          spx_uint32_t in_len = frame.samples;

          speex_resampler_process_interleaved_int(tech_pvt->resampler,
            (const spx_int16_t *) frame.data,
            (spx_uint32_t *) &in_len,
            (spx_int16_t *) ((char *) pAudioPipe->binaryWritePtr()),
            &out_len);

          if (out_len > 0) {
            // bytes written = num samples * 2 * num channels
            // NOTE(review): `out_len << channels` equals 2*out_len*channels only
            // for channels == 1 or 2 — confirm no other channel counts occur
            size_t bytes_written = out_len << tech_pvt->channels;
            pAudioPipe->binaryWritePtrAdd(bytes_written);
            available = pAudioPipe->binarySpaceAvailable();
            dirty = true;
          }
          if (available < pAudioPipe->binaryMinSpace()) {
            // out of space: notify once per session, then stop reading
            if (!tech_pvt->buffer_overrun_notified) {
              tech_pvt->buffer_overrun_notified = 1;
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%u) dropping packets!\n",
                tech_pvt->id);
              tech_pvt->responseHandler(session, EVENT_BUFFER_OVERRUN, NULL);
            }
            break;
          }
        }
      }
    }

    pAudioPipe->unlockAudioBuffer();
    switch_mutex_unlock(tech_pvt->mutex);
  }
  return SWITCH_TRUE;
}
|
||||
|
||||
}
|
||||
|
||||
20
mod_audio_fork/lws_glue.h
Normal file
20
mod_audio_fork/lws_glue.h
Normal file
@@ -0,0 +1,20 @@
|
||||
#ifndef __LWS_GLUE_H__
|
||||
#define __LWS_GLUE_H__
|
||||
|
||||
#include "mod_audio_fork.h"
|
||||
|
||||
int parse_ws_uri(switch_channel_t *channel, const char* szServerUri, char* host, char *path, unsigned int* pPort, int* pSslFlags);
|
||||
|
||||
switch_status_t fork_init();
|
||||
switch_status_t fork_cleanup();
|
||||
switch_status_t fork_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
|
||||
uint32_t samples_per_second, char *host, unsigned int port, char* path, int sampling, int sslFlags, int channels,
|
||||
char *bugname, char* metadata, void **ppUserData);
|
||||
switch_status_t fork_session_cleanup(switch_core_session_t *session, char *bugname, char* text, int channelIsClosing);
|
||||
switch_status_t fork_session_pauseresume(switch_core_session_t *session, char *bugname, int pause);
|
||||
switch_status_t fork_session_graceful_shutdown(switch_core_session_t *session, char *bugname);
|
||||
switch_status_t fork_session_send_text(switch_core_session_t *session, char *bugname, char* text);
|
||||
switch_bool_t fork_frame(switch_core_session_t *session, switch_media_bug_t *bug);
|
||||
switch_status_t fork_service_threads();
|
||||
switch_status_t fork_session_connect(void **ppUserData);
|
||||
#endif
|
||||
359
mod_audio_fork/mod_audio_fork.c
Normal file
359
mod_audio_fork/mod_audio_fork.c
Normal file
@@ -0,0 +1,359 @@
|
||||
/*
|
||||
*
|
||||
* mod_audio_fork.c -- Freeswitch module for forking audio to remote server over websockets
|
||||
*
|
||||
*/
|
||||
#include "mod_audio_fork.h"
|
||||
#include "lws_glue.h"
|
||||
|
||||
//static int mod_running = 0;
|
||||
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_audio_fork_shutdown);
|
||||
SWITCH_MODULE_RUNTIME_FUNCTION(mod_audio_fork_runtime);
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_audio_fork_load);
|
||||
|
||||
SWITCH_MODULE_DEFINITION(mod_audio_fork, mod_audio_fork_load, mod_audio_fork_shutdown, NULL /*mod_audio_fork_runtime*/);
|
||||
|
||||
/* Deliver a module event (optionally with a JSON body) to the FreeSWITCH
 * event system as a CUSTOM subclass event on the session's channel.
 * Fix: the return value of switch_event_create_subclass was ignored; on
 * failure the NULL event was passed to switch_channel_event_set_data. */
static void responseHandler(switch_core_session_t* session, const char * eventName, char * json) {
  switch_event_t *event;

  switch_channel_t *channel = switch_core_session_get_channel(session);
  if (json) switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "responseHandler: sending event payload: %s.\n", json);
  if (switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, eventName) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "responseHandler: failed creating event %s.\n", eventName);
    return;
  }
  switch_channel_event_set_data(channel, event);
  if (json) switch_event_add_body(event, "%s", json);
  switch_event_fire(&event);
}
|
||||
|
||||
/* Media-bug lifecycle callback: forwards READ frames to fork_frame and
 * tears the session down on CLOSE.  Returning SWITCH_TRUE keeps the bug. */
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
{
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);

  switch (type) {
  case SWITCH_ABC_TYPE_CLOSE:
    {
      private_t* tech_pvt = (private_t *) switch_core_media_bug_get_user_data(bug);
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "Got SWITCH_ABC_TYPE_CLOSE for bug %s\n", tech_pvt->bugname);
      /* channelIsClosing = 1: the bug is being torn down by the core */
      fork_session_cleanup(session, tech_pvt->bugname, NULL, 1);
    }
    break;

  case SWITCH_ABC_TYPE_READ:
    return fork_frame(session, bug);

  case SWITCH_ABC_TYPE_INIT:
  case SWITCH_ABC_TYPE_WRITE:
  default:
    break;
  }

  return SWITCH_TRUE;
}
|
||||
|
||||
/* Attach a media bug to the session and initiate the websocket connection.
 *
 * Flow: validate channel state -> fork_session_init (allocates per-session
 * data) -> switch_core_media_bug_add (registers capture_callback) -> stash
 * the bug as channel private data -> fork_session_connect (async connect).
 * Returns SWITCH_STATUS_SUCCESS on success, SWITCH_STATUS_FALSE otherwise. */
static switch_status_t start_capture(switch_core_session_t *session,
              switch_media_bug_flag_t flags,
              char* host,
              unsigned int port,
              char* path,
              int sampling,
              int sslFlags,
              char* bugname,
              char* metadata)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug;
  switch_status_t status;
  switch_codec_t* read_codec;

  void *pUserData = NULL;
  /* SMBF_STEREO means both legs are captured as separate channels */
  int channels = (flags & SMBF_STEREO) ? 2 : 1;

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO,
    "mod_audio_fork (%s): streaming %d sampling to %s path %s port %d tls: %s.\n",
    bugname, sampling, host, path, port, sslFlags ? "yes" : "no");

  /* only one bug per bugname per channel */
  if (switch_channel_get_private(channel, bugname)) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "mod_audio_fork: bug %s already attached!\n", bugname);
    return SWITCH_STATUS_FALSE;
  }

  read_codec = switch_core_session_get_read_codec(session);

  /* media must be set up so the read codec's sample rate is known */
  if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "mod_audio_fork: channel must have reached pre-answer status before calling start!\n");
    return SWITCH_STATUS_FALSE;
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "calling fork_session_init.\n");
  if (SWITCH_STATUS_FALSE == fork_session_init(session, responseHandler, read_codec->implementation->actual_samples_per_second,
    host, port, path, sampling, sslFlags, channels, bugname, metadata, &pUserData)) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing mod_audio_fork session.\n");
    return SWITCH_STATUS_FALSE;
  }
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "adding bug %s.\n", bugname);
  /* NOTE(review): on failure here the data created by fork_session_init is
   * not released (pool memory is reclaimed with the session, but the mutex
   * and resampler leak until then) — confirm and consider cleanup */
  if ((status = switch_core_media_bug_add(session, bugname, NULL, capture_callback, pUserData, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
    return status;
  }
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "setting bug private data %s.\n", bugname);
  switch_channel_set_private(channel, bugname, bug);

  if (fork_session_connect(&pUserData) != SWITCH_STATUS_SUCCESS) {
    /* NOTE(review): the bug remains attached on this path — confirm whether
     * fork_session_cleanup should be invoked here */
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error mod_audio_fork session cannot connect.\n");
    return SWITCH_STATUS_FALSE;
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "exiting start_capture.\n");
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/* Stop forking on the session, optionally transmitting a final text frame
 * before the websocket is closed. */
static switch_status_t do_stop(switch_core_session_t *session, char* bugname, char* text)
{
  if (text) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "mod_audio_fork (%s): stop w/ final text %s\n", bugname, text);
  }
  else {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "mod_audio_fork (%s): stop\n", bugname);
  }
  return fork_session_cleanup(session, bugname, text, 0);
}
|
||||
|
||||
/* Pause (pause != 0) or resume (pause == 0) audio forking on the session. */
static switch_status_t do_pauseresume(switch_core_session_t *session, char* bugname, int pause)
{
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "mod_audio_fork (%s): %s\n", bugname, pause ? "pause" : "resume");
  return fork_session_pauseresume(session, bugname, pause);
}
|
||||
|
||||
/* Request a graceful shutdown: stop sending audio and let the websocket
 * drain and close cleanly. */
static switch_status_t do_graceful_shutdown(switch_core_session_t *session, char* bugname)
{
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "mod_audio_fork (%s): do_graceful_shutdown \n", bugname);
  return fork_session_graceful_shutdown(session, bugname);
}
|
||||
|
||||
/* Send an application text frame over the session's websocket; fails with
 * SWITCH_STATUS_FALSE if the named bug is not attached. */
static switch_status_t send_text(switch_core_session_t *session, char* bugname, char* text) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = switch_channel_get_private(channel, bugname);
  switch_status_t status = SWITCH_STATUS_FALSE;

  if (!bug) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "mod_audio_fork (%s): no bug, failed sending text: %s.\n", bugname, text);
    return status;
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "mod_audio_fork (%s): sending text: %s.\n", bugname, text);
  status = fork_session_send_text(session, bugname, text);
  return status;
}
|
||||
|
||||
#define FORK_API_SYNTAX "<uuid> [start | stop | send_text | pause | resume | graceful-shutdown ] [wss-url | path] [mono | mixed | stereo] [8000 | 16000 | 24000 | 32000 | 64000] [bugname] [metadata]"
|
||||
SWITCH_STANDARD_API(fork_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[7] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
char *bugname = MY_BUG_NAME;
|
||||
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
assert(cmd);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "mod_audio_fork cmd: %s\n", cmd);
|
||||
|
||||
|
||||
if (zstr(cmd) || argc < 2 ||
|
||||
(0 == strcmp(argv[1], "start") && argc < 4)) {
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", FORK_API_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
if (!strcasecmp(argv[1], "stop")) {
|
||||
char * text = NULL;
|
||||
if (argc > 3) {
|
||||
bugname = argv[2];
|
||||
text = argv[3];
|
||||
}
|
||||
else if (argc > 2) {
|
||||
if (argv[2][0] == '{' || argv[2][0] == '[') text = argv[2];
|
||||
else bugname = argv[2];
|
||||
}
|
||||
status = do_stop(lsession, bugname, text);
|
||||
}
|
||||
else if (!strcasecmp(argv[1], "pause")) {
|
||||
if (argc > 2) bugname = argv[2];
|
||||
status = do_pauseresume(lsession, bugname, 1);
|
||||
}
|
||||
else if (!strcasecmp(argv[1], "resume")) {
|
||||
if (argc > 2) bugname = argv[2];
|
||||
status = do_pauseresume(lsession, bugname, 0);
|
||||
}
|
||||
else if (!strcasecmp(argv[1], "graceful-shutdown")) {
|
||||
if (argc > 2) bugname = argv[2];
|
||||
status = do_graceful_shutdown(lsession, bugname);
|
||||
}
|
||||
else if (!strcasecmp(argv[1], "send_text")) {
|
||||
char * text = 0;
|
||||
if (argc < 3) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "send_text requires an argument specifying text to send\n");
|
||||
switch_core_session_rwunlock(lsession);
|
||||
goto done;
|
||||
}
|
||||
if (argc > 3) {
|
||||
bugname = argv[2];
|
||||
text = argv[3];
|
||||
}
|
||||
else {
|
||||
if (argv[2][0] == '{' || argv[2][0] == '[') text = argv[2];
|
||||
else bugname = argv[2];
|
||||
}
|
||||
status = send_text(lsession, bugname, text);
|
||||
}
|
||||
else if (!strcasecmp(argv[1], "start")) {
|
||||
switch_channel_t *channel = switch_core_session_get_channel(lsession);
|
||||
char host[MAX_WS_URL_LEN], path[MAX_PATH_LEN];
|
||||
unsigned int port;
|
||||
int sslFlags;
|
||||
int sampling = 8000;
|
||||
switch_media_bug_flag_t flags = SMBF_READ_STREAM ;
|
||||
char *metadata = NULL;
|
||||
if( argc > 6) {
|
||||
bugname = argv[5];
|
||||
metadata = argv[6];
|
||||
}
|
||||
else if (argc > 5) {
|
||||
if (argv[5][0] == '{' || argv[5][0] == '[') metadata = argv[5];
|
||||
else bugname = argv[5];
|
||||
}
|
||||
if (0 == strcmp(argv[3], "mixed")) {
|
||||
flags |= SMBF_WRITE_STREAM ;
|
||||
}
|
||||
else if (0 == strcmp(argv[3], "stereo")) {
|
||||
flags |= SMBF_WRITE_STREAM ;
|
||||
flags |= SMBF_STEREO;
|
||||
}
|
||||
else if(0 != strcmp(argv[3], "mono")) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "invalid mix type: %s, must be mono, mixed, or stereo\n", argv[3]);
|
||||
switch_core_session_rwunlock(lsession);
|
||||
goto done;
|
||||
}
|
||||
if (0 == strcmp(argv[4], "16k")) {
|
||||
sampling = 16000;
|
||||
}
|
||||
else if (0 == strcmp(argv[4], "8k")) {
|
||||
sampling = 8000;
|
||||
}
|
||||
else {
|
||||
sampling = atoi(argv[4]);
|
||||
}
|
||||
if (!parse_ws_uri(channel, argv[2], &host[0], &path[0], &port, &sslFlags)) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "invalid websocket uri: %s\n", argv[2]);
|
||||
}
|
||||
else if (sampling % 8000 != 0) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "invalid sample rate: %s\n", argv[4]);
|
||||
}
|
||||
status = start_capture(lsession, flags, host, port, path, sampling, sslFlags, bugname, metadata);
|
||||
}
|
||||
else {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "unsupported mod_audio_fork cmd: %s\n", argv[1]);
|
||||
}
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
else {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error locating session %s\n", argv[0]);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_audio_fork_load)
|
||||
{
|
||||
switch_api_interface_t *api_interface;
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_audio_fork API loading..\n");
|
||||
|
||||
/* connect my internal structure to the blank pointer passed to me */
|
||||
*module_interface = switch_loadable_module_create_module_interface(pool, modname);
|
||||
|
||||
/* create/register custom event message types */
|
||||
if (switch_event_reserve_subclass(EVENT_TRANSCRIPTION) != SWITCH_STATUS_SUCCESS ||
|
||||
switch_event_reserve_subclass(EVENT_TRANSFER) != SWITCH_STATUS_SUCCESS ||
|
||||
switch_event_reserve_subclass(EVENT_PLAY_AUDIO) != SWITCH_STATUS_SUCCESS ||
|
||||
switch_event_reserve_subclass(EVENT_KILL_AUDIO) != SWITCH_STATUS_SUCCESS ||
|
||||
switch_event_reserve_subclass(EVENT_ERROR) != SWITCH_STATUS_SUCCESS ||
|
||||
switch_event_reserve_subclass(EVENT_DISCONNECT) != SWITCH_STATUS_SUCCESS) {
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register an event subclass for mod_audio_fork API.\n");
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
|
||||
SWITCH_ADD_API(api_interface, "uuid_audio_fork", "audio_fork API", fork_function, FORK_API_SYNTAX);
|
||||
switch_console_set_complete("add uuid_audio_fork start wss-url metadata");
|
||||
switch_console_set_complete("add uuid_audio_fork start wss-url");
|
||||
switch_console_set_complete("add uuid_audio_fork stop");
|
||||
|
||||
fork_init();
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_audio_fork API successfully loaded\n");
|
||||
|
||||
/* indicate that the module should continue to be loaded */
|
||||
//mod_running = 1;
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Called when the system shuts down
|
||||
Macro expands to: switch_status_t mod_audio_fork_shutdown() */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_audio_fork_shutdown)
|
||||
{
|
||||
fork_cleanup();
|
||||
//mod_running = 0;
|
||||
switch_event_free_subclass(EVENT_TRANSCRIPTION);
|
||||
switch_event_free_subclass(EVENT_TRANSFER);
|
||||
switch_event_free_subclass(EVENT_PLAY_AUDIO);
|
||||
switch_event_free_subclass(EVENT_KILL_AUDIO);
|
||||
switch_event_free_subclass(EVENT_DISCONNECT);
|
||||
switch_event_free_subclass(EVENT_ERROR);
|
||||
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
If it exists, this is called in it's own thread when the module-load completes
|
||||
If it returns anything but SWITCH_STATUS_TERM it will be called again automatically
|
||||
Macro expands to: switch_status_t mod_audio_fork_runtime()
|
||||
*/
|
||||
/*
|
||||
SWITCH_MODULE_RUNTIME_FUNCTION(mod_audio_fork_runtime)
|
||||
{
|
||||
fork_service_threads(&mod_running);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
*/
|
||||
59
mod_audio_fork/mod_audio_fork.h
Normal file
59
mod_audio_fork/mod_audio_fork.h
Normal file
@@ -0,0 +1,59 @@
|
||||
#ifndef __MOD_FORK_H__
|
||||
#define __MOD_FORK_H__
|
||||
|
||||
#include <switch.h>
|
||||
#include <libwebsockets.h>
|
||||
#include <speex/speex_resampler.h>
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
#define MY_BUG_NAME "audio_fork"
|
||||
#define MAX_BUG_LEN (64)
|
||||
#define MAX_SESSION_ID (256)
|
||||
#define MAX_WS_URL_LEN (512)
|
||||
#define MAX_PATH_LEN (4096)
|
||||
|
||||
#define EVENT_TRANSCRIPTION "mod_audio_fork::transcription"
|
||||
#define EVENT_TRANSFER "mod_audio_fork::transfer"
|
||||
#define EVENT_PLAY_AUDIO "mod_audio_fork::play_audio"
|
||||
#define EVENT_KILL_AUDIO "mod_audio_fork::kill_audio"
|
||||
#define EVENT_DISCONNECT "mod_audio_fork::disconnect"
|
||||
#define EVENT_ERROR "mod_audio_fork::error"
|
||||
#define EVENT_CONNECT_SUCCESS "mod_audio_fork::connect"
|
||||
#define EVENT_CONNECT_FAIL "mod_audio_fork::connect_failed"
|
||||
#define EVENT_BUFFER_OVERRUN "mod_audio_fork::buffer_overrun"
|
||||
#define EVENT_JSON "mod_audio_fork::json"
|
||||
|
||||
#define MAX_METADATA_LEN (8192)
|
||||
|
||||
// One queued playout item: a temp audio file scheduled for playback.
struct playout {
  char *file;            // heap-allocated path; freed (and file removed) in fork_session_cleanup
  struct playout* next;  // next queued item, or NULL
};

// Callback used to surface module events (transcripts, overruns, errors)
// into the FreeSWITCH event system.
typedef void (*responseHandler_t)(switch_core_session_t* session, const char* eventName, char* json);

// Per-session state attached as the media bug's user data.
struct private_data {
  switch_mutex_t *mutex;             // guards this struct across frame callbacks and cleanup
  char sessionId[MAX_SESSION_ID];    // owning session's uuid (used in log output)
  char bugname[MAX_BUG_LEN+1];       // name this media bug is registered under
  SpeexResamplerState *resampler;    // non-NULL when resampling is required; destroyed in destroy_tech_pvt
  responseHandler_t responseHandler; // event delivery callback
  void *pAudioPipe;                  // opaque AudioPipe* owning the websocket connection
  int ws_state;                      // NOTE(review): presumably mirrors websocket state — not read in this file
  char host[MAX_WS_URL_LEN];         // remote server host
  unsigned int port;                 // remote server port
  char path[MAX_PATH_LEN];           // request path on the remote server
  int sampling;                      // target sample rate (Hz)
  struct playout* playout;           // queued temp files; drained in fork_session_cleanup
  int channels;                      // audio channels (1 = mono/mixed, 2 = stereo)
  unsigned int id;                   // numeric id used in log messages
  // NOTE(review): signed 1-bit bitfields hold 0/-1; they are only tested for
  // truthiness here so this works, but unsigned bitfields would be clearer.
  int buffer_overrun_notified:1;     // set once EVENT_BUFFER_OVERRUN has been fired
  int audio_paused:1;                // when set, fork_frame drops incoming frames
  int graceful_shutdown:1;           // when set, fork_frame stops sending audio
  char initialMetadata[8192];        // NOTE(review): presumably metadata sent at connect — confirm in glue code
};

typedef struct private_data private_t;
|
||||
|
||||
#endif
|
||||
21
mod_audio_fork/parser.cpp
Normal file
21
mod_audio_fork/parser.cpp
Normal file
@@ -0,0 +1,21 @@
|
||||
#include "parser.hpp"
|
||||
#include <switch.h>
|
||||
|
||||
// Parse an incoming websocket text message as JSON and extract its "type"
// field into `type` ("json" when absent).  Returns the parsed cJSON object
// (caller owns/frees it) or NULL on parse failure.
cJSON* parse_json(switch_core_session_t* session, const std::string& data, std::string& type) {
  cJSON* json = cJSON_Parse(data.c_str());
  if (nullptr == json) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "parse - failed parsing incoming msg as JSON: %s\n", data.c_str());
    return NULL;
  }

  const char *szType = cJSON_GetObjectCstr(json, "type");
  type.assign(szType ? szType : "json");
  return json;
}
|
||||
9
mod_audio_fork/parser.hpp
Normal file
9
mod_audio_fork/parser.hpp
Normal file
@@ -0,0 +1,9 @@
|
||||
#ifndef __PARSER_H__
|
||||
#define __PARSER_H__
|
||||
|
||||
#include <string>
|
||||
#include <switch_json.h>
|
||||
|
||||
cJSON* parse_json(switch_core_session_t* session, const std::string& data, std::string& type) ;
|
||||
|
||||
#endif
|
||||
8
mod_aws_lex/LICENSE
Normal file
8
mod_aws_lex/LICENSE
Normal file
@@ -0,0 +1,8 @@
|
||||
Copyright 2023, Drachtio Communications Services, LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
10
mod_aws_lex/Makefile.am
Normal file
10
mod_aws_lex/Makefile.am
Normal file
@@ -0,0 +1,10 @@
|
||||
include $(top_srcdir)/build/modmake.rulesam
|
||||
MODNAME=mod_aws_lex
|
||||
|
||||
mod_LTLIBRARIES = mod_aws_lex.la
|
||||
mod_aws_lex_la_SOURCES = mod_aws_lex.c aws_lex_glue.cpp parser.cpp
|
||||
mod_aws_lex_la_CFLAGS = $(AM_CFLAGS)
|
||||
mod_aws_lex_la_CXXFLAGS = $(AM_CXXFLAGS) -std=c++11 -I${switch_srcdir}/libs/aws-sdk-cpp/aws-cpp-sdk-core/include -I${switch_srcdir}/libs/aws-sdk-cpp/aws-cpp-sdk-lexv2-runtime/include -I${switch_srcdir}/libs/aws-sdk-cpp/build/.deps/install/include
|
||||
|
||||
mod_aws_lex_la_LIBADD = $(switch_builddir)/libfreeswitch.la
|
||||
mod_aws_lex_la_LDFLAGS = -avoid-version -module -no-undefined -L${switch_srcdir}/libs/aws-sdk-cpp/build/.deps/install/lib -L${switch_srcdir}/libs/aws-sdk-cpp/build/aws-cpp-sdk-core -L${switch_srcdir}/libs/aws-sdk-cpp/build/aws-cpp-sdk-lexv2-runtime -laws-cpp-sdk-lexv2-runtime -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common -lpthread -lcurl -lcrypto -lssl -lz
|
||||
63
mod_aws_lex/README.md
Normal file
63
mod_aws_lex/README.md
Normal file
@@ -0,0 +1,63 @@
|
||||
# mod_aws_lex
|
||||
|
||||
A Freeswitch module that connects to [AWS Lex](https://docs.aws.amazon.com/lex/) using the streaming API.
|
||||
|
||||
Once a Freeswitch channel is connected to a Lex bot, media is streamed to Lex, which returns information describing the "intent" that was detected, along with transcriptions and audio prompts and text to play to the caller. The handling of returned audio by the module is two-fold:
|
||||
1. If an audio clip was returned, it is *not* immediately played to the caller, but instead is written to a temporary file on the Freeswitch server.
|
||||
2. Next, a Freeswitch custom event is sent to the application containing the details of the Lex response as well as the path to the audio file.
|
||||
|
||||
This allows the application to decide whether to play the returned audio clip (via the mod_dptools 'play' command), or to use a text-to-speech service to generate audio using the returned prompt text.
|
||||
|
||||
## API
|
||||
|
||||
### Commands
|
||||
The freeswitch module exposes the following API commands:
|
||||
|
||||
```
|
||||
aws_lex_start <uuid> botId aliasId region locale [welcome-intent]
|
||||
```
|
||||
Attaches media bug to channel and performs streaming recognize request.
|
||||
- `uuid` - freeswitch channel uuid
|
||||
- `bot` - name of Lex bot
|
||||
- `alias` - alias of Lex bot
|
||||
- `region` - AWS region name (e.g 'us-east-1')
|
||||
- `locale` - AWS language to use for speech recognition (e.g. 'en-US')
|
||||
- `welcome-intent` - name of intent to trigger initially
|
||||
|
||||
```
|
||||
aws_lex_dtmf <uuid> dtmf-entry
|
||||
```
|
||||
Notify Lex of a dtmf entry
|
||||
|
||||
```
|
||||
aws_lex_play_done <uuid>
|
||||
```
|
||||
Notify Lex that an audio prompt has completed playing. The application needs to call this if barge-in is enabled.
|
||||
```
|
||||
aws_lex_stop <uuid>
|
||||
```
|
||||
Stop Lex processing on the channel.
|
||||
|
||||
### Channel variables
|
||||
* `ACCESS_KEY_ID` - AWS access key id to use to authenticate; if not provided an environment variable of the same name is used if provided
|
||||
* `SECRET_ACCESS_KEY` - AWS secret access key to use to authenticate; if not provided an environment variable of the same name is used if provided
|
||||
* `LEX_WELCOME_MESSAGE` - text for a welcome message to play at audio start
|
||||
* `x-amz-lex:start-silence-threshold-ms` - no-input timeout in milliseconds (Lex defaults to 4000 if not provided)
|
||||
|
||||
### Events
|
||||
* `lex::intent` - an intent has been detected.
|
||||
* `lex::transcription` - a transcription has been returned
|
||||
* `lex::text_response` - a text response has been returned; the telephony application can play this using text-to-speech if desired.
|
||||
* `lex::audio_provided` - an audio response (.mp3 format) has been returned; the telephony application can play this file if TTS is not being used
|
||||
* `lex::text_response` - a text response was provided.
|
||||
* `lex::playback_interruption` - the caller has spoken during prompt playback; the telephony application should kill the current audio prompt
|
||||
* `lex::error` - Lex has returned an error
|
||||
## Usage
|
||||
When using [drachtio-fsrmf](https://www.npmjs.com/package/drachtio-fsmrf), you can access this API command via the api method on the 'endpoint' object.
|
||||
```js
|
||||
ep.api('aws_lex_start', `${ep.uuid} BookTrip Gamma us-east-1`);
|
||||
```
|
||||
|
||||
# Example application
|
||||
|
||||
See [drachtio-lex-gateway](https://github.com/drachtio/drachtio-lex-phone-gateway).
|
||||
829
mod_aws_lex/aws_lex_glue.cpp
Normal file
829
mod_aws_lex/aws_lex_glue.cpp
Normal file
@@ -0,0 +1,829 @@
|
||||
#include <cstdlib>
|
||||
|
||||
#include <switch.h>
|
||||
#include <switch_json.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <condition_variable>
|
||||
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <map>
|
||||
|
||||
#include <float.h>
|
||||
|
||||
#include <aws/core/Aws.h>
|
||||
#include <aws/core/auth/AWSCredentialsProvider.h>
|
||||
#include <aws/core/client/ClientConfiguration.h>
|
||||
#include <aws/core/utils/logging/DefaultLogSystem.h>
|
||||
#include <aws/core/utils/logging/AWSLogging.h>
|
||||
#include <aws/lexv2-runtime/LexRuntimeV2Client.h>
|
||||
#include <aws/lexv2-runtime/model/StartConversationRequest.h>
|
||||
|
||||
#include "mod_aws_lex.h"
|
||||
#include "parser.h"
|
||||
|
||||
using namespace Aws;
|
||||
using namespace Aws::Utils;
|
||||
using namespace Aws::Auth;
|
||||
using namespace Aws::LexRuntimeV2;
|
||||
using namespace Aws::LexRuntimeV2::Model;
|
||||
|
||||
|
||||
const char ALLOC_TAG[] = "drachtio";
|
||||
|
||||
static uint64_t playCount = 0;
|
||||
static std::multimap<std::string, std::string> audioFiles;
|
||||
static bool hasDefaultCredentials = false;
|
||||
static bool awsLoggingEnabled = false;
|
||||
static const char *endpointOverride = std::getenv("AWS_LEX_ENDPOINT_OVERRIDE");
|
||||
static std::vector<Aws::String> locales{"en_AU", "en_GB", "en_US", "fr_CA", "fr_FR", "es_ES", "es_US", "it_IT"};
|
||||
|
||||
/**
 * Channel state-change hook: once the channel reaches hangup (or routing),
 * delete every temp audio file recorded for this session in `audioFiles`
 * and deregister the hook itself.
 */
static switch_status_t hanguphook(switch_core_session_t *session) {
  switch_channel_t *chan = switch_core_session_get_channel(session);
  switch_channel_state_t st = switch_channel_get_state(chan);

  if (st != CS_HANGUP && st != CS_ROUTING) return SWITCH_STATUS_SUCCESS;

  const char *uuid = switch_core_session_get_uuid(session);
  auto range = audioFiles.equal_range(uuid);
  for (auto it = range.first; it != range.second; ++it) {
    const std::string& filename = it->second;
    std::remove(filename.c_str());
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG,
      "aws_lex_session_cleanup: removed audio file %s\n", filename.c_str());
  }
  audioFiles.erase(uuid);
  switch_core_event_hook_remove_state_change(session, hanguphook);

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/**
 * Parse the caller-supplied JSON metadata string into Lex inputs.
 *
 * Top-level members are interpreted as:
 *  - "slots":   object of slot-name -> value, used to pre-fill intent slots
 *  - "context": object passed verbatim to the bot as the
 *               x-amz-lex:channels:context session attribute (json format)
 *  - anything else: a session attribute (stringified)
 *
 * Returns true when at least one slot or attribute was produced.
 *
 * Fixes vs. original:
 *  - boolean/number attribute values were assigned directly to Aws::String,
 *    which invoked basic_string::operator=(char) via implicit conversion and
 *    stored a single garbage byte instead of "true"/"false"/the number text;
 *  - sprintf("%f") into a 16-byte buffer could overflow for large doubles;
 *    snprintf with a 64-byte buffer truncates safely instead.
 */
static bool parseMetadata(Aws::Map<Aws::String, Slot>& slots, Aws::Map<Aws::String, Aws::String>& attributes, char* metadata) {
  cJSON* json = cJSON_Parse(metadata);
  if (!json) return false;

  int numItems = cJSON_GetArraySize(json);
  for (int i = 0; i < numItems; i++) {
    cJSON* item = cJSON_GetArrayItem(json, i);
    if (0 == strcmp("slots", item->string)) {
      // pre-fill slots
      if (cJSON_Object == item->type) {
        int numSlots = cJSON_GetArraySize(item);
        for (int j = 0; j < numSlots; j++) {
          Slot slot;
          Value value;
          cJSON* jSlot = cJSON_GetArrayItem(item, j);
          switch (jSlot->type) {
            case cJSON_False:
              value.SetInterpretedValue("false");
              slot.SetValue(value);
              slots[jSlot->string] = slot;
              break;
            case cJSON_True:
              value.SetInterpretedValue("true");
              slot.SetValue(value);
              slots[jSlot->string] = slot;
              break;
            case cJSON_Number:
            {
              double d = jSlot->valuedouble;
              char scratch[64];
              // integral values are rendered without a decimal point
              if ((fabs(((double)jSlot->valueint) - d) <= DBL_EPSILON) && (d <= INT_MAX) && (d >= INT_MIN)) {
                snprintf(scratch, sizeof(scratch), "%d", jSlot->valueint);
              }
              else {
                snprintf(scratch, sizeof(scratch), "%f", jSlot->valuedouble);
              }
              value.SetInterpretedValue(scratch);
              slot.SetValue(value);
              slots[jSlot->string] = slot;
            }
              break;
            case cJSON_String:
              value.SetInterpretedValue(jSlot->valuestring);
              slot.SetValue(value);
              slots[jSlot->string] = slot;
              break;
            default:
              break;
          }
        }
      }
    }
    else if (0 == strcmp("context", item->string) && cJSON_Object == item->type) {
      char buf[4096];

      // special case: json string passed as x-amz-lex:channels:context to bot
      if (!cJSON_PrintPreallocated(item, buf, 4096, 0)) {
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "parse metadata fails due to excessive length\n");
      }
      else {
        attributes["x-amz-lex:channels:context-format"] = "json";
        attributes["x-amz-lex:channels:context"] = buf;
      }
    }
    else {
      // everything else becomes a (string-valued) session attribute
      switch (item->type) {
        case cJSON_False:
          attributes[item->string] = "false";
          break;
        case cJSON_True:
          attributes[item->string] = "true";
          break;
        case cJSON_Number:
        {
          double d = item->valuedouble;
          char scratch[64];
          if ((fabs(((double)item->valueint) - d) <= DBL_EPSILON) && (d <= INT_MAX) && (d >= INT_MIN)) {
            snprintf(scratch, sizeof(scratch), "%d", item->valueint);
          }
          else {
            snprintf(scratch, sizeof(scratch), "%f", d);
          }
          attributes[item->string] = scratch;
        }
          break;
        case cJSON_String:
          attributes[item->string] = item->valuestring;
          break;
        default:
          break;
      }
    }
  }
  size_t count = slots.size() + attributes.size();

  cJSON_Delete(json);

  return count > 0;
}
|
||||
|
||||
/**
 * GStreamer wraps one bidirectional StartConversation stream to AWS Lex V2
 * for a single freeswitch session.
 *
 * Lifecycle: the constructor registers all event-stream callbacks and kicks
 * off StartConversationAsync; OnStreamReady later publishes the writable
 * stream into m_pStream. A dedicated thread (lex_thread) sits in
 * processData() waiting on the condition variable; write()/dtmf() are called
 * from the media-bug thread. Shared state (m_finishing/m_finished/m_bPlayDone)
 * is guarded by m_mutex + m_cond.
 *
 * NOTE(review): dtmf() and write() dereference m_pStream without a null
 * check; a call arriving before OnStreamReady fires (or after shutdown nulls
 * it) would crash — confirm callers serialize against stream readiness.
 */
class GStreamer {
public:
  GStreamer(const char *sessionId,
    char* bot,
    char* alias,
    char* region,
    char *locale,
    char *intentName,
    char *metadata,
    const char* awsAccessKeyId,
    const char* awsSecretAccessKey,
    responseHandler_t responseHandler,
    errorHandler_t errorHandler) :
    m_bot(bot), m_alias(alias), m_region(region), m_sessionId(sessionId), m_finished(false), m_finishing(false), m_packets(0),
    m_pStream(nullptr), m_bPlayDone(false), m_bDiscardAudio(false)
  {
    Aws::String key(awsAccessKeyId);
    Aws::String secret(awsSecretAccessKey);
    Aws::String awsLocale(locale);
    Aws::Client::ClientConfiguration config;
    config.region = region;
    if (endpointOverride) config.endpointOverride = endpointOverride;

    // log only the first 4 chars of the access key, mask the rest
    // NOTE(review): assumes the key is at least 4 chars — confirm upstream validation
    char keySnippet[20];
    strncpy(keySnippet, awsAccessKeyId, 4);
    for (int i = 4; i < 20; i++) keySnippet[i] = 'x';
    keySnippet[19] = '\0';

    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p ACCESS_KEY_ID %s\n", this, keySnippet);
    // explicit credentials when both were supplied, else the SDK default chain
    if (*awsAccessKeyId && *awsSecretAccessKey) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "using AWS creds %s %s\n", awsAccessKeyId, awsSecretAccessKey);
      m_client = Aws::MakeUnique<LexRuntimeV2Client>(ALLOC_TAG, AWSCredentials(awsAccessKeyId, awsSecretAccessKey), config);
    }
    else {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "No AWS credentials so using default credentials\n");
      m_client = Aws::MakeUnique<LexRuntimeV2Client>(ALLOC_TAG, config);
    }

    // ---- event-stream callbacks (invoked on the SDK's network thread) ----

    m_handler.SetHeartbeatEventCallback([this](const HeartbeatEvent&)
    {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p recv heartbeat\n", this);
    });

    // caller spoke during playback: tell the app to kill the prompt
    m_handler.SetPlaybackInterruptionEventCallback([this, responseHandler](const PlaybackInterruptionEvent& ev)
    {
      switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
      if (psession) {
        cJSON* json = lex2Json(ev);
        char* data = cJSON_PrintUnformatted(json);

        responseHandler(psession, AWS_LEX_EVENT_PLAYBACK_INTERRUPTION, const_cast<char *>(data));

        free(data);
        cJSON_Delete(json);
        switch_core_session_rwunlock(psession);
      }
    });

    // partial/final transcription of caller speech
    m_handler.SetTranscriptEventCallback([this, responseHandler](const TranscriptEvent& ev)
    {
      switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
      if (psession) {
        cJSON* json = lex2Json(ev);
        char* data = cJSON_PrintUnformatted(json);

        responseHandler(psession, AWS_LEX_EVENT_TRANSCRIPTION, const_cast<char *>(data));

        free(data);
        cJSON_Delete(json);
        switch_core_session_rwunlock(psession);
      }
    });

    // text prompt the app may render via TTS
    m_handler.SetTextResponseEventCallback([this, responseHandler](const TextResponseEvent& ev){
      switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
      if (psession) {
        cJSON* json = lex2Json(ev);
        char* data = cJSON_PrintUnformatted(json);

        responseHandler(psession, AWS_LEX_EVENT_TEXT_RESPONSE, data);

        free(data);
        cJSON_Delete(json);
        switch_core_session_rwunlock(psession);
      }
    });

    // audio chunks: spooled into a temp .mp3; a zero-length chunk marks end of
    // the clip, at which point the file path is handed to the app
    m_handler.SetAudioResponseEventCallback([this, responseHandler](const AudioResponseEvent& ev){
      if (m_bDiscardAudio) return;

      const Aws::Utils::ByteBuffer& audio = ev.GetAudioChunk();
      uint32_t bytes = audio.GetLength();
      auto contentType = ev.GetContentType();
      auto eventId = ev.GetEventId();
      switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
      if (psession) {
        if (!m_f.is_open()) {
          if (0 == bytes) return;
          m_ostrCurrentPath.str("");
          m_ostrCurrentPath << SWITCH_GLOBAL_dirs.temp_dir << SWITCH_PATH_SEPARATOR << m_sessionId << "_" << ++playCount << ".mp3";
          switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "GStreamer %p: writing new audio file %s\n", this, m_ostrCurrentPath.str().c_str());
          m_f.open(m_ostrCurrentPath.str(), std::ofstream::binary);
          m_f.write((const char*) audio.GetUnderlyingData(), bytes);

          // add the file to the list of files played for this session, we'll delete when session closes
          audioFiles.insert(std::pair<std::string, std::string>(m_sessionId, m_ostrCurrentPath.str().c_str()));
        }
        else if (0 == bytes) {
          switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "GStreamer %p: closing audio file %s\n", this, m_ostrCurrentPath.str().c_str());
          m_f.flush();
          m_f.close();

          std::ostringstream s;
          s << "{\"path\": \"" << m_ostrCurrentPath.str() << "\"}";

          responseHandler(psession, AWS_LEX_EVENT_AUDIO_PROVIDED, const_cast<char *>(s.str().c_str()));
        }
        else {
          m_f.write((const char*) audio.GetUnderlyingData(), bytes);
        }
        switch_core_session_rwunlock(psession);
      }
    });

    // final intent result for the turn
    m_handler.SetIntentResultEventCallback([this, responseHandler](const IntentResultEvent& ev)
    {
      switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
      if (psession) {
        cJSON* json = lex2Json(ev);
        char* data = cJSON_PrintUnformatted(json);

        responseHandler(psession, AWS_LEX_EVENT_INTENT, data);

        free(data);
        cJSON_Delete(json);
        switch_core_session_rwunlock(psession);
      }
    });

    m_handler.SetOnErrorCallback([this, errorHandler](const Aws::Client::AWSError<LexRuntimeV2Errors>& err)
    {
      switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
      if (psession) {
        cJSON* json = lex2Json(err);
        char* data = cJSON_PrintUnformatted(json);

        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "GStreamer %p stream got error: %s\n", this, data);

        errorHandler(psession, data);

        free(data);
        cJSON_Delete(json);
        switch_core_session_rwunlock(psession);
      }
    });

    // Lex only accepts a fixed set of locales; fall back to en_US otherwise
    if (locales.end() == std::find(locales.begin(), locales.end(), awsLocale)) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p invalid locale %s provided, defaulting to en-US\n", this, locale);
      awsLocale = "en_US";
    }

    m_request.SetBotId(bot);
    m_request.SetBotAliasId(alias);
    m_request.SetSessionId(sessionId);
    m_request.SetEventStreamHandler(m_handler);
    m_request.SetLocaleId(awsLocale);

    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p sessionId %s, botId %s, alias %s, region %s, locale %s \n", this, sessionId, bot, alias, region, awsLocale.c_str());

    // invoked once the request stream is writable; sends the configuration
    // event (session attributes, optional initial intent or welcome message)
    auto OnStreamReady = [this, metadata, intentName](StartConversationRequestEventStream& stream)
    {
      switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
      if (psession) {
        switch_channel_t* channel = switch_core_session_get_channel(psession);
        Aws::Map<Aws::String, Aws::String> sessionAttributes;

        m_pStream = &stream;

        // check channel vars for lex session attributes
        bool bargein = false;
        const char* var;
        if (switch_channel_var_true(channel, "LEX_USE_TTS")) {
          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "GStreamer %p using tts so audio packets will be discarded\n", this);
          m_bDiscardAudio = true;
        }
        if (var = switch_channel_get_variable(channel, "x-amz-lex:audio:start-timeout-ms")) {
          sessionAttributes.insert({"x-amz-lex:audio:start-timeout-ms:*:*", var});
        }

        Aws::Map<Aws::String, Slot> slots;
        if (metadata) parseMetadata(slots, sessionAttributes, metadata);

        SessionState sessionState;
        sessionState.SetSessionAttributes(sessionAttributes);

        ConfigurationEvent configurationEvent;
        configurationEvent.SetResponseContentType("audio/mpeg");

        Intent intent;
        if (intentName && strlen(intentName) > 0) {
          // caller requested a specific initial intent, pre-filled with slots
          DialogAction dialogAction;

          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p setting initial intent to '%s'\n", this, intentName);
          intent.SetName(intentName);

          for (auto const& pair : slots) {
            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p setting slot %s\n", this, pair.first.c_str());
            intent.AddSlots(pair.first, pair.second);
          }

          sessionState.SetIntent(intent);
          dialogAction.SetType(DialogActionType::Delegate);
          sessionState.SetDialogAction(dialogAction);
        }
        else if (var = switch_channel_get_variable(channel, "LEX_WELCOME_MESSAGE")) {
          // one-shot welcome prompt played at conversation start
          Message message;
          DialogAction dialogAction;

          dialogAction.SetType(DialogActionType::ElicitIntent);
          sessionState.SetDialogAction(dialogAction);
          message.SetContent(var);
          message.SetContentType(MessageContentType::PlainText);
          configurationEvent.AddWelcomeMessages(message);

          // erase the channel var, so it is not reused in future intent
          switch_channel_set_variable(channel, "LEX_WELCOME_MESSAGE", nullptr);

          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p setting welcome message: %s\n", this, var);
        }
        configurationEvent.SetSessionState(sessionState);

        stream.WriteConfigurationEvent(configurationEvent);
        stream.flush();

        PlaybackCompletionEvent playbackCompletionEvent;
        stream.WritePlaybackCompletionEvent(playbackCompletionEvent);
        stream.flush();

        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p got stream ready\n", this);
        switch_core_session_rwunlock(psession);
      }
    };
    // invoked when the conversation terminates; wakes processData() so the
    // service thread can exit
    auto OnResponseCallback = [&](const LexRuntimeV2Client* pClient,
      const StartConversationRequest& request,
      const StartConversationOutcome& outcome,
      const std::shared_ptr<const Aws::Client::AsyncCallerContext>&)
    {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p stream got final response\n", this);
      if (!outcome.IsSuccess()) {
        const LexRuntimeV2Error& err = outcome.GetError();
        auto message = err.GetMessage();
        auto exception = err.GetExceptionName();
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p stream got error response %s : %s\n", this, message.c_str(), exception.c_str());
      }

      std::lock_guard<std::mutex> lk(m_mutex);
      m_finished = true;
      m_cond.notify_one();
    };

    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p starting conversation\n", this);
    m_client->StartConversationAsync(m_request, OnStreamReady, OnResponseCallback, nullptr/*context*/);
  }

  ~GStreamer() {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::~GStreamer wrote %d packets %p\n", m_packets, this);
  }

  /* Forward a caller dtmf entry to Lex. No-op once shutdown has begun. */
  void dtmf(char* dtmf) {
    if (m_finishing || m_finished) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::dtmf not writing because we are finished, %p\n", this);
      return;
    }

    DTMFInputEvent dtmfInputEvent;
    dtmfInputEvent.SetInputCharacter(dtmf);
    m_pStream->WriteDTMFInputEvent(dtmfInputEvent);
    m_pStream->flush();
  }

  /* App finished playing a prompt; processData() will relay a
     PlaybackCompletionEvent to Lex. */
  void notify_play_done() {
    std::lock_guard<std::mutex> lk(m_mutex);
    m_bPlayDone = true;
    m_cond.notify_one();
  }

  /* Stream one frame of caller audio to Lex (8 kHz, 16-bit mono LPCM).
     Returns false when shutdown is already in progress. */
  bool write(void* data, uint32_t datalen) {
    if (m_finishing || m_finished) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::write not writing because we are finished, %p\n", this);
      return false;
    }
    //m_fOutgoingAudio.write((const char*) data, datalen);
    Aws::Utils::ByteBuffer audio((const unsigned char *) data, datalen);
    AudioInputEvent audioInputEvent;
    audioInputEvent.SetAudioChunk(audio);
    audioInputEvent.SetContentType("audio/lpcm; sample-rate=8000; sample-size-bits=16; channel-count=1; is-big-endian=false");
    m_pStream->WriteAudioInputEvent(audioInputEvent);
    m_pStream->flush();

    return true;
  }

  /* Request graceful shutdown; actual disconnect is written by processData(). */
  void finish() {
    if (m_finishing) return;
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::finish %p\n", this);
    std::lock_guard<std::mutex> lk(m_mutex);

    m_finishing = true;
    m_cond.notify_one();
  }

  /* Service loop run by lex_thread: blocks on the condition variable and
     reacts to play-done notifications and shutdown requests. Returns only
     when OnResponseCallback has marked the conversation finished. */
  void processData() {
    bool shutdownInitiated = false;
    while (true) {
      std::unique_lock<std::mutex> lk(m_mutex);
      m_cond.wait(lk, [&, this] {
        return m_bPlayDone || m_finished || (m_finishing && !shutdownInitiated);
      });

      // we have data to process or have been told we're done
      if (m_finished) return;
      if (m_finishing) {
        shutdownInitiated = true;
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::writing disconnect event %p\n", this);

        if (m_pStream) {
          m_pStream->WriteAudioInputEvent({}); // per the spec, we have to send an empty event (i.e. without a payload) at the end.
          DisconnectionEvent disconnectionEvent;
          m_pStream->WriteDisconnectionEvent(disconnectionEvent);

          m_pStream->flush();
          m_pStream->Close();
          m_pStream = nullptr;

          //m_fOutgoingAudio.flush();
          //m_fOutgoingAudio.close();

        }
      }
      else {
        if (m_bPlayDone) {
          m_bPlayDone = false;
          PlaybackCompletionEvent playbackCompletionEvent;
          m_pStream->WritePlaybackCompletionEvent(playbackCompletionEvent);
          m_pStream->flush();
        }
      }
    }
  }


private:
  std::string m_sessionId;
  std::string m_bot;
  std::string m_alias;
  std::string m_region;
  Aws::UniquePtr<LexRuntimeV2Client> m_client;
  StartConversationRequestEventStream* m_pStream;   // owned by the SDK; set in OnStreamReady, nulled at shutdown
  StartConversationRequest m_request;
  StartConversationHandler m_handler;

  bool m_finishing;          // shutdown requested (guarded by m_mutex)
  bool m_finished;           // conversation completed (guarded by m_mutex)
  uint32_t m_packets;
  std::mutex m_mutex;
  std::condition_variable m_cond;
  std::ofstream m_f;                    // current temp audio file being written
  std::ostringstream m_ostrCurrentPath; // path of the current temp audio file
  //std::ofstream m_fOutgoingAudio;
  bool m_bPlayDone;          // app signaled prompt playback completion
  bool m_bDiscardAudio;      // true when app uses TTS; Lex audio is dropped
};
|
||||
|
||||
/**
 * Service thread for one Lex conversation: constructs the GStreamer (which
 * starts the async conversation) and then blocks in processData() until the
 * conversation ends.
 *
 * Ownership note: cb->streamer is also visible to killcb(), which may delete
 * the streamer under cb->mutex; deletion order between this thread and killcb
 * is intricate — do not reorder the delete/null below.
 */
static void *SWITCH_THREAD_FUNC lex_thread(switch_thread_t *thread, void *obj) {
  struct cap_cb *cb = (struct cap_cb *) obj;
  bool ok = true;  // NOTE(review): currently unused
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "lex_thread: starting cb %p\n", (void *) cb);
  GStreamer* pStreamer = new GStreamer(cb->sessionId, cb->bot, cb->alias, cb->region, cb->locale,
    cb->intent, cb->metadata, cb->awsAccessKeyId, cb->awsSecretAccessKey,
    cb->responseHandler, cb->errorHandler);
  // NOTE(review): plain `new` throws on failure rather than returning null,
  // so this branch is effectively dead — kept as defensive coding
  if (!pStreamer) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "lex_thread: Error allocating streamer\n");
    return nullptr;
  }
  cb->streamer = pStreamer;

  // blocks until the conversation finishes
  pStreamer->processData();
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "lex_thread: stopping cb %p\n", (void *) cb);
  delete pStreamer;
  cb->streamer = nullptr;
  return NULL;
}
|
||||
|
||||
static void killcb(struct cap_cb* cb) {
|
||||
if (cb) {
|
||||
if (cb->streamer) {
|
||||
GStreamer* p = (GStreamer *) cb->streamer;
|
||||
delete p;
|
||||
cb->streamer = NULL;
|
||||
}
|
||||
if (cb->resampler) {
|
||||
speex_resampler_destroy(cb->resampler);
|
||||
cb->resampler = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
extern "C" {
|
||||
switch_status_t aws_lex_init() {
|
||||
const char* accessKeyId = std::getenv("AWS_ACCESS_KEY_ID");
|
||||
const char* secretAccessKey= std::getenv("AWS_SECRET_ACCESS_KEY");
|
||||
const char* awsTrace = std::getenv("AWS_TRACE");
|
||||
if (NULL == accessKeyId && NULL == secretAccessKey) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE,
|
||||
"\"AWS_ACCESS_KEY_ID\" and/or \"AWS_SECRET_ACCESS_KEY\" env var not set; authentication will expect channel variables of same names to be set\n");
|
||||
}
|
||||
else {
|
||||
hasDefaultCredentials = true;
|
||||
|
||||
}
|
||||
Aws::SDKOptions options;
|
||||
|
||||
if (awsTrace && 0 == strcmp("1", awsTrace)) {
|
||||
awsLoggingEnabled = true;
|
||||
options.loggingOptions.logLevel = Aws::Utils::Logging::LogLevel::Trace;
|
||||
|
||||
Aws::Utils::Logging::InitializeAWSLogging(
|
||||
Aws::MakeShared<Aws::Utils::Logging::DefaultLogSystem>(
|
||||
ALLOC_TAG, Aws::Utils::Logging::LogLevel::Trace, "aws_sdk_"));
|
||||
}
|
||||
|
||||
Aws::InitAPI(options);
|
||||
|
||||
|
||||
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
switch_status_t aws_lex_cleanup() {
|
||||
Aws::SDKOptions options;
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_aws_lex: shutting down API");
|
||||
if (awsLoggingEnabled) {
|
||||
options.loggingOptions.logLevel = Aws::Utils::Logging::LogLevel::Trace;
|
||||
Aws::Utils::Logging::ShutdownAWSLogging();
|
||||
}
|
||||
|
||||
Aws::ShutdownAPI(options);
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_aws_lex: shutdown API complete");
|
||||
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// start lex on a channel
|
||||
switch_status_t aws_lex_session_init(
|
||||
switch_core_session_t *session,
|
||||
responseHandler_t responseHandler,
|
||||
errorHandler_t errorHandler,
|
||||
uint32_t samples_per_second,
|
||||
char* bot,
|
||||
char* alias,
|
||||
char* region,
|
||||
char* locale,
|
||||
char* intent,
|
||||
char* metadata,
|
||||
struct cap_cb **ppUserData
|
||||
) {
|
||||
switch_status_t status = SWITCH_STATUS_SUCCESS;
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
int err;
|
||||
switch_threadattr_t *thd_attr = NULL;
|
||||
switch_memory_pool_t *pool = switch_core_session_get_pool(session);
|
||||
struct cap_cb* cb = (struct cap_cb *) switch_core_session_alloc(session, sizeof(*cb));
|
||||
memset(cb, sizeof(cb), 0);
|
||||
const char* awsAccessKeyId = switch_channel_get_variable(channel, "AWS_ACCESS_KEY_ID");
|
||||
const char* awsSecretAccessKey = switch_channel_get_variable(channel, "AWS_SECRET_ACCESS_KEY");
|
||||
|
||||
if (!hasDefaultCredentials && (!awsAccessKeyId || !awsSecretAccessKey)) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
|
||||
"missing credentials: AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY must be suuplied either as an env variable or channel variable\n");
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
|
||||
strncpy(cb->sessionId, switch_core_session_get_uuid(session), 256);
|
||||
|
||||
if (awsAccessKeyId && awsSecretAccessKey) {
|
||||
strncpy(cb->awsAccessKeyId, awsAccessKeyId, 128);
|
||||
strncpy(cb->awsSecretAccessKey, awsSecretAccessKey, 128);
|
||||
}
|
||||
else {
|
||||
strncpy(cb->awsAccessKeyId, std::getenv("AWS_ACCESS_KEY_ID"), 128);
|
||||
strncpy(cb->awsSecretAccessKey, std::getenv("AWS_SECRET_ACCESS_KEY"), 128);
|
||||
}
|
||||
|
||||
cb->responseHandler = responseHandler;
|
||||
cb->errorHandler = errorHandler;
|
||||
|
||||
if (switch_mutex_init(&cb->mutex, SWITCH_MUTEX_NESTED, pool) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing mutex\n");
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
|
||||
strncpy(cb->bot, bot, MAX_BOTNAME);
|
||||
strncpy(cb->alias, alias, MAX_BOTNAME);
|
||||
strncpy(cb->locale, locale, MAX_LOCALE);
|
||||
strncpy(cb->region, region, MAX_REGION);
|
||||
if (intent) strncpy(cb->intent, intent, MAX_INTENT);
|
||||
if (metadata) strncpy(cb->metadata, metadata, MAX_METADATA);
|
||||
cb->resampler = speex_resampler_init(1, 8000, /*16000*/ 8000, SWITCH_RESAMPLE_QUALITY, &err);
|
||||
if (0 != err) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing resampler: %s.\n",
|
||||
switch_channel_get_name(channel), speex_resampler_strerror(err));
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
|
||||
// hangup hook to clear temp audio files
|
||||
switch_core_event_hook_add_state_change(session, hanguphook);
|
||||
|
||||
// create a thread to service the http/2 connection to lex
|
||||
switch_threadattr_create(&thd_attr, pool);
|
||||
switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE);
|
||||
switch_thread_create(&cb->thread, thd_attr, lex_thread, cb, pool);
|
||||
|
||||
*ppUserData = cb;
|
||||
|
||||
done:
|
||||
return status;
|
||||
}
|
||||
|
||||
/**
 * Forward a DTMF entry to the active lex conversation on this session.
 *
 * Looks up the media bug attached under MY_BUG_NAME; if present, sends the
 * digits to the streamer under the per-channel mutex (the streamer may be
 * torn down concurrently by aws_lex_session_stop).
 *
 * Returns SWITCH_STATUS_SUCCESS if the bug was attached (even if the
 * streamer was already gone), SWITCH_STATUS_FALSE otherwise.
 */
switch_status_t aws_lex_session_dtmf(switch_core_session_t *session, char* dtmf) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, MY_BUG_NAME);

  if (bug) {
    struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);

    // hold the mutex so the streamer cannot be destroyed while we use it
    switch_mutex_lock(cb->mutex);
    GStreamer* streamer = (GStreamer *) cb->streamer;
    if (streamer) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aws_lex_session_dtmf: sending dtmf %s\n", dtmf);
      streamer->dtmf(dtmf);
    }
    switch_mutex_unlock(cb->mutex);

    return SWITCH_STATUS_SUCCESS;
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
  return SWITCH_STATUS_FALSE;
}
|
||||
|
||||
/**
 * Notify the lex streamer that a prompt playback has completed.
 *
 * Lex v2 streaming uses play-completion events for barge-in handling; this
 * relays the notification under the per-channel mutex so it cannot race
 * with session teardown.
 *
 * Returns SWITCH_STATUS_SUCCESS if the bug was attached, SWITCH_STATUS_FALSE
 * otherwise.
 */
switch_status_t aws_lex_session_play_done(switch_core_session_t *session) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, MY_BUG_NAME);

  if (bug) {
    struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);

    // hold the mutex so the streamer cannot be destroyed while we use it
    switch_mutex_lock(cb->mutex);
    GStreamer* streamer = (GStreamer *) cb->streamer;
    if (streamer) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aws_lex_session_play_done: sending play done\n");
      streamer->notify_play_done();
    }
    switch_mutex_unlock(cb->mutex);

    return SWITCH_STATUS_SUCCESS;
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
  return SWITCH_STATUS_FALSE;
}
|
||||
|
||||
/**
 * Stop the lex conversation on this session and release its resources.
 *
 * Sequence: signal writesDone to the streamer, join the reader thread,
 * detach the bug pointer from the channel, optionally remove the media bug
 * (skipped when the channel itself is closing, since the core is already
 * tearing the bug down), then destroy the per-channel state.
 *
 * @param channelIsClosing non-zero when invoked from SWITCH_ABC_TYPE_CLOSE;
 *        suppresses the explicit switch_core_media_bug_remove.
 */
switch_status_t aws_lex_session_stop(switch_core_session_t *session, int channelIsClosing) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, MY_BUG_NAME);

  if (bug) {
    struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);

    // close connection and get final responses
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aws_lex_session_cleanup: acquiring lock\n");
    switch_mutex_lock(cb->mutex);
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aws_lex_session_cleanup: acquired lock\n");
    GStreamer* streamer = (GStreamer *) cb->streamer;
    if (streamer) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aws_lex_session_cleanup: sending writesDone..\n");
      streamer->finish();
    }
    if (cb->thread) {
      switch_status_t retval;
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "aws_lex_session_cleanup: waiting for read thread to complete\n");
      switch_thread_join(&retval, cb->thread);
      cb->thread = NULL;
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "aws_lex_session_cleanup: read thread completed\n");
    }

    switch_channel_set_private(channel, MY_BUG_NAME, NULL);
    if (!channelIsClosing) switch_core_media_bug_remove(session, &bug);

    // Release the mutex BEFORE destroying cb: the original code unlocked
    // after killcb(cb), which touches the mutex after the owning state has
    // been torn down (use-after-free under valgrind).
    switch_mutex_unlock(cb->mutex);
    killcb(cb);

    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "aws_lex_session_cleanup: Closed aws session\n");

    return SWITCH_STATUS_SUCCESS;
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
  return SWITCH_STATUS_FALSE;
}
|
||||
|
||||
/**
 * Media-bug read callback: drain available audio frames, resample, and
 * forward them to the lex streamer.
 *
 * Uses trylock so the real-time media thread is never blocked; if the mutex
 * is busy (teardown in progress) the frame interval is simply skipped.
 * Always returns SWITCH_TRUE to keep the bug attached.
 */
switch_bool_t aws_lex_frame(switch_media_bug_t *bug, void* user_data) {
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);
  struct cap_cb *cb = (struct cap_cb *) user_data;
  uint8_t audio_buf[SWITCH_RECOMMENDED_BUFFER_SIZE];
  switch_frame_t frame = {};

  frame.data = audio_buf;
  frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;

  // never block the media thread; skip this interval if teardown holds the lock
  if (switch_mutex_trylock(cb->mutex) != SWITCH_STATUS_SUCCESS) {
    return SWITCH_TRUE;
  }

  GStreamer* pStreamer = (GStreamer *) cb->streamer;
  if (!pStreamer) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG,
      "aws_lex_frame: not sending audio because aws channel has been closed\n");
  }
  else {
    // drain everything buffered this interval, dropping comfort-noise frames
    while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS && !switch_test_flag((&frame), SFF_CNG)) {
      if (frame.datalen) {
        spx_int16_t resampled[SWITCH_RECOMMENDED_BUFFER_SIZE];
        spx_uint32_t resampled_len = SWITCH_RECOMMENDED_BUFFER_SIZE;
        spx_uint32_t input_len = frame.samples;

        speex_resampler_process_interleaved_int(cb->resampler, (const spx_int16_t *) frame.data, (spx_uint32_t *) &input_len, &resampled[0], &resampled_len);
        pStreamer->write(&resampled[0], sizeof(spx_int16_t) * resampled_len);
      }
    }
  }
  switch_mutex_unlock(cb->mutex);

  return SWITCH_TRUE;
}
|
||||
|
||||
// Release all per-channel state held by the callback struct.
// Called by mod_aws_lex.c when session initialization fails before the
// media bug is attached (normal teardown goes through aws_lex_session_stop).
void destroyChannelUserData(struct cap_cb* cb) {
  killcb(cb);
}
|
||||
|
||||
}
|
||||
14
mod_aws_lex/aws_lex_glue.h
Normal file
14
mod_aws_lex/aws_lex_glue.h
Normal file
@@ -0,0 +1,14 @@
|
||||
/* NOTE(review): guard name uses a reserved identifier (leading double
 * underscore); consider renaming to AWS_GLUE_H_ in a follow-up. */
#ifndef __AWS_GLUE_H__
#define __AWS_GLUE_H__

/* one-time module init / cleanup (AWS SDK lifecycle) */
switch_status_t aws_lex_init();
switch_status_t aws_lex_cleanup();
/* allocate per-channel state, spawn the lex streaming thread; on success
 * returns the new state in *cb for use as media-bug user data */
switch_status_t aws_lex_session_init(switch_core_session_t *session, responseHandler_t responseHandler, errorHandler_t errorHandler,
  uint32_t samples_per_second, char* bot, char* alias, char* region, char* locale, char *intent, char* metadata, struct cap_cb **cb);
/* stop the conversation; channelIsClosing non-zero when called from bug close */
switch_status_t aws_lex_session_stop(switch_core_session_t *session, int channelIsClosing);
/* relay a dtmf entry to lex */
switch_status_t aws_lex_session_dtmf(switch_core_session_t *session, char* dtmf);
/* notify lex that prompt playback finished */
switch_status_t aws_lex_session_play_done(switch_core_session_t *session);
/* media-bug read callback: forward audio frames to lex */
switch_bool_t aws_lex_frame(switch_media_bug_t *bug, void* user_data);

/* free per-channel state when init fails before the bug is attached */
void destroyChannelUserData(struct cap_cb* cb);
#endif
|
||||
375
mod_aws_lex/mod_aws_lex.c
Normal file
375
mod_aws_lex/mod_aws_lex.c
Normal file
@@ -0,0 +1,375 @@
|
||||
/*
|
||||
*
|
||||
* mod_lex.c -- Freeswitch module for running a aws lex conversation
|
||||
*
|
||||
*/
|
||||
#include "mod_aws_lex.h"
|
||||
#include "aws_lex_glue.h"
|
||||
|
||||
/* Prototypes */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_aws_lex_shutdown);
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_aws_lex_load);
|
||||
|
||||
SWITCH_MODULE_DEFINITION(mod_aws_lex, mod_aws_lex_load, mod_aws_lex_shutdown, NULL);
|
||||
|
||||
static switch_status_t do_stop(switch_core_session_t *session);
|
||||
|
||||
/*
 * Fire a custom FreeSWITCH event carrying a lex response payload.
 * `type` is the event subclass (e.g. lex::intent) and `json` becomes the
 * event body. Event creation is now checked: on failure we log and return
 * rather than dereferencing an unset event pointer.
 */
static void responseHandler(switch_core_session_t* session, const char * type, char * json) {
  switch_event_t *event;
  switch_channel_t *channel = switch_core_session_get_channel(session);

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "json payload for type %s: %s.\n", type, json);

  if (switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, type) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "responseHandler: failed to create event %s\n", type);
    return;
  }
  switch_channel_event_set_data(channel, event);
  switch_event_add_body(event, "%s", json);
  switch_event_fire(&event);
}
|
||||
/*
 * Fire a lex::error event with the supplied json body, then stop the
 * conversation on this session. Event creation is checked (consistent with
 * responseHandler); even if the event cannot be created we still stop the
 * session, since the error already occurred.
 */
static void errorHandler(switch_core_session_t* session, const char * json) {
  switch_event_t *event;
  switch_channel_t *channel = switch_core_session_get_channel(session);

  if (switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, AWS_LEX_EVENT_ERROR) == SWITCH_STATUS_SUCCESS) {
    switch_channel_event_set_data(channel, event);
    switch_event_add_body(event, "%s", json);
    switch_event_fire(&event);
  }
  else {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "errorHandler: failed to create event %s\n", AWS_LEX_EVENT_ERROR);
  }

  do_stop(session);
}
|
||||
|
||||
/*
 * Media bug lifecycle callback.
 * INIT/CLOSE are logged (CLOSE also tears down the lex session with
 * channelIsClosing=1); READ delegates to aws_lex_frame to ship audio.
 * Returning SWITCH_TRUE keeps the bug attached.
 */
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
{
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);

  if (type == SWITCH_ABC_TYPE_INIT) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Got SWITCH_ABC_TYPE_INIT.\n");
  }
  else if (type == SWITCH_ABC_TYPE_CLOSE) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Got SWITCH_ABC_TYPE_CLOSE.\n");

    /* channel is going away: stop lex without removing the bug ourselves */
    aws_lex_session_stop(session, 1);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Finished SWITCH_ABC_TYPE_CLOSE.\n");
  }
  else if (type == SWITCH_ABC_TYPE_READ) {
    return aws_lex_frame(bug, user_data);
  }

  /* SWITCH_ABC_TYPE_WRITE and anything else: nothing to do */
  return SWITCH_TRUE;
}
|
||||
|
||||
/*
 * Start a lex conversation on the given session.
 *
 * Steps: stop any conversation already running on this channel, require at
 * least early media, initialize the per-channel lex state (aws_lex_session_init
 * also spawns the streaming thread), then attach a media bug whose user data
 * is that state. On any failure after state allocation, the state is freed
 * via destroyChannelUserData.
 *
 * All string parameters are borrowed from the caller; aws_lex_session_init
 * copies what it keeps.
 */
static switch_status_t start_capture(switch_core_session_t *session, switch_media_bug_flag_t flags,
  char* bot, char*alias, char* region, char* locale, char* intent, char* metadata)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug;
  switch_codec_implementation_t read_impl = { 0 };
  struct cap_cb *cb = NULL;
  switch_status_t status = SWITCH_STATUS_SUCCESS;

  /* only one lex conversation per channel */
  if (switch_channel_get_private(channel, MY_BUG_NAME)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "a lex is already running on this channel, we will stop it.\n");
    do_stop(session);
  }

  if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "channel must have at least early media to run lex.\n");
    status = SWITCH_STATUS_FALSE;
    goto done;
  }

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "starting lex with bot %s, alias %s, region %s, locale %s, intent %s, metadata: %s\n",
    bot, alias, region, locale, intent ? intent : "(none)", metadata ? metadata : "(none)");

  /* sample rate of the inbound audio drives the resampler setup */
  switch_core_session_get_read_impl(session, &read_impl);
  if (SWITCH_STATUS_FALSE == aws_lex_session_init(session, responseHandler, errorHandler,
    read_impl.samples_per_second, bot, alias, region, locale, intent, metadata, &cb)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing aws lex session.\n");
    status = SWITCH_STATUS_FALSE;
    goto done;
  }

  /* attach the bug; cb becomes the user data seen by capture_callback */
  if ((status = switch_core_media_bug_add(session, "lex", NULL, capture_callback, (void *) cb, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error adding bug.\n");
    status = SWITCH_STATUS_FALSE;
    goto done;
  }
  switch_channel_set_private(channel, MY_BUG_NAME, bug);

done:
  /* failure after cb was allocated: free it here, nothing else owns it yet */
  if (status == SWITCH_STATUS_FALSE) {
    if (cb) destroyChannelUserData(cb);
  }

  return status;
}
|
||||
|
||||
/*
 * Stop the lex conversation on a session, if one is running.
 * No-op (returns success) when no bug is attached.
 */
static switch_status_t do_stop(switch_core_session_t *session)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = switch_channel_get_private(channel, MY_BUG_NAME);
  switch_status_t rc = SWITCH_STATUS_SUCCESS;

  if (bug) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Received user command command to stop lex.\n");
    rc = aws_lex_session_stop(session, 0);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "stopped lex.\n");
  }

  return rc;
}
|
||||
|
||||
#define LEX_API_START_SYNTAX "<uuid> bot alias region locale [intent] [json-metadata]"
|
||||
SWITCH_STANDARD_API(aws_lex_api_start_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[10] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
switch_media_bug_flag_t flags = SMBF_READ_STREAM | SMBF_READ_STREAM | SMBF_READ_PING;
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "command %s\n", cmd);
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) || argc < 5) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", LEX_API_START_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
char *bot = argv[1];
|
||||
char *alias = argv[2];
|
||||
char *region = argv[3];
|
||||
char *locale = argv[4];
|
||||
char *intent = NULL;
|
||||
char *metadata = NULL;
|
||||
|
||||
if (argc > 5) {
|
||||
if ('{' == *argv[5]) {
|
||||
metadata = argv[5];
|
||||
}
|
||||
else {
|
||||
intent = argv[5];
|
||||
if (argc > 6) metadata = argv[6];
|
||||
}
|
||||
}
|
||||
status = start_capture(lsession, flags, bot, alias, region, locale, intent, metadata);
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
#define LEX_API_DTMF_SYNTAX "<uuid> dtmf"
|
||||
SWITCH_STANDARD_API(aws_lex_api_dtmf_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[10] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "command %s\n", cmd);
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) || argc < 2) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", LEX_API_DTMF_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
char *dtmf = argv[1];
|
||||
status = aws_lex_session_dtmf(lsession, dtmf);
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
#define LEX_API_PLAY_DONE_SYNTAX "<uuid>"
|
||||
SWITCH_STANDARD_API(aws_lex_api_play_done_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[10] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "command %s\n", cmd);
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) || argc < 1) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s.\n", cmd, argv[0]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", LEX_API_DTMF_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
status = aws_lex_session_play_done(lsession);
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
#define LEX_API_STOP_SYNTAX "<uuid>"
|
||||
SWITCH_STANDARD_API(aws_lex_api_stop_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[10] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "command %s\n", cmd);
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) || argc != 1) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s.\n", cmd);
|
||||
stream->write_function(stream, "-USAGE: %s\n", LEX_API_STOP_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
status = do_stop(lsession);
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* Macro expands to: switch_status_t mod_lex_load(switch_loadable_module_interface_t **module_interface, switch_memory_pool_t *pool) */
/*
 * Module load: reserve all lex::* custom event subclasses, initialize the
 * lex glue layer, and register the aws_lex_* API commands with console
 * completion hints.
 */
SWITCH_MODULE_LOAD_FUNCTION(mod_aws_lex_load)
{
  switch_api_interface_t *api_interface;

  /* create/register custom event message types */
  if (switch_event_reserve_subclass(AWS_LEX_EVENT_INTENT) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", AWS_LEX_EVENT_INTENT);
    return SWITCH_STATUS_TERM;
  }
  if (switch_event_reserve_subclass(AWS_LEX_EVENT_TRANSCRIPTION) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", AWS_LEX_EVENT_TRANSCRIPTION);
    return SWITCH_STATUS_TERM;
  }
  if (switch_event_reserve_subclass(AWS_LEX_EVENT_TEXT_RESPONSE) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", AWS_LEX_EVENT_TEXT_RESPONSE);
    return SWITCH_STATUS_TERM;
  }
  if (switch_event_reserve_subclass(AWS_LEX_EVENT_PLAYBACK_INTERRUPTION) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", AWS_LEX_EVENT_PLAYBACK_INTERRUPTION);
    return SWITCH_STATUS_TERM;
  }
  if (switch_event_reserve_subclass(AWS_LEX_EVENT_AUDIO_PROVIDED) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", AWS_LEX_EVENT_AUDIO_PROVIDED);
    return SWITCH_STATUS_TERM;
  }

  if (switch_event_reserve_subclass(AWS_LEX_EVENT_ERROR) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", AWS_LEX_EVENT_ERROR);
    return SWITCH_STATUS_TERM;
  }


  /* connect my internal structure to the blank pointer passed to me */
  *module_interface = switch_loadable_module_create_module_interface(pool, modname);

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_aws_lex API loading..\n");

  /* NOTE(review): init failure is logged but loading continues; confirm
   * whether the module should return SWITCH_STATUS_FALSE here instead */
  if (SWITCH_STATUS_FALSE == aws_lex_init()) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Failed initializing mod_aws_lex interface\n");
  }

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_aws_lex API successfully loaded\n");

  SWITCH_ADD_API(api_interface, "aws_lex_start", "Start an aws lex conversation", aws_lex_api_start_function, LEX_API_START_SYNTAX);
  SWITCH_ADD_API(api_interface, "aws_lex_dtmf", "Send a dtmf entry to lex", aws_lex_api_dtmf_function, LEX_API_DTMF_SYNTAX);
  SWITCH_ADD_API(api_interface, "aws_lex_play_done", "Notify lex that a play completed", aws_lex_api_play_done_function, LEX_API_PLAY_DONE_SYNTAX);
  SWITCH_ADD_API(api_interface, "aws_lex_stop", "Terminate a aws lex", aws_lex_api_stop_function, LEX_API_STOP_SYNTAX);

  /* console completion hints */
  switch_console_set_complete("add aws_lex_stop");
  switch_console_set_complete("add aws_lex_play_done");
  switch_console_set_complete("add aws_lex_dtmf dtmf-entry");
  /* NOTE(review): "project lang" looks like a leftover from a dialogflow
   * module; the aws_lex_start args are bot alias region locale (next line) */
  switch_console_set_complete("add aws_lex_start project lang");
  switch_console_set_complete("add aws_lex_start bot alias region locale");

  /* indicate that the module should continue to be loaded */
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/*
  Called when the system shuts down
  Macro expands to: switch_status_t mod_lex_shutdown() */
/* Release glue-layer resources and free all event subclasses reserved at load. */
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_aws_lex_shutdown)
{
  aws_lex_cleanup();

  switch_event_free_subclass(AWS_LEX_EVENT_INTENT);
  switch_event_free_subclass(AWS_LEX_EVENT_TRANSCRIPTION);
  switch_event_free_subclass(AWS_LEX_EVENT_TEXT_RESPONSE);
  switch_event_free_subclass(AWS_LEX_EVENT_PLAYBACK_INTERRUPTION);
  switch_event_free_subclass(AWS_LEX_EVENT_AUDIO_PROVIDED);
  switch_event_free_subclass(AWS_LEX_EVENT_ERROR);

  return SWITCH_STATUS_SUCCESS;
}
|
||||
46
mod_aws_lex/mod_aws_lex.h
Normal file
46
mod_aws_lex/mod_aws_lex.h
Normal file
@@ -0,0 +1,46 @@
|
||||
#ifndef __MOD_LEX_H__
#define __MOD_LEX_H__

#include <switch.h>
#include <speex/speex_resampler.h>

#include <unistd.h>

/* key under which the media bug is stored as channel private data */
#define MY_BUG_NAME "__aws_lex_bug__"
/* custom event subclasses fired by this module */
#define AWS_LEX_EVENT_INTENT "lex::intent"
#define AWS_LEX_EVENT_TRANSCRIPTION "lex::transcription"
#define AWS_LEX_EVENT_TEXT_RESPONSE "lex::text_response"
#define AWS_LEX_EVENT_AUDIO_PROVIDED "lex::audio_provided"
#define AWS_LEX_EVENT_PLAYBACK_INTERRUPTION "lex::playback_interruption"
#define AWS_LEX_EVENT_ERROR "lex::error"

/* fixed buffer sizes for copied configuration strings (incl. terminator) */
#define MAX_LANG (12)
#define MAX_BOTNAME (128)
#define MAX_REGION (16)
#define MAX_LOCALE (7)
#define MAX_INTENT (52)
#define MAX_METADATA (1024)

/* per-channel data */
/* responseHandler fires a custom event of the given subclass with a json body */
typedef void (*responseHandler_t)(switch_core_session_t* session, const char * type, char* json);
/* errorHandler fires lex::error and stops the conversation */
typedef void (*errorHandler_t)(switch_core_session_t* session, const char * reason);

struct cap_cb {
  switch_mutex_t *mutex;               /* guards streamer access vs teardown */
  char sessionId[256];                 /* freeswitch session uuid */
  char awsAccessKeyId[128];            /* credentials from channel vars or env */
  char awsSecretAccessKey[128];
  SpeexResamplerState *resampler;      /* converts inbound audio for lex */
  void* streamer;                      /* opaque GStreamer* (C++ side) */
  responseHandler_t responseHandler;
  errorHandler_t errorHandler;
  switch_thread_t* thread;             /* lex read thread; joined on stop */
  char bot[MAX_BOTNAME];
  char alias[MAX_BOTNAME];
  char region[MAX_REGION];
  char locale[MAX_LOCALE];
  char intent[MAX_INTENT];
  char metadata[MAX_METADATA];
};

#endif
|
||||
195
mod_aws_lex/parser.cpp
Normal file
195
mod_aws_lex/parser.cpp
Normal file
@@ -0,0 +1,195 @@
|
||||
#include "parser.h"
|
||||
#include <switch.h>
|
||||
|
||||
|
||||
|
||||
// Serialize a Lex TranscriptEvent (partial ASR transcript plus its event id).
cJSON* lex2Json(const TranscriptEvent& ev) {
  cJSON* j = cJSON_CreateObject();
  cJSON_AddStringToObject(j, "transcript", ev.GetTranscript().c_str());
  cJSON_AddStringToObject(j, "eventId", ev.GetEventId().c_str());
  return j;
}
|
||||
|
||||
// Serialize a Lex TextResponseEvent: event id plus its list of messages.
cJSON* lex2Json(const TextResponseEvent& ev) {
  cJSON * json = cJSON_CreateObject();

  cJSON_AddItemToObject(json, "eventId", cJSON_CreateString(ev.GetEventId().c_str()));

  cJSON* jMessages = cJSON_CreateArray();
  cJSON_AddItemToObject(json, "messages", jMessages);
  // iterate by const reference to avoid copying each Message
  for (const auto& msg : ev.GetMessages()) {
    cJSON_AddItemToArray(jMessages, lex2Json(msg));
  }
  return json;
}
|
||||
|
||||
// Serialize a single Lex Message: its content and content-type name.
cJSON* lex2Json(const Message& msg) {
  cJSON* j = cJSON_CreateObject();
  cJSON_AddStringToObject(j, "msg", msg.GetContent().c_str());
  cJSON_AddStringToObject(j, "type",
    MessageContentTypeMapper::GetNameForMessageContentType(msg.GetContentType()).c_str());
  return j;
}
|
||||
|
||||
// Serialize a Lex IntentResultEvent: ids, session state, request attributes,
// and one entry per interpretation (confidence, sentiment, intent).
cJSON* lex2Json(const IntentResultEvent& ev) {
  cJSON * json = cJSON_CreateObject();

  cJSON_AddItemToObject(json, "eventId", cJSON_CreateString(ev.GetEventId().c_str()));
  cJSON_AddItemToObject(json, "sessionId", cJSON_CreateString(ev.GetSessionId().c_str()));
  cJSON_AddItemToObject(json, "sessionState", lex2Json(ev.GetSessionState()));
  cJSON_AddItemToObject(json, "requestAttributes", lex2Json(ev.GetRequestAttributes()));

  cJSON* jInterpretations = cJSON_CreateArray();
  cJSON_AddItemToObject(json, "interpretations", jInterpretations);
  // iterate by const reference to avoid copying each Interpretation
  for (const auto& interp : ev.GetInterpretations()) {
    cJSON * jInterp = cJSON_CreateObject();
    cJSON_AddItemToArray(jInterpretations, jInterp);

    cJSON_AddItemToObject(jInterp, "confidence", cJSON_CreateNumber(interp.GetNluConfidence().GetScore()));
    cJSON_AddItemToObject(jInterp, "sentiment", lex2Json(interp.GetSentimentResponse()));
    cJSON_AddItemToObject(jInterp, "intent", lex2Json(interp.GetIntent()));
  }
  return json;
}
|
||||
|
||||
// Serialize a PlaybackInterruptionEvent: reason name, the event id that
// caused the interruption, and this event's own id.
cJSON* lex2Json(const PlaybackInterruptionEvent& ev) {
  cJSON* j = cJSON_CreateObject();
  cJSON_AddStringToObject(j, "reason",
    PlaybackInterruptionReasonMapper::GetNameForPlaybackInterruptionReason(ev.GetEventReason()).c_str());
  cJSON_AddStringToObject(j, "causedBy", ev.GetCausedByEventId().c_str());
  cJSON_AddStringToObject(j, "eventId", ev.GetEventId().c_str());
  return j;
}
|
||||
|
||||
// Serialize a string->string attribute map to a flat JSON object.
cJSON* lex2Json(const Aws::Map<Aws::String, Aws::String>& attr) {
  cJSON * json = cJSON_CreateObject();

  // iterate by const reference to avoid copying each pair of strings
  for (const auto& it : attr) {
    cJSON_AddItemToObject(json, it.first.c_str(), cJSON_CreateString(it.second.c_str()));
  }
  return json;
}
|
||||
|
||||
// Serialize a slot map: one JSON member per slot name.
cJSON* lex2Json(const Aws::Map<Aws::String, Slot>& slots) {
  cJSON * json = cJSON_CreateObject();

  // iterate by const reference to avoid copying each Slot
  for (const auto& it : slots) {
    cJSON_AddItemToObject(json, it.first.c_str(), lex2Json(it.second));
  }
  return json;
}
|
||||
|
||||
// Serialize the SessionState: dialog action, current intent, active
// contexts, and session attributes.
cJSON* lex2Json(const SessionState& state) {
  cJSON * json = cJSON_CreateObject();

  cJSON_AddItemToObject(json, "dialogAction", lex2Json(state.GetDialogAction()));
  cJSON_AddItemToObject(json, "intent", lex2Json(state.GetIntent()));

  cJSON* jContexts = cJSON_CreateArray();
  cJSON_AddItemToObject(json, "activeContexts", jContexts);
  // iterate by const reference to avoid copying each ActiveContext
  for (const auto& context : state.GetActiveContexts()) {
    cJSON_AddItemToArray(jContexts, lex2Json(context));
  }

  cJSON_AddItemToObject(json, "attributes", lex2Json(state.GetSessionAttributes()));

  return json;
}
|
||||
|
||||
// Serialize a SentimentResponse: sentiment type name plus its score breakdown.
cJSON* lex2Json(const SentimentResponse& sentiment) {
  cJSON* j = cJSON_CreateObject();
  cJSON_AddStringToObject(j, "type",
    SentimentTypeMapper::GetNameForSentimentType(sentiment.GetSentiment()).c_str());
  cJSON_AddItemToObject(j, "score", lex2Json(sentiment.GetSentimentScore()));
  return j;
}
|
||||
|
||||
// Serialize an Intent: name, slots, and the state/confirmation enum names.
cJSON* lex2Json(const Intent& intent) {
  cJSON* j = cJSON_CreateObject();
  cJSON_AddStringToObject(j, "name", intent.GetName().c_str());
  cJSON_AddItemToObject(j, "slots", lex2Json(intent.GetSlots()));
  cJSON_AddStringToObject(j, "intentState", IntentStateMapper::GetNameForIntentState(intent.GetState()).c_str());
  cJSON_AddStringToObject(j, "confirmationState", ConfirmationStateMapper::GetNameForConfirmationState(intent.GetConfirmationState()).c_str());
  return j;
}
|
||||
|
||||
// Serialize a DialogAction: its type name and the slot being elicited.
cJSON* lex2Json(const DialogAction& dialogAction) {
  cJSON* j = cJSON_CreateObject();
  cJSON_AddStringToObject(j, "type",
    DialogActionTypeMapper::GetNameForDialogActionType(dialogAction.GetType()).c_str());
  cJSON_AddStringToObject(j, "slotToElicit", dialogAction.GetSlotToElicit().c_str());
  return j;
}
|
||||
|
||||
// Serialize an ActiveContext: name, time-to-live, and context attributes.
cJSON* lex2Json(const ActiveContext& context) {
  cJSON* j = cJSON_CreateObject();
  cJSON_AddStringToObject(j, "name", context.GetName().c_str());
  cJSON_AddItemToObject(j, "ttl", lex2Json(context.GetTimeToLive()));
  cJSON_AddItemToObject(j, "attributes", lex2Json(context.GetContextAttributes()));
  return j;
}
|
||||
|
||||
// Serialize a SentimentScore's four component scores.
cJSON* lex2Json(const SentimentScore& score) {
  cJSON* j = cJSON_CreateObject();
  cJSON_AddNumberToObject(j, "positive", score.GetPositive());
  cJSON_AddNumberToObject(j, "negative", score.GetNegative());
  cJSON_AddNumberToObject(j, "neutral", score.GetNeutral());
  cJSON_AddNumberToObject(j, "mixed", score.GetMixed());
  return j;
}
|
||||
|
||||
// Serialize a context TTL: remaining seconds and conversation turns.
cJSON* lex2Json(const ActiveContextTimeToLive& ttl) {
  cJSON* j = cJSON_CreateObject();
  cJSON_AddNumberToObject(j, "seconds", ttl.GetTimeToLiveInSeconds());
  cJSON_AddNumberToObject(j, "turns", ttl.GetTurnsToLive());
  return j;
}
|
||||
|
||||
// Serialize a Slot: just its interpreted value object.
cJSON* lex2Json(const Slot& slot) {
  cJSON* j = cJSON_CreateObject();
  cJSON_AddItemToObject(j, "value", lex2Json(slot.GetValue()));
  return j;
}
|
||||
|
||||
// Serialize a slot Value: raw and interpreted strings plus all resolved values.
cJSON* lex2Json(const Value& value) {
  cJSON * json = cJSON_CreateObject();

  cJSON_AddItemToObject(json, "originalValue", cJSON_CreateString(value.GetOriginalValue().c_str()));
  cJSON_AddItemToObject(json, "interpretedValue", cJSON_CreateString(value.GetInterpretedValue().c_str()));

  cJSON* jResolved = cJSON_CreateArray();
  cJSON_AddItemToObject(json, "resolvedValues", jResolved);
  // iterate by const reference to avoid copying each Aws::String
  for (const auto& res : value.GetResolvedValues()) {
    cJSON_AddItemToArray(jResolved, cJSON_CreateString(res.c_str()));
  }

  return json;
}
|
||||
|
||||
// Serialize an AWS SDK error to JSON (message only).
cJSON* lex2Json(const Aws::Client::AWSError<LexRuntimeV2Errors>& err) {
  cJSON* j = cJSON_CreateObject();
  cJSON_AddStringToObject(j, "message", err.GetMessage().c_str());
  return j;
}
|
||||
29
mod_aws_lex/parser.h
Normal file
29
mod_aws_lex/parser.h
Normal file
@@ -0,0 +1,29 @@
|
||||
#ifndef __PARSER_H__
#define __PARSER_H__

/*
 * parser.h - overloaded lex2Json() serializers that convert AWS Lex V2
 * runtime model objects into cJSON trees for FreeSWITCH event bodies.
 * Every overload returns a newly allocated cJSON object owned by the caller.
 */

#include <switch_json.h>
#include <aws/lexv2-runtime/LexRuntimeV2Client.h>
#include <aws/lexv2-runtime/model/StartConversationRequest.h>

using namespace Aws::LexRuntimeV2;
using namespace Aws::LexRuntimeV2::Model;

/* streaming events */
cJSON* lex2Json(const TranscriptEvent& ev);
cJSON* lex2Json(const TextResponseEvent& ev);
cJSON* lex2Json(const Message& msg);
cJSON* lex2Json(const IntentResultEvent& ev);
cJSON* lex2Json(const PlaybackInterruptionEvent& ev);

/* session state and nested model objects */
cJSON* lex2Json(const Aws::Map<Aws::String, Aws::String>& attr);
cJSON* lex2Json(const Aws::Map<Aws::String, Slot>& slots);
cJSON* lex2Json(const SessionState& state) ;
cJSON* lex2Json(const SentimentResponse& sentiment) ;
cJSON* lex2Json(const Intent& intent) ;
cJSON* lex2Json(const DialogAction& dialogAction);
cJSON* lex2Json(const ActiveContext& context);
cJSON* lex2Json(const SentimentScore& score);
cJSON* lex2Json(const ActiveContextTimeToLive& ttl);
cJSON* lex2Json(const Slot& slot);
cJSON* lex2Json(const Value& value) ;

/* error reporting */
cJSON* lex2Json(const Aws::Client::AWSError<LexRuntimeV2Errors>& err);

#endif
|
||||
8
mod_aws_transcribe/LICENSE
Normal file
8
mod_aws_transcribe/LICENSE
Normal file
@@ -0,0 +1,8 @@
|
||||
Copyright 2023, Drachtio Communications Services, LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
10
mod_aws_transcribe/Makefile.am
Normal file
10
mod_aws_transcribe/Makefile.am
Normal file
@@ -0,0 +1,10 @@
|
||||
# Automake fragment for mod_aws_transcribe (built in-tree with FreeSWITCH).
include $(top_srcdir)/build/modmake.rulesam
MODNAME=mod_aws_transcribe

mod_LTLIBRARIES = mod_aws_transcribe.la
mod_aws_transcribe_la_SOURCES = mod_aws_transcribe.c aws_transcribe_glue.cpp
mod_aws_transcribe_la_CFLAGS = $(AM_CFLAGS)
# C++11 is required by the AWS C++ SDK; include paths point at the in-tree
# aws-sdk-cpp checkout and its dependency install prefix.
mod_aws_transcribe_la_CXXFLAGS = $(AM_CXXFLAGS) -std=c++11 -I${switch_srcdir}/libs/aws-sdk-cpp/aws-cpp-sdk-core/include -I${switch_srcdir}/libs/aws-sdk-cpp/aws-cpp-sdk-transcribestreaming/include -I${switch_srcdir}/libs/aws-sdk-cpp/build/.deps/install/include

mod_aws_transcribe_la_LIBADD = $(switch_builddir)/libfreeswitch.la
# Link the transcribe-streaming SDK plus its C runtime dependencies
# (event-stream framing, checksums, common), then curl/openssl/zlib.
mod_aws_transcribe_la_LDFLAGS = -avoid-version -module -no-undefined -L${switch_srcdir}/libs/aws-sdk-cpp/build/.deps/install/lib -L${switch_srcdir}/libs/aws-sdk-cpp/build/aws-cpp-sdk-core -L${switch_srcdir}/libs/aws-sdk-cpp/build/aws-cpp-sdk-transcribestreaming -laws-cpp-sdk-transcribestreaming -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common -lpthread -lcurl -lcrypto -lssl -lz
||||
58
mod_aws_transcribe/README.md
Normal file
58
mod_aws_transcribe/README.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# mod_aws_transcribe
|
||||
|
||||
A Freeswitch module that generates real-time transcriptions on a Freeswitch channel by using AWS streaming transcription API
|
||||
|
||||
## API
|
||||
|
||||
### Commands
|
||||
The freeswitch module exposes the following API commands:
|
||||
|
||||
```
|
||||
aws_transcribe <uuid> start <lang-code> [interim]
|
||||
```
|
||||
Attaches media bug to channel and performs streaming recognize request.
|
||||
- `uuid` - unique identifier of Freeswitch channel
|
||||
- `lang-code` - a valid AWS [language code](https://docs.aws.amazon.com/transcribe/latest/dg/what-is-transcribe.html) that is supported for streaming transcription
|
||||
- `interim` - If the 'interim' keyword is present then both interim and final transcription results will be returned; otherwise only final transcriptions will be returned
|
||||
|
||||
```
|
||||
aws_transcribe <uuid> stop
|
||||
```
|
||||
Stop transcription on the channel.
|
||||
|
||||
### Authentication
|
||||
The plugin will first look for channel variables, then environment variables. If neither are found, then the default AWS profile on the server will be used.
|
||||
|
||||
The names of the channel variables and environment variables are:
|
||||
|
||||
| variable | Description |
|
||||
| --- | ----------- |
|
||||
| AWS_ACCESS_KEY_ID | The Aws access key ID |
|
||||
| AWS_SECRET_ACCESS_KEY | The Aws secret access key |
|
||||
| AWS_REGION | The Aws region |
|
||||
|
||||
|
||||
### Events
|
||||
`aws_transcribe::transcription` - returns an interim or final transcription. The event contains a JSON body describing the transcription result:
|
||||
```js
|
||||
[
|
||||
{
|
||||
"is_final": true,
|
||||
"alternatives": [{
|
||||
"transcript": "Hello. Can you hear me?"
|
||||
}]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
## Usage
|
||||
When using [drachtio-fsmrf](https://www.npmjs.com/package/drachtio-fsmrf), you can access this API command via the api method on the 'endpoint' object.
|
||||
```js
|
||||
ep.api('aws_transcribe', `${ep.uuid} start en-US interim`);
|
||||
```
|
||||
|
||||
## Building
|
||||
You will need to build the AWS C++ SDK. You can use [this ansible role](https://github.com/davehorton/ansible-role-fsmrf), or refer to the specific steps [here](https://github.com/davehorton/ansible-role-fsmrf/blob/a1947cc24e89dee7d6b42053c53295f9198340c1/tasks/grpc.yml#L28).
|
||||
|
||||
## Examples
|
||||
[aws_transcribe.js](../../examples/aws_transcribe.js)
|
||||
594
mod_aws_transcribe/aws_transcribe_glue.cpp
Normal file
594
mod_aws_transcribe/aws_transcribe_glue.cpp
Normal file
@@ -0,0 +1,594 @@
|
||||
#include <cstdlib>
|
||||
|
||||
#include <switch.h>
|
||||
#include <switch_json.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <condition_variable>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <deque>
|
||||
|
||||
#include <aws/core/Aws.h>
|
||||
#include <aws/core/auth/AWSCredentialsProvider.h>
|
||||
#include <aws/core/client/ClientConfiguration.h>
|
||||
#include <aws/core/utils/logging/DefaultLogSystem.h>
|
||||
#include <aws/core/utils/logging/AWSLogging.h>
|
||||
#include <aws/transcribestreaming/TranscribeStreamingServiceClient.h>
|
||||
#include <aws/transcribestreaming/model/StartStreamTranscriptionHandler.h>
|
||||
#include <aws/transcribestreaming/model/StartStreamTranscriptionRequest.h>
|
||||
|
||||
#include "mod_aws_transcribe.h"
|
||||
#include "simple_buffer.h"
|
||||
|
||||
#define BUFFER_SECS (3)
|
||||
#define CHUNKSIZE (320)
|
||||
|
||||
using namespace Aws;
|
||||
using namespace Aws::Utils;
|
||||
using namespace Aws::Auth;
|
||||
using namespace Aws::TranscribeStreamingService;
|
||||
using namespace Aws::TranscribeStreamingService::Model;
|
||||
|
||||
|
||||
const char ALLOC_TAG[] = "drachtio";
|
||||
|
||||
static bool hasDefaultCredentials = false;
|
||||
|
||||
/**
 * GStreamer - manages one streaming transcription session to AWS Transcribe
 * for a single FreeSWITCH channel.
 *
 * Threading model (as visible in this file): audio arrives on the media-bug
 * thread via write(); a dedicated service thread blocks in processData();
 * AWS SDK callbacks (transcript events, final response) fire on SDK threads.
 * All shared state is guarded by m_mutex / m_cond.
 */
class GStreamer {
public:
  GStreamer(
    const char *sessionId,
    const char *bugname,
    u_int16_t channels,
    char *lang,
    int interim,
    uint32_t samples_per_second,
    const char* region,
    const char* awsAccessKeyId,
    const char* awsSecretAccessKey,
    responseHandler_t responseHandler
  ) : m_sessionId(sessionId), m_bugname(bugname), m_finished(false), m_interim(interim), m_finishing(false), m_connected(false), m_connecting(false),
    m_packets(0), m_responseHandler(responseHandler), m_pStream(nullptr),
    // pre-connect ring buffer: one 20ms frame is 320 bytes at 8kHz (640 at 16kHz); hold up to 15 chunks
    m_audioBuffer(320 * (samples_per_second == 8000 ? 1 : 2), 15) {
    Aws::String key(awsAccessKeyId);
    Aws::String secret(awsSecretAccessKey);
    Aws::Client::ClientConfiguration config;
    if (region != nullptr && strlen(region) > 0) config.region = region;
    char keySnippet[20];

    // log only the first 4 chars of the access key, mask the rest
    // NOTE(review): assumes awsAccessKeyId has at least 4 chars when non-empty — confirm
    strncpy(keySnippet, awsAccessKeyId, 4);
    for (int i = 4; i < 20; i++) keySnippet[i] = 'x';
    keySnippet[19] = '\0';

    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p ACCESS_KEY_ID %s, region %s\n", this, keySnippet, region);
    // explicit credentials if both are non-empty, otherwise fall back to the
    // SDK default credentials chain (profile / instance role)
    if (*awsAccessKeyId && *awsSecretAccessKey) {
      m_client = Aws::MakeUnique<TranscribeStreamingServiceClient>(ALLOC_TAG, AWSCredentials(awsAccessKeyId, awsSecretAccessKey), config);
    }
    else {
      m_client = Aws::MakeUnique<TranscribeStreamingServiceClient>(ALLOC_TAG, config);
    }

    // transcript events are handed off to the processData() thread via
    // m_transcript under the mutex; locating the session here serves as a
    // liveness check before notifying
    m_handler.SetTranscriptEventCallback([this](const TranscriptEvent& ev)
    {
      switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
      if (psession) {
        switch_channel_t* channel = switch_core_session_get_channel(psession);
        std::lock_guard<std::mutex> lk(m_mutex);
        m_transcript = ev;
        m_cond.notify_one();

        switch_core_session_rwunlock(psession);
      }
    });

    // not worth resampling to 16k if we get 8k ulaw or alaw in..
    m_request.SetMediaSampleRateHertz(samples_per_second > 8000 ? 16000 : 8000);
    m_request.SetLanguageCode(LanguageCodeMapper::GetLanguageCodeForName(lang));
    m_request.SetMediaEncoding(MediaEncoding::pcm);
    m_request.SetEventStreamHandler(m_handler);
    if (channels > 1) m_request.SetNumberOfChannels(channels);

    // optional request tuning via channel variables (presence of the first
    // two enables the feature regardless of the variable's value)
    const char* var;
    switch_core_session_t* session = switch_core_session_locate(sessionId);
    switch_channel_t *channel = switch_core_session_get_channel(session);

    if (var = switch_channel_get_variable(channel, "AWS_SHOW_SPEAKER_LABEL")) {
      m_request.SetShowSpeakerLabel(true);
    }
    if (var = switch_channel_get_variable(channel, "AWS_ENABLE_CHANNEL_IDENTIFICATION")) {
      m_request.SetEnableChannelIdentification(true);
    }
    if (var = switch_channel_get_variable(channel, "AWS_VOCABULARY_NAME")) {
      m_request.SetVocabularyName(var);
    }
    if (var = switch_channel_get_variable(channel, "AWS_VOCABULARY_FILTER_NAME")) {
      m_request.SetVocabularyFilterName(var);
    }
    if (var = switch_channel_get_variable(channel, "AWS_VOCABULARY_FILTER_METHOD")) {
      m_request.SetVocabularyFilterMethod(VocabularyFilterMethodMapper::GetVocabularyFilterMethodForName(var));
    }
    switch_core_session_rwunlock(session);
  }

  /**
   * Kick off the async streaming request. Idempotent: subsequent calls are
   * no-ops once m_connecting is set. May be deferred until VAD fires.
   */
  void connect() {
    if (m_connecting) return;
    m_connecting = true;

    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer:connect %p connecting to aws speech..\n", this);

    // invoked by the SDK once the bidirectional stream is open; flushes any
    // audio queued in m_audioBuffer while we were connecting
    auto OnStreamReady = [this](Model::AudioStream& stream)
    {
      switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
      if (psession) {
        switch_channel_t* channel = switch_core_session_get_channel(psession);

        m_pStream = &stream;
        m_connected = true;

        // send any buffered audio
        int nFrames = m_audioBuffer.getNumItems();
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p got stream ready, %d buffered frames\n", this, nFrames);
        if (nFrames) {
          char *p;
          do {
            p = m_audioBuffer.getNextChunk();
            if (p) {
              write(p, CHUNKSIZE);
            }
          } while (p);
        }

        switch_core_session_rwunlock(psession);
      }
    };
    // invoked by the SDK when the stream terminates (normally or with error);
    // sets m_finished so processData() can unblock and return
    auto OnResponseCallback = [this](const TranscribeStreamingServiceClient* pClient,
      const Model::StartStreamTranscriptionRequest& request,
      const Model::StartStreamTranscriptionOutcome& outcome,
      const std::shared_ptr<const Aws::Client::AsyncCallerContext>& context)
    {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p stream got final response\n", this);
      switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
      if (psession) {
        if (!outcome.IsSuccess()) {
          // surface the AWS error to the application as a JSON error event
          const TranscribeStreamingServiceError& err = outcome.GetError();
          auto message = err.GetMessage();
          auto exception = err.GetExceptionName();
          cJSON* json = cJSON_CreateObject();
          cJSON_AddStringToObject(json, "type", "error");
          cJSON_AddStringToObject(json, "error", message.c_str());
          char* jsonString = cJSON_PrintUnformatted(json);
          m_responseHandler(psession, jsonString, m_bugname.c_str());
          free(jsonString);
          cJSON_Delete(json);
          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p stream got error response %s : %s\n", this, message.c_str(), exception.c_str());
        }

        std::lock_guard<std::mutex> lk(m_mutex);
        m_finished = true;
        m_cond.notify_one();

        switch_core_session_rwunlock(psession);
      } else {
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p session is closed/hungup. Need to unblock thread.\n", this);
        std::lock_guard<std::mutex> lk(m_mutex);
        m_finished = true;
        m_cond.notify_one();
      }
    };

    m_client->StartStreamTranscriptionAsync(m_request, OnStreamReady, OnResponseCallback, nullptr);
  }

  ~GStreamer() {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::~GStreamer wrote %u packets %p\n", m_packets, this);
  }

  /**
   * Queue one chunk of PCM audio. Before the stream is up, whole CHUNKSIZE
   * frames are held in m_audioBuffer; afterwards chunks are queued on
   * m_deqAudio for the processData() thread to send. Returns false once
   * shutdown has begun.
   */
  bool write(void* data, uint32_t datalen) {
    if (m_finishing || m_finished) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::write not writing because we are finished, %p\n", this);
      return false;
    }
    if (!m_connected) {
      // only buffer exact multiples of the chunk size; others are dropped
      if (datalen % CHUNKSIZE == 0) {
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::write queuing %d bytes\n", datalen);
        m_audioBuffer.add(data, datalen);
      }
      return true;
    }

    std::lock_guard<std::mutex> lk(m_mutex);

    const auto beg = static_cast<const unsigned char*>(data);
    const auto end = beg + datalen;
    Aws::Vector<unsigned char> bits { beg, end };
    m_deqAudio.push_back(bits);
    m_packets++;

    m_cond.notify_one();

    return true;
  }

  /**
   * Begin an orderly shutdown: flag m_finishing and wake processData(),
   * which will flush/close the stream. Idempotent.
   */
  void finish() {
    if (m_finishing) return;
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::finish %p\n", this);
    std::lock_guard<std::mutex> lk(m_mutex);

    m_finishing = true;
    m_cond.notify_one();
  }

  /**
   * Service loop; blocks until the AWS stream reports completion (m_finished).
   * Wakes on: queued audio, a pending transcript event, finish(), or stream
   * end. Emits transcripts via m_responseHandler, forwards audio to the
   * stream, and on finish() flushes and closes the stream exactly once.
   */
  void processData() {
    bool shutdownInitiated = false;
    while (true) {
      std::unique_lock<std::mutex> lk(m_mutex);
      m_cond.wait(lk, [&, this] {
        return (!m_deqAudio.empty() && !m_finishing) || m_transcript.TranscriptHasBeenSet() || m_finished || (m_finishing && !shutdownInitiated);
      });

      // we have data to process or have been told we're done
      if (m_finished || !m_connected) return;

      if (m_transcript.TranscriptHasBeenSet()) {
        switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
        if (psession) {

          // build a JSON array of results by hand; each result carries
          // is_final plus its alternatives
          // NOTE(review): transcripts are not JSON-escaped here — embedded
          // quotes/backslashes in a transcript would break the payload
          bool isFinal = false;
          std::ostringstream s;
          s << "[";
          for (auto&& r : m_transcript.GetTranscript().GetResults()) {
            int count = 0;
            std::ostringstream t1;
            if (!isFinal && !r.GetIsPartial()) isFinal = true;
            t1 << "{\"is_final\": " << (r.GetIsPartial() ? "false" : "true") << ", \"alternatives\": [";
            for (auto&& alt : r.GetAlternatives()) {
              std::ostringstream t2;
              if (count++ == 0) t2 << "{\"transcript\": \"" << alt.GetTranscript() << "\"}";
              else t2 << ", {\"transcript\": \"" << alt.GetTranscript() << "\"}";
              t1 << t2.str();
            }
            t1 << "]}";
            s << t1.str();
          }
          s << "]";
          // suppress empty result sets, and interim results unless requested
          if (0 != s.str().compare("[]") && (isFinal || m_interim)) {
            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::writing transcript %p: %s\n", this, s.str().c_str() );
            m_responseHandler(psession, s.str().c_str(), m_bugname.c_str());
          }
          // reset the pending-transcript flag by swapping in a fresh event
          TranscriptEvent empty;
          m_transcript = empty;

          switch_core_session_rwunlock(psession);
        }
      }
      if (m_finishing) {
        // close the stream once; loop continues until the SDK's final
        // response callback sets m_finished
        shutdownInitiated = true;
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::writing disconnect event %p\n", this);

        if (m_pStream) {
          m_pStream->flush();
          m_pStream->Close();
          m_pStream = nullptr;
        }
      }
      else {
        // send out any queued speech packets
        while (!m_deqAudio.empty()) {
          Aws::Vector<unsigned char>& bits = m_deqAudio.front();
          Aws::TranscribeStreamingService::Model::AudioEvent event(std::move(bits));
          m_pStream->WriteAudioEvent(event);
          m_deqAudio.pop_front();
        }
      }
    }
  }

  // true once connect() has been called (whether or not the stream is up yet)
  bool isConnecting() {
    return m_connecting;
  }

private:
  std::string m_sessionId;          // FreeSWITCH channel uuid
  std::string m_bugname;            // media bug name, echoed to responseHandler
  std::string m_region;             // NOTE(review): never assigned; region lives in ClientConfiguration
  Aws::UniquePtr<TranscribeStreamingServiceClient> m_client;
  AudioStream* m_pStream;           // owned by the SDK; valid only while connected
  StartStreamTranscriptionRequest m_request;
  StartStreamTranscriptionHandler m_handler;
  TranscriptEvent m_transcript;     // latest pending transcript (guarded by m_mutex)
  responseHandler_t m_responseHandler;
  bool m_finishing;                 // finish() requested
  bool m_interim;                   // deliver interim results too
  bool m_finished;                  // stream fully closed by the SDK
  bool m_connected;                 // stream is open, m_pStream valid
  bool m_connecting;                // connect() has been called
  uint32_t m_packets;               // count of audio chunks queued
  std::mutex m_mutex;
  std::condition_variable m_cond;
  std::deque< Aws::Vector<unsigned char> > m_deqAudio;  // audio awaiting send
  SimpleBuffer m_audioBuffer;       // pre-connect audio holding buffer
};
|
||||
|
||||
/**
 * Service thread for one transcription session: constructs the GStreamer,
 * optionally defers the AWS connection until VAD fires, then blocks in
 * processData() until the stream completes. Owns and deletes the streamer.
 */
static void *SWITCH_THREAD_FUNC aws_transcribe_thread(switch_thread_t *thread, void *obj) {
  struct cap_cb *cb = (struct cap_cb *) obj;
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "transcribe_thread: starting cb %p\n", (void *) cb);
  // (removed unused local `bool ok`)
  GStreamer* pStreamer = new GStreamer(cb->sessionId, cb->bugname, cb->channels, cb->lang, cb->interim, cb->samples_per_second, cb->region, cb->awsAccessKeyId, cb->awsSecretAccessKey,
    cb->responseHandler);
  if (!pStreamer) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "transcribe_thread: Error allocating streamer\n");
    return nullptr;
  }
  // when VAD gating is enabled, connect() happens later from the frame callback
  if (!cb->vad) pStreamer->connect();
  cb->streamer = pStreamer;
  pStreamer->processData(); //blocks until done

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "transcribe_thread: stopping cb %p\n", (void *) cb);
  delete pStreamer;
  cb->streamer = nullptr;
  return nullptr;
}
|
||||
|
||||
static void killcb(struct cap_cb* cb) {
|
||||
if (cb) {
|
||||
if (cb->streamer) {
|
||||
GStreamer* p = (GStreamer *) cb->streamer;
|
||||
delete p;
|
||||
cb->streamer = nullptr;
|
||||
}
|
||||
if (cb->resampler) {
|
||||
speex_resampler_destroy(cb->resampler);
|
||||
cb->resampler = nullptr;
|
||||
}
|
||||
if (cb->vad) {
|
||||
switch_vad_destroy(&cb->vad);
|
||||
cb->vad = nullptr;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
/**
 * Module-load initialization: record whether default AWS credentials are
 * available in the environment, then initialize the AWS SDK. Always returns
 * SWITCH_STATUS_SUCCESS.
 */
switch_status_t aws_transcribe_init() {
  const char* accessKeyId = std::getenv("AWS_ACCESS_KEY_ID");
  const char* secretAccessKey = std::getenv("AWS_SECRET_ACCESS_KEY");
  // (removed unused local `region`; AWS_REGION is read per-session later)
  if (NULL == accessKeyId && NULL == secretAccessKey) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE,
      "\"AWS_ACCESS_KEY_ID\" and/or \"AWS_SECRET_ACCESS_KEY\" env var not set; authentication will expect channel variables of same names to be set\n");
  }
  else {
    hasDefaultCredentials = true;
  }
  Aws::SDKOptions options;
  /* uncomment to enable AWS SDK trace logging:
  options.loggingOptions.logLevel = Aws::Utils::Logging::LogLevel::Trace;

  Aws::Utils::Logging::InitializeAWSLogging(
    Aws::MakeShared<Aws::Utils::Logging::DefaultLogSystem>(
      ALLOC_TAG, Aws::Utils::Logging::LogLevel::Trace, "aws_sdk_transcribe"));
  */
  Aws::InitAPI(options);

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// Module-unload teardown: shut down the AWS SDK. Mirrors aws_transcribe_init.
switch_status_t aws_transcribe_cleanup() {
  Aws::SDKOptions sdkOptions;
  /*
  sdkOptions.loggingOptions.logLevel = Aws::Utils::Logging::LogLevel::Trace;
  Aws::Utils::Logging::ShutdownAWSLogging();
  */
  Aws::ShutdownAPI(sdkOptions);
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// start transcribe on a channel
|
||||
switch_status_t aws_transcribe_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
|
||||
uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char* bugname, void **ppUserData
|
||||
) {
|
||||
switch_status_t status = SWITCH_STATUS_SUCCESS;
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
int err;
|
||||
switch_threadattr_t *thd_attr = NULL;
|
||||
switch_memory_pool_t *pool = switch_core_session_get_pool(session);
|
||||
auto read_codec = switch_core_session_get_read_codec(session);
|
||||
uint32_t sampleRate = read_codec->implementation->actual_samples_per_second;
|
||||
|
||||
struct cap_cb* cb = (struct cap_cb *) switch_core_session_alloc(session, sizeof(*cb));
|
||||
memset(cb, sizeof(cb), 0);
|
||||
const char* awsAccessKeyId = switch_channel_get_variable(channel, "AWS_ACCESS_KEY_ID");
|
||||
const char* awsSecretAccessKey = switch_channel_get_variable(channel, "AWS_SECRET_ACCESS_KEY");
|
||||
const char* awsRegion = switch_channel_get_variable(channel, "AWS_REGION");
|
||||
cb->channels = channels;
|
||||
LanguageCode code = LanguageCodeMapper::GetLanguageCodeForName(lang);
|
||||
if(LanguageCode::NOT_SET == code) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "invalid language code %s\n", lang);
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
strncpy(cb->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
|
||||
strncpy(cb->bugname, bugname, MAX_BUG_LEN);
|
||||
|
||||
if (awsAccessKeyId && awsSecretAccessKey && awsRegion) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Using channel vars for aws authentication\n");
|
||||
strncpy(cb->awsAccessKeyId, awsAccessKeyId, 128);
|
||||
strncpy(cb->awsSecretAccessKey, awsSecretAccessKey, 128);
|
||||
strncpy(cb->region, awsRegion, MAX_REGION);
|
||||
|
||||
}
|
||||
else if (std::getenv("AWS_ACCESS_KEY_ID") &&
|
||||
std::getenv("AWS_SECRET_ACCESS_KEY") &&
|
||||
std::getenv("AWS_REGION")) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Using env vars for aws authentication\n");
|
||||
strncpy(cb->awsAccessKeyId, std::getenv("AWS_ACCESS_KEY_ID"), 128);
|
||||
strncpy(cb->awsSecretAccessKey, std::getenv("AWS_SECRET_ACCESS_KEY"), 128);
|
||||
strncpy(cb->region, std::getenv("AWS_REGION"), MAX_REGION);
|
||||
}
|
||||
else {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "No channel vars or env vars for aws authentication..will use default profile if found\n");
|
||||
}
|
||||
|
||||
cb->responseHandler = responseHandler;
|
||||
|
||||
if (switch_mutex_init(&cb->mutex, SWITCH_MUTEX_NESTED, pool) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing mutex\n");
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
|
||||
cb->interim = interim;
|
||||
strncpy(cb->lang, lang, MAX_LANG);
|
||||
cb->samples_per_second = sampleRate;
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "sample rate of rtp stream is %d\n", samples_per_second);
|
||||
if (sampleRate != 8000) {
|
||||
cb->resampler = speex_resampler_init(1, sampleRate, 16000, SWITCH_RESAMPLE_QUALITY, &err);
|
||||
if (0 != err) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing resampler: %s.\n",
|
||||
switch_channel_get_name(channel), speex_resampler_strerror(err));
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
// allocate vad if we are delaying connecting to the recognizer until we detect speech
|
||||
if (switch_channel_var_true(channel, "START_RECOGNIZING_ON_VAD")) {
|
||||
cb->vad = switch_vad_init(sampleRate, 1);
|
||||
if (cb->vad) {
|
||||
const char* var;
|
||||
int mode = 2;
|
||||
int silence_ms = 150;
|
||||
int voice_ms = 250;
|
||||
int debug = 0;
|
||||
|
||||
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_MODE")) {
|
||||
mode = atoi(var);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_SILENCE_MS")) {
|
||||
silence_ms = atoi(var);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_VOICE_MS")) {
|
||||
voice_ms = atoi(var);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_DEBUG")) {
|
||||
debug = atoi(var);
|
||||
}
|
||||
switch_vad_set_mode(cb->vad, mode);
|
||||
switch_vad_set_param(cb->vad, "silence_ms", silence_ms);
|
||||
switch_vad_set_param(cb->vad, "voice_ms", voice_ms);
|
||||
switch_vad_set_param(cb->vad, "debug", debug);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: delaying connection until vad, voice_ms %d, mode %d\n",
|
||||
switch_channel_get_name(channel), voice_ms, mode);
|
||||
}
|
||||
}
|
||||
|
||||
// create a thread to service the http/2 connection to aws
|
||||
switch_threadattr_create(&thd_attr, pool);
|
||||
switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE);
|
||||
switch_thread_create(&cb->thread, thd_attr, aws_transcribe_thread, cb, pool);
|
||||
|
||||
*ppUserData = cb;
|
||||
|
||||
done:
|
||||
return status;
|
||||
}
|
||||
|
||||
/**
 * Stop transcription on a channel: signal the streamer to finish, join the
 * service thread (which drains final responses), free per-session resources,
 * and remove the media bug unless the channel itself is closing.
 *
 * @return SWITCH_STATUS_SUCCESS, or SWITCH_STATUS_FALSE if no bug attached
 */
switch_status_t aws_transcribe_session_stop(switch_core_session_t *session, int channelIsClosing, char* bugname) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);

  if (bug) {
    struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);
    switch_status_t st;

    // close connection and get final responses
    // holding cb->mutex here also keeps aws_transcribe_frame (which only
    // trylocks) from touching the streamer while we tear it down
    switch_mutex_lock(cb->mutex);
    GStreamer* streamer = (GStreamer *) cb->streamer;
    if (streamer) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "aws_transcribe_session_stop: finish..%s\n", bugname);
      streamer->finish();
    }
    // join the service thread; it deletes the streamer on its way out
    if (cb->thread) {
      switch_status_t retval;
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aws_transcribe_session_stop: waiting for read thread to complete %s\n", bugname);
      switch_thread_join(&retval, cb->thread);
      cb->thread = NULL;
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aws_transcribe_session_stop: read thread completed %s, %d\n", bugname, retval);
    }
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aws_transcribe_session_stop: bugname - %s; going to kill callback\n", bugname);
    killcb(cb);
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aws_transcribe_session_stop: bugname - %s; killed callback\n", bugname);

    switch_channel_set_private(channel, bugname, NULL);
    if (!channelIsClosing) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aws_transcribe_session_stop: removing bug %s\n", bugname);
      switch_core_media_bug_remove(session, &bug);
    }

    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aws_transcribe_session_stop: bugname - %s; unlocking callback mutex\n", bugname);
    switch_mutex_unlock(cb->mutex);
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aws_transcribe_session_stop: Closed aws session\n");

    return SWITCH_STATUS_SUCCESS;
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
  return SWITCH_STATUS_FALSE;
}
|
||||
|
||||
/**
 * Media-bug read callback: drain available audio frames from the bug, run
 * optional VAD gating (connecting to AWS on first detected speech), resample
 * to 16k when needed, and hand PCM chunks to the streamer. Uses trylock so
 * the RTP path never blocks on teardown; frames are skipped if the mutex is
 * held. Always returns SWITCH_TRUE to keep the bug alive.
 */
switch_bool_t aws_transcribe_frame(switch_media_bug_t *bug, void* user_data) {
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);
  uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
  switch_frame_t frame = {};
  struct cap_cb *cb = (struct cap_cb *) user_data;

  frame.data = data;
  frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;

  if (switch_mutex_trylock(cb->mutex) == SWITCH_STATUS_SUCCESS) {
    GStreamer* streamer = (GStreamer *) cb->streamer;
    if (streamer) {
      // read until the bug has no more data; skip comfort-noise frames
      while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS && !switch_test_flag((&frame), SFF_CNG)) {
        if (frame.datalen) {
          spx_int16_t out[SWITCH_RECOMMENDED_BUFFER_SIZE];
          spx_uint32_t out_len = SWITCH_RECOMMENDED_BUFFER_SIZE;
          spx_uint32_t in_len = frame.samples;
          size_t written;

          // VAD gating: defer the AWS connection until speech is detected
          if (cb->vad && !streamer->isConnecting()) {
            switch_vad_state_t state = switch_vad_process(cb->vad, (int16_t*) frame.data, frame.samples);
            if (state == SWITCH_VAD_STATE_START_TALKING) {
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "detected speech, connect to aws speech now\n");
              streamer->connect();
              cb->responseHandler(session, "vad_detected", cb->bugname);
            }
          }

          if (cb->resampler) {
            // resample (mono) to 16kHz before sending
            speex_resampler_process_interleaved_int(cb->resampler, (const spx_int16_t *) frame.data, (spx_uint32_t *) &in_len, &out[0], &out_len);
            streamer->write( &out[0], sizeof(spx_int16_t) * out_len);
          }
          else {
            streamer->write( frame.data, sizeof(spx_int16_t) * frame.samples);
          }
        }
      }
    }
    else {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG,
        "aws_transcribe_frame: not sending audio because aws channel has been closed\n");
    }
    switch_mutex_unlock(cb->mutex);
  }
  return SWITCH_TRUE;
}
|
||||
}
|
||||
11
mod_aws_transcribe/aws_transcribe_glue.h
Normal file
11
mod_aws_transcribe/aws_transcribe_glue.h
Normal file
@@ -0,0 +1,11 @@
|
||||
#ifndef __AWS_GLUE_H__
#define __AWS_GLUE_H__

/*
 * aws_transcribe_glue.h - C interface exposed by the C++ glue layer to
 * mod_aws_transcribe.c. Lifecycle: module-level init/cleanup wrap the AWS
 * SDK; session init/stop manage per-channel transcription; the frame
 * callback feeds media-bug audio to the recognizer.
 */

switch_status_t aws_transcribe_init();
switch_status_t aws_transcribe_cleanup();
switch_status_t aws_transcribe_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
  uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char *bugname, void **ppUserData);
switch_status_t aws_transcribe_session_stop(switch_core_session_t *session, int channelIsClosing, char* bugname);
switch_bool_t aws_transcribe_frame(switch_media_bug_t *bug, void* user_data);

#endif
|
||||
240
mod_aws_transcribe/mod_aws_transcribe.c
Normal file
240
mod_aws_transcribe/mod_aws_transcribe.c
Normal file
@@ -0,0 +1,240 @@
|
||||
/*
|
||||
*
|
||||
* mod_aws_transcribe.c -- Freeswitch module for using aws streaming transcribe api
|
||||
*
|
||||
*/
|
||||
#include "mod_aws_transcribe.h"
|
||||
#include "aws_transcribe_glue.h"
|
||||
|
||||
/* Prototypes */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_aws_transcribe_shutdown);
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_aws_transcribe_load);
|
||||
|
||||
SWITCH_MODULE_DEFINITION(mod_aws_transcribe, mod_aws_transcribe_load, mod_aws_transcribe_shutdown, NULL);
|
||||
|
||||
static switch_status_t do_stop(switch_core_session_t *session, char* bugname);
|
||||
|
||||
/*
 * Fire a custom event toward the application for a transcription result or a
 * well-known notification keyword ("vad_detected", "end_of_transcript", ...).
 * JSON payloads are inspected for {"type":"error"} to pick the error subclass;
 * only JSON payloads carry an event body.
 *
 * Fix: initialize `event` to NULL and bail out if event creation fails --
 * the original dereferenced an uninitialized pointer on that path.  The
 * common tail (channel data, vendor header, bugname, fire) is also factored
 * out of the per-branch duplication.
 */
static void responseHandler(switch_core_session_t* session, const char * json, const char* bugname) {
  switch_event_t *event = NULL;
  switch_channel_t *channel = switch_core_session_get_channel(session);
  int addBody = 0;

  if (0 == strcmp("vad_detected", json)) {
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_VAD_DETECTED);
  }
  else if (0 == strcmp("end_of_transcript", json)) {
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_END_OF_TRANSCRIPT);
  }
  else if (0 == strcmp("max_duration_exceeded", json)) {
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED);
  }
  else if (0 == strcmp("no_audio", json)) {
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_NO_AUDIO_DETECTED);
  }
  else {
    /* a JSON payload: error events are flagged by a "type":"error" property */
    int error = 0;
    cJSON* jMessage = cJSON_Parse(json);
    if (jMessage) {
      const char* type = cJSON_GetStringValue(cJSON_GetObjectItem(jMessage, "type"));
      if (type && 0 == strcmp(type, "error")) {
        error = 1;
      }
      cJSON_Delete(jMessage);
    }
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM,
      error ? TRANSCRIBE_EVENT_ERROR : TRANSCRIBE_EVENT_RESULTS);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "json payload: %s.\n", json);
    addBody = 1;
  }

  if (!event) {
    /* event creation failed -- do not touch the (NULL) event */
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "responseHandler: failed to create event\n");
    return;
  }

  switch_channel_event_set_data(channel, event);
  switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "aws");
  if (addBody) {
    switch_event_add_body(event, "%s", json);
  }
  if (bugname) switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "media-bugname", bugname);
  switch_event_fire(&event);
}
|
||||
|
||||
|
||||
/*
 * Media-bug callback: dispatches on the bug lifecycle event type.
 * INIT is only logged; CLOSE tears down the transcribe session; READ
 * forwards the audio frame to the AWS glue layer.
 */
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
{
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);

  if (SWITCH_ABC_TYPE_INIT == type) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_INIT.\n");
  }
  else if (SWITCH_ABC_TYPE_CLOSE == type) {
    struct cap_cb* cb = (struct cap_cb*) switch_core_media_bug_get_user_data(bug);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_CLOSE.\n");
    /* channelIsClosing = 1: the bug is being removed because the channel is going away */
    aws_transcribe_session_stop(session, 1, cb->bugname);
  }
  else if (SWITCH_ABC_TYPE_READ == type) {
    /* READ returns whatever the glue layer decides (keep/stop the bug) */
    return aws_transcribe_frame(bug, user_data);
  }
  /* SWITCH_ABC_TYPE_WRITE and anything else: no-op */

  return SWITCH_TRUE;
}
|
||||
|
||||
/*
 * Attach a media bug to the channel and start an AWS transcribe session.
 * Any prior transcribe registered under the same bugname is stopped first.
 */
static switch_status_t start_capture(switch_core_session_t *session, switch_media_bug_flag_t flags,
  char* lang, int interim, char* bugname)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_codec_implementation_t read_impl = { 0 };
  switch_media_bug_t *bug = NULL;
  switch_status_t rc;
  void *user_data = NULL;
  uint32_t sample_rate;

  /* tear down any previous transcribe on this channel/bugname */
  if (switch_channel_get_private(channel, bugname)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing bug from previous transcribe\n");
    do_stop(session, bugname);
  }

  switch_core_session_get_read_impl(session, &read_impl);

  if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
    return SWITCH_STATUS_FALSE;
  }

  /* g722 is special-cased: use the codec's actual sample rate, not the nominal one */
  sample_rate = !strcasecmp(read_impl.iananame, "g722") ?
    read_impl.actual_samples_per_second : read_impl.samples_per_second;

  if (SWITCH_STATUS_FALSE == aws_transcribe_session_init(session, responseHandler, sample_rate,
      flags & SMBF_STEREO ? 2 : 1, lang, interim, bugname, &user_data)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing aws speech session.\n");
    return SWITCH_STATUS_FALSE;
  }

  rc = switch_core_media_bug_add(session, bugname, NULL, capture_callback, user_data, 0, flags, &bug);
  if (rc != SWITCH_STATUS_SUCCESS) {
    return rc;
  }

  /* remember the bug so a later stop (or restart) can find it */
  switch_channel_set_private(channel, bugname, bug);
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "added media bug for aws transcribe\n");

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/*
 * Stop an active transcribe session identified by bugname.
 * Succeeds (no-op) when no such bug is registered on the channel.
 */
static switch_status_t do_stop(switch_core_session_t *session, char* bugname)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = switch_channel_get_private(channel, bugname);
  switch_status_t rc = SWITCH_STATUS_SUCCESS;

  if (!bug) {
    return rc;   /* nothing registered under this bugname */
  }

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Received user command command to stop transcribe on %s.\n", bugname);
  rc = aws_transcribe_session_stop(session, 0, bugname);
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "stopped transcribe.\n");

  return rc;
}
|
||||
|
||||
#define TRANSCRIBE_API_SYNTAX "<uuid> [start|stop] lang-code [interim] [stereo|mono] [bugname]"
|
||||
SWITCH_STANDARD_API(aws_transcribe_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[6] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
switch_media_bug_flag_t flags = SMBF_READ_STREAM /* | SMBF_WRITE_STREAM | SMBF_READ_PING */;
|
||||
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) ||
|
||||
(!strcasecmp(argv[1], "stop") && argc < 2) ||
|
||||
(!strcasecmp(argv[1], "start") && argc < 3) ||
|
||||
zstr(argv[0])) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
if (!strcasecmp(argv[1], "stop")) {
|
||||
char *bugname = argc > 2 ? argv[2] : MY_BUG_NAME;
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "stop transcribing\n");
|
||||
status = do_stop(lsession, bugname);
|
||||
} else if (!strcasecmp(argv[1], "start")) {
|
||||
char* lang = argv[2];
|
||||
int interim = argc > 3 && !strcmp(argv[3], "interim");
|
||||
char *bugname = argc > 5 ? argv[5] : MY_BUG_NAME;
|
||||
if (argc > 4 && !strcmp(argv[4], "stereo")) {
|
||||
flags |= SMBF_WRITE_STREAM ;
|
||||
flags |= SMBF_STEREO;
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "start transcribing %s %s %s\n", lang, interim ? "interim": "complete", bugname);
|
||||
status = start_capture(lsession, flags, lang, interim, bugname);
|
||||
}
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_aws_transcribe_load)
|
||||
{
|
||||
switch_api_interface_t *api_interface;
|
||||
|
||||
/* create/register custom event message type */
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_RESULTS) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_RESULTS);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
|
||||
/* connect my internal structure to the blank pointer passed to me */
|
||||
*module_interface = switch_loadable_module_create_module_interface(pool, modname);
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "AWS Speech Transcription API loading..\n");
|
||||
|
||||
if (SWITCH_STATUS_FALSE == aws_transcribe_init()) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Failed initializing aws speech interface\n");
|
||||
}
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "AWS Speech Transcription API successfully loaded\n");
|
||||
|
||||
SWITCH_ADD_API(api_interface, "uuid_aws_transcribe", "AWS Speech Transcription API", aws_transcribe_function, TRANSCRIBE_API_SYNTAX);
|
||||
switch_console_set_complete("add uuid_aws_transcribe start lang-code [interim|final] [stereo|mono]");
|
||||
switch_console_set_complete("add uuid_aws_transcribe stop ");
|
||||
|
||||
/* indicate that the module should continue to be loaded */
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
  Called when the system shuts down
  Macro expands to: switch_status_t mod_aws_transcribe_shutdown() */
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_aws_transcribe_shutdown)
{
	/* tear down the AWS glue layer, then release the event subclass
	 * reserved at load time */
	aws_transcribe_cleanup();
	switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS);
	return SWITCH_STATUS_SUCCESS;
}
|
||||
45
mod_aws_transcribe/mod_aws_transcribe.h
Normal file
45
mod_aws_transcribe/mod_aws_transcribe.h
Normal file
@@ -0,0 +1,45 @@
|
||||
/* mod_aws_transcribe.h -- constants, event subclass names, and per-channel
 * state shared between mod_aws_transcribe.c and the AWS glue layer. */
#ifndef __MOD_AWS_TRANSCRIBE_H__
#define __MOD_AWS_TRANSCRIBE_H__

#include <switch.h>
#include <speex/speex_resampler.h>

#include <unistd.h>

#define MY_BUG_NAME "aws_transcribe"
#define MAX_BUG_LEN (64)
#define MAX_SESSION_ID (256)
/* custom event subclasses fired toward the application */
#define TRANSCRIBE_EVENT_RESULTS "aws_transcribe::transcription"
#define TRANSCRIBE_EVENT_END_OF_TRANSCRIPT "aws_transcribe::end_of_transcript"
#define TRANSCRIBE_EVENT_NO_AUDIO_DETECTED "aws_transcribe::no_audio_detected"
#define TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED "aws_transcribe::max_duration_exceeded"
#define TRANSCRIBE_EVENT_VAD_DETECTED "aws_transcribe::vad_detected"
/* NOTE(review): unlike the others this subclass uses the "jambonz_transcribe"
 * prefix -- presumably shared across vendor modules; confirm before renaming */
#define TRANSCRIBE_EVENT_ERROR "jambonz_transcribe::error"

#define MAX_LANG (12)
#define MAX_REGION (32)

/* per-channel data */
/* callback invoked by the glue layer with either a JSON payload or a
 * well-known keyword (e.g. "vad_detected") for the given session/bug */
typedef void (*responseHandler_t)(switch_core_session_t* session, const char * json, const char* bugname);

struct cap_cb {
	switch_mutex_t *mutex;            /* serializes access from the media-bug callback */
	char bugname[MAX_BUG_LEN+1];
	char sessionId[MAX_SESSION_ID+1]; /* uuid of the owning session */
	char awsAccessKeyId[128];
	char awsSecretAccessKey[128];
	uint32_t channels;                /* 1 = mono, 2 = stereo capture */
	SpeexResamplerState *resampler;   /* non-NULL when input must be resampled */
	void* streamer;                   /* opaque pointer to the C++ streaming object */
	responseHandler_t responseHandler;
	switch_thread_t* thread;
	int interim;                      /* non-zero: deliver interim results too */

	char lang[MAX_LANG];
	char region[MAX_REGION];

	switch_vad_t * vad;               /* optional VAD: connect only once speech is detected */
	uint32_t samples_per_second;
};

#endif
|
||||
51
mod_aws_transcribe/simple_buffer.h
Normal file
51
mod_aws_transcribe/simple_buffer.h
Normal file
@@ -0,0 +1,51 @@
|
||||
/**
 * (very) simple and limited circular buffer, supporting only the use case
 * of doing all of the adds and then subsequently retrieving the chunks.
 *
 * Fixes over the original implementation:
 *  - the original used the write pointer as the read pointer, so
 *    getNextChunk() returned chunks starting at the NEXT WRITE position:
 *    wrong (possibly uninitialized) data whenever the buffer had not
 *    wrapped.  Separate read/write indices are kept now, and the oldest
 *    chunk is returned first.
 *  - `if (numItems--)` underflowed the unsigned counter to UINT32_MAX when
 *    the buffer was empty, corrupting getNumItems().
 *  - the wrap test `offset >= m_numChunks - 1` cycled one slot early,
 *    never using the last chunk slot.
 */
class SimpleBuffer {
public:
  SimpleBuffer(uint32_t chunkSize, uint32_t numChunks) :
    m_chunkSize(chunkSize), m_numChunks(numChunks),
    m_readIdx(0), m_writeIdx(0), m_count(0) {
    m_pData = new char[chunkSize * numChunks];
  }
  /* owns raw memory: forbid copies to prevent double delete */
  SimpleBuffer(const SimpleBuffer&) = delete;
  SimpleBuffer& operator=(const SimpleBuffer&) = delete;
  ~SimpleBuffer() {
    delete [] m_pData;
  }

  /* append whole chunks; datalen must be a multiple of the chunk size
   * (otherwise the call is ignored, as in the original).  When full, the
   * oldest chunk is overwritten. */
  void add(void *data, uint32_t datalen) {
    if (datalen % m_chunkSize != 0) return;
    uint32_t numChunks = datalen / m_chunkSize;
    const char *src = static_cast<const char*>(data);
    for (uint32_t i = 0; i < numChunks; i++) {
      memcpy(m_pData + m_writeIdx * m_chunkSize, src, m_chunkSize);
      src += m_chunkSize;
      m_writeIdx = (m_writeIdx + 1) % m_numChunks;
      if (m_count < m_numChunks) m_count++;
      else m_readIdx = (m_readIdx + 1) % m_numChunks;  /* overwrote the oldest */
    }
  }

  /* return the oldest unread chunk, or nullptr when empty.  The returned
   * pointer is valid until the slot is overwritten by a subsequent add(). */
  char* getNextChunk() {
    if (m_count == 0) return nullptr;
    char *p = m_pData + m_readIdx * m_chunkSize;
    m_readIdx = (m_readIdx + 1) % m_numChunks;
    m_count--;
    return p;
  }

  uint32_t getNumItems() { return m_count; }

private:
  char *m_pData;
  uint32_t m_chunkSize;
  uint32_t m_numChunks;
  uint32_t m_readIdx;   /* next chunk to hand out */
  uint32_t m_writeIdx;  /* next slot to fill */
  uint32_t m_count;     /* chunks currently stored */
};
|
||||
BIN
mod_azure_transcribe/.DS_Store
vendored
Normal file
BIN
mod_azure_transcribe/.DS_Store
vendored
Normal file
Binary file not shown.
8
mod_azure_transcribe/LICENSE
Normal file
8
mod_azure_transcribe/LICENSE
Normal file
@@ -0,0 +1,8 @@
|
||||
Copyright 2023, Drachtio Communications Services, LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
10
mod_azure_transcribe/Makefile.am
Normal file
10
mod_azure_transcribe/Makefile.am
Normal file
@@ -0,0 +1,10 @@
|
||||
include $(top_srcdir)/build/modmake.rulesam
|
||||
MODNAME=mod_azure_transcribe
|
||||
|
||||
mod_LTLIBRARIES = mod_azure_transcribe.la
|
||||
mod_azure_transcribe_la_SOURCES = mod_azure_transcribe.c azure_transcribe_glue.cpp
|
||||
mod_azure_transcribe_la_CFLAGS = $(AM_CFLAGS)
|
||||
mod_azure_transcribe_la_CXXFLAGS = $(AM_CXXFLAGS) -std=c++14 -I/usr/local/include/MicrosoftSpeechSDK/cxx_api -I/usr/local/include/MicrosoftSpeechSDK/c_api
|
||||
|
||||
mod_azure_transcribe_la_LIBADD = $(switch_builddir)/libfreeswitch.la
|
||||
mod_azure_transcribe_la_LDFLAGS = -avoid-version -module -no-undefined -L/usr/local/lib/MicrosoftSpeechSDK/x64 -lMicrosoft.CognitiveServices.Speech.core -l:libasound.so.2 -lpthread -lcrypto -lssl -lz
|
||||
61
mod_azure_transcribe/README.md
Normal file
61
mod_azure_transcribe/README.md
Normal file
@@ -0,0 +1,61 @@
|
||||
# mod_azure_transcribe
|
||||
|
||||
A Freeswitch module that generates real-time transcriptions on a Freeswitch channel by using the Microsoft streaming transcription API
|
||||
|
||||
## API
|
||||
|
||||
### Commands
|
||||
The freeswitch module exposes the following API commands:
|
||||
|
||||
```
|
||||
azure_transcribe <uuid> start <lang-code> [interim]
|
||||
```
|
||||
Attaches media bug to channel and performs streaming recognize request.
|
||||
- `uuid` - unique identifier of Freeswitch channel
|
||||
- `lang-code` - a valid Azure speech-to-text [language code](https://learn.microsoft.com/azure/ai-services/speech-service/language-support) that is supported for streaming transcription
|
||||
- `interim` - If the 'interim' keyword is present then both interim and final transcription results will be returned; otherwise only final transcriptions will be returned
|
||||
|
||||
```
|
||||
azure_transcribe <uuid> stop
|
||||
```
|
||||
Stop transcription on the channel.
|
||||
|
||||
### Authentication
|
||||
The plugin will first look for channel variables, then environment variables, to obtain the Azure subscription key and region.
|
||||
|
||||
The names of the channel variables and environment variables are:
|
||||
|
||||
| variable | Description |
|
||||
| --- | ----------- |
|
||||
| AZURE_SUBSCRIPTION_KEY | The Azure subscription key |
|
||||
| AZURE_REGION | The Azure region |
|
||||
|
||||
### Channel variables
|
||||
The following channel variables can be set to configure the Azure speech to text service
|
||||
|
||||
| variable | Description | Default |
|
||||
| --- | ----------- | ---|
|
||||
| AZURE_PROFANITY_OPTION | "masked", "removed", "raw" | raw|
|
||||
| AZURE_REQUEST_SNR | if set to 1 or true, enables signal to noise ratio reporting | off |
|
||||
| AZURE_INITIAL_SPEECH_TIMEOUT_MS | initial time to wait for speech before returning no match | none |
|
||||
| AZURE_SPEECH_HINTS | comma-separated list of phrases or words to expect | none |
|
||||
| AZURE_USE_OUTPUT_FORMAT_DETAILED | if set to true or 1, provide n-best and confidence levels | off |
|
||||
|
||||
|
||||
### Events
|
||||
`azure_transcribe::transcription` - returns an interim or final transcription. The event contains a JSON body describing the transcription result; if the body contains a property with "RecognitionStatus": "Success" it is a final transcript, otherwise it is an interim transcript.
|
||||
```json
|
||||
{
|
||||
"Id": "1708f0bffc2d4d66b8347280447e9dde",
|
||||
"RecognitionStatus": "Success",
|
||||
"DisplayText": "This is a test.",
|
||||
"Offset": 14400000,
|
||||
"Duration": 12200000
|
||||
}
|
||||
```
|
||||
|
||||
## Usage
|
||||
When using [drachtio-fsrmf](https://www.npmjs.com/package/drachtio-fsmrf), you can access this API command via the api method on the 'endpoint' object.
|
||||
```js
|
||||
ep.api('azure_transcribe', `${ep.uuid} start en-US interim`);
|
||||
```
|
||||
559
mod_azure_transcribe/azure_transcribe_glue.cpp
Normal file
559
mod_azure_transcribe/azure_transcribe_glue.cpp
Normal file
@@ -0,0 +1,559 @@
|
||||
#include <cstdlib>
|
||||
|
||||
#include <switch.h>
|
||||
#include <switch_json.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <condition_variable>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <deque>
|
||||
#include <memory>
|
||||
|
||||
#include <speechapi_cxx.h>
|
||||
|
||||
#include "mod_azure_transcribe.h"
|
||||
#include "simple_buffer.h"
|
||||
|
||||
#define CHUNKSIZE (320)
|
||||
#define DEFAULT_SPEECH_TIMEOUT "180000"
|
||||
|
||||
using namespace Microsoft::CognitiveServices::Speech;
|
||||
using namespace Microsoft::CognitiveServices::Speech::Audio;
|
||||
|
||||
const char ALLOC_TAG[] = "drachtio";
|
||||
|
||||
static bool hasDefaultCredentials = false;
|
||||
static bool sdkInitialized = false;
|
||||
static const char* sdkLog = std::getenv("AZURE_SDK_LOGFILE");
|
||||
static const char* proxyIP = std::getenv("JAMBONES_HTTP_PROXY_IP");
|
||||
static const char* proxyPort = std::getenv("JAMBONES_HTTP_PROXY_PORT");
|
||||
static const char* proxyUsername = std::getenv("JAMBONES_HTTP_PROXY_USERNAME");
|
||||
static const char* proxyPassword = std::getenv("JAMBONES_HTTP_PROXY_PASSWORD");
|
||||
|
||||
class GStreamer {
|
||||
public:
|
||||
GStreamer(
|
||||
const char *sessionId,
|
||||
const char *bugname,
|
||||
u_int16_t channels,
|
||||
char *lang,
|
||||
int interim,
|
||||
uint32_t samples_per_second,
|
||||
const char* region,
|
||||
const char* subscriptionKey,
|
||||
responseHandler_t responseHandler
|
||||
) : m_sessionId(sessionId), m_bugname(bugname), m_finished(false), m_stopped(false), m_interim(interim),
|
||||
m_connected(false), m_connecting(false), m_audioBuffer(320 * (samples_per_second == 8000 ? 1 : 2), 15),
|
||||
m_responseHandler(responseHandler) {
|
||||
|
||||
switch_core_session_t* psession = switch_core_session_locate(sessionId);
|
||||
if (!psession) throw std::invalid_argument( "session id no longer active" );
|
||||
switch_channel_t *channel = switch_core_session_get_channel(psession);
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::GStreamer(%p) region %s, language %s\n",
|
||||
this, region, lang);
|
||||
|
||||
|
||||
const char* endpoint = switch_channel_get_variable(channel, "AZURE_SERVICE_ENDPOINT");
|
||||
const char* endpointId = switch_channel_get_variable(channel, "AZURE_SERVICE_ENDPOINT_ID");
|
||||
|
||||
auto sourceLanguageConfig = SourceLanguageConfig::FromLanguage(lang);
|
||||
auto format = AudioStreamFormat::GetWaveFormatPCM(8000, 16, channels);
|
||||
auto options = AudioProcessingOptions::Create(AUDIO_INPUT_PROCESSING_ENABLE_DEFAULT);
|
||||
auto speechConfig = nullptr != endpoint ?
|
||||
(nullptr != subscriptionKey ?
|
||||
SpeechConfig::FromEndpoint(endpoint, subscriptionKey) :
|
||||
SpeechConfig::FromEndpoint(endpoint)) :
|
||||
SpeechConfig::FromSubscription(subscriptionKey, region);
|
||||
if (switch_true(switch_channel_get_variable(channel, "AZURE_USE_OUTPUT_FORMAT_DETAILED"))) {
|
||||
speechConfig->SetOutputFormat(OutputFormat::Detailed);
|
||||
}
|
||||
if (nullptr != endpointId) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "setting endpoint id: %s\n", endpointId);
|
||||
speechConfig->SetEndpointId(endpointId);
|
||||
}
|
||||
if (!sdkInitialized && sdkLog) {
|
||||
sdkInitialized = true;
|
||||
speechConfig->SetProperty(PropertyId::Speech_LogFilename, sdkLog);
|
||||
}
|
||||
if (switch_true(switch_channel_get_variable(channel, "AZURE_AUDIO_LOGGING"))) {
|
||||
speechConfig->EnableAudioLogging();
|
||||
}
|
||||
|
||||
if (nullptr != proxyIP && nullptr != proxyPort) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "setting proxy: %s:%s\n", proxyIP, proxyPort);
|
||||
speechConfig->SetProxy(proxyIP, atoi(proxyPort), proxyUsername, proxyPassword);
|
||||
}
|
||||
|
||||
m_pushStream = AudioInputStream::CreatePushStream(format);
|
||||
auto audioConfig = AudioConfig::FromStreamInput(m_pushStream);
|
||||
|
||||
// alternative language
|
||||
const char* var;
|
||||
if (var = switch_channel_get_variable(channel, "AZURE_SPEECH_ALTERNATIVE_LANGUAGE_CODES")) {
|
||||
std::vector<std::string> languages;
|
||||
char *alt_langs[3] = { 0 };
|
||||
int argc = switch_separate_string((char *) var, ',', alt_langs, 3);
|
||||
|
||||
languages.push_back(lang); // primary language
|
||||
for (int i = 0; i < argc; i++) {
|
||||
languages.push_back( alt_langs[i]);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "added alternative lang %s\n", alt_langs[i]);
|
||||
}
|
||||
auto autoDetectSourceLanguageConfig = AutoDetectSourceLanguageConfig::FromLanguages(languages);
|
||||
m_recognizer = SpeechRecognizer::FromConfig(speechConfig, autoDetectSourceLanguageConfig, audioConfig);
|
||||
}
|
||||
else {
|
||||
auto sourceLanguageConfig = SourceLanguageConfig::FromLanguage(lang);
|
||||
m_recognizer = SpeechRecognizer::FromConfig(speechConfig, sourceLanguageConfig, audioConfig);
|
||||
}
|
||||
|
||||
|
||||
// set properties
|
||||
auto &properties = m_recognizer->Properties;
|
||||
|
||||
// profanity options: Allowed values are "masked", "removed", and "raw".
|
||||
const char* profanity = switch_channel_get_variable(channel, "AZURE_PROFANITY_OPTION");
|
||||
if (profanity) {
|
||||
properties.SetProperty(PropertyId::SpeechServiceResponse_ProfanityOption, profanity);
|
||||
}
|
||||
// report signal-to-noise ratio
|
||||
if (switch_true(switch_channel_get_variable(channel, "AZURE_REQUEST_SNR"))) {
|
||||
properties.SetProperty(PropertyId::SpeechServiceResponse_RequestSnr, TrueString);
|
||||
}
|
||||
// initial speech timeout in milliseconds
|
||||
const char* timeout = switch_channel_get_variable(channel, "AZURE_INITIAL_SPEECH_TIMEOUT_MS");
|
||||
if (timeout) properties.SetProperty(PropertyId::SpeechServiceConnection_InitialSilenceTimeoutMs, timeout);
|
||||
else properties.SetProperty(PropertyId::SpeechServiceConnection_InitialSilenceTimeoutMs, DEFAULT_SPEECH_TIMEOUT);
|
||||
|
||||
const char* segmentationInterval = switch_channel_get_variable(channel, "AZURE_SPEECH_SEGMENTATION_SILENCE_TIMEOUT_MS");
|
||||
if (segmentationInterval) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "setting segmentation interval to %s ms\n", segmentationInterval);
|
||||
properties.SetProperty(PropertyId::Speech_SegmentationSilenceTimeoutMs, segmentationInterval);
|
||||
}
|
||||
|
||||
// recognition mode - readonly according to Azure docs:
|
||||
// https://docs.microsoft.com/en-us/javascript/api/microsoft-cognitiveservices-speech-sdk/propertyid?view=azure-node-latest
|
||||
/*
|
||||
const char* recoMode = switch_channel_get_variable(channel, "AZURE_RECOGNITION_MODE");
|
||||
if (recoMode) {
|
||||
properties.SetProperty(PropertyId::SpeechServiceConnection_RecoMode, recoMode);
|
||||
}
|
||||
*/
|
||||
|
||||
// hints
|
||||
const char* hints = switch_channel_get_variable(channel, "AZURE_SPEECH_HINTS");
|
||||
if (hints) {
|
||||
auto grammar = PhraseListGrammar::FromRecognizer(m_recognizer);
|
||||
char *phrases[500] = { 0 };
|
||||
int argc = switch_separate_string((char *)hints, ',', phrases, 500);
|
||||
for (int i = 0; i < argc; i++) {
|
||||
grammar->AddPhrase(phrases[i]);
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "added %d hints\n", argc);
|
||||
}
|
||||
|
||||
auto onSessionStopped = [this](const SessionEventArgs& args) {
|
||||
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
|
||||
m_stopped = true;
|
||||
if (psession) {
|
||||
auto sessionId = args.SessionId;
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer: got session stopped from microsoft\n");
|
||||
switch_core_session_rwunlock(psession);
|
||||
}
|
||||
};
|
||||
auto onSpeechStartDetected = [this, responseHandler](const RecognitionEventArgs& args) {
|
||||
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
|
||||
if (psession) {
|
||||
auto sessionId = args.SessionId;
|
||||
responseHandler(psession, TRANSCRIBE_EVENT_START_OF_UTTERANCE, NULL, m_bugname.c_str(), m_finished);
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer start of speech\n");
|
||||
switch_core_session_rwunlock(psession);
|
||||
}
|
||||
};
|
||||
auto onSpeechEndDetected = [this, responseHandler](const RecognitionEventArgs& args) {
|
||||
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
|
||||
if (psession) {
|
||||
auto sessionId = args.SessionId;
|
||||
responseHandler(psession, TRANSCRIBE_EVENT_END_OF_UTTERANCE, NULL, m_bugname.c_str(), m_finished);
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer end of speech\n");
|
||||
switch_core_session_rwunlock(psession);
|
||||
}
|
||||
};
|
||||
auto onRecognitionEvent = [this, responseHandler](const SpeechRecognitionEventArgs& args) {
|
||||
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
|
||||
if (psession) {
|
||||
auto result = args.Result;
|
||||
auto reason = result->Reason;
|
||||
const auto& properties = result->Properties;
|
||||
auto json = properties.GetProperty(PropertyId::SpeechServiceResponse_JsonResult);
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer onRecognitionEvent reason %d results: %s,\n", reason, json.c_str());
|
||||
|
||||
switch (reason) {
|
||||
case ResultReason::RecognizingSpeech:
|
||||
case ResultReason::RecognizedSpeech:
|
||||
// note: interim results don't have "RecognitionStatus": "Success"
|
||||
responseHandler(psession, TRANSCRIBE_EVENT_RESULTS, json.c_str(), m_bugname.c_str(), m_finished);
|
||||
break;
|
||||
case ResultReason::NoMatch:
|
||||
responseHandler(psession, TRANSCRIBE_EVENT_NO_SPEECH_DETECTED, json.c_str(), m_bugname.c_str(), m_finished);
|
||||
break;
|
||||
|
||||
default:
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "GStreamer unexpected result '%s': reason %d\n",
|
||||
json.c_str(), reason);
|
||||
responseHandler(psession, TRANSCRIBE_EVENT_ERROR, json.c_str(), m_bugname.c_str(), m_finished);
|
||||
|
||||
break;
|
||||
}
|
||||
switch_core_session_rwunlock(psession);
|
||||
}
|
||||
};
|
||||
|
||||
auto onCanceled = [this, responseHandler](const SpeechRecognitionCanceledEventArgs& args) {
|
||||
if (m_finished) return;
|
||||
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
|
||||
if (psession) {
|
||||
auto result = args.Result;
|
||||
auto details = args.ErrorDetails;
|
||||
auto code = args.ErrorCode;
|
||||
cJSON* json = cJSON_CreateObject();
|
||||
cJSON_AddStringToObject(json, "type", "error");
|
||||
cJSON_AddStringToObject(json, "error", details.c_str());
|
||||
char* jsonString = cJSON_PrintUnformatted(json);
|
||||
responseHandler(psession, TRANSCRIBE_EVENT_ERROR, jsonString, m_bugname.c_str(), m_finished);
|
||||
free(jsonString);
|
||||
cJSON_Delete(json);
|
||||
switch_core_session_rwunlock(psession);
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer recognition canceled, error %d: %s\n", code, details.c_str());
|
||||
}
|
||||
};
|
||||
|
||||
m_recognizer->SessionStopped += onSessionStopped;
|
||||
m_recognizer->SpeechStartDetected += onSpeechStartDetected;
|
||||
m_recognizer->SpeechEndDetected += onSpeechEndDetected;
|
||||
if (interim) m_recognizer->Recognizing += onRecognitionEvent;
|
||||
m_recognizer->Recognized += onRecognitionEvent;
|
||||
m_recognizer->Canceled += onCanceled;
|
||||
|
||||
switch_core_session_rwunlock(psession);
|
||||
}
|
||||
|
||||
~GStreamer() {
|
||||
//switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::~GStreamer %p\n", this);
|
||||
}
|
||||
|
||||
// Start a continuous-recognition session with azure speech.
// Guarded by m_connecting so repeat calls are no-ops; may be called at
// session init or later, when VAD first detects speech.
void connect() {
if (m_connecting) return;
m_connecting = true;

switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer:connect %p connecting to azure speech..\n", this);

// Fires once azure reports the session is up; flushes audio buffered while
// we were waiting to connect.
auto onSessionStarted = [this](const SessionEventArgs& args) {
m_connected = true;
// hold a read lock on the freeswitch session while flushing buffered audio
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
if (psession) {
auto sessionId = args.SessionId;
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer got session started from microsoft\n");

// send any buffered audio
int nFrames = m_audioBuffer.getNumItems();
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p got session started from azure, %d buffered frames\n", this, nFrames);
if (nFrames) {
char *p;
do {
p = m_audioBuffer.getNextChunk();
if (p) {
// m_connected is already true, so write() pushes straight to azure
write(p, CHUNKSIZE);
}
} while (p);
}
switch_core_session_rwunlock(psession);
}
};
m_recognizer->SessionStarted += onSessionStarted;
m_recognizer->StartContinuousRecognitionAsync();

}
|
||||
|
||||
bool write(void* data, uint32_t datalen) {
|
||||
if (m_finished) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::write not writing because we are finished, %p\n", this);
|
||||
return false;
|
||||
}
|
||||
if (!m_connected) {
|
||||
if (datalen % CHUNKSIZE == 0) {
|
||||
m_audioBuffer.add(data, datalen);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
m_pushStream->Write(static_cast<uint8_t*>(data), datalen);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Stop the recognition session.  Blocks (via .get()) until azure acknowledges
// the stop, so callers should run this off the media thread — see reaper(),
// which invokes it on a detached thread.
void finish() {
if (m_finished) return;
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::finish - calling StopContinuousRecognitionAsync (%p)\n", this);
m_finished = true;
m_recognizer->StopContinuousRecognitionAsync().get();
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "GStreamer::finish - recognition has completed (%p)\n", this);
}
|
||||
|
||||
// True once the streamer has been stopped.
bool isStopped() {
return m_stopped;
}
|
||||
|
||||
// True once connect() has been initiated (used to avoid re-running VAD gating).
bool isConnecting() {
return m_connecting;
}
|
||||
|
||||
private:
|
||||
std::string m_sessionId;
|
||||
std::string m_bugname;
|
||||
std::string m_region;
|
||||
std::shared_ptr<SpeechRecognizer> m_recognizer;
|
||||
std::shared_ptr<PushAudioInputStream> m_pushStream;
|
||||
|
||||
responseHandler_t m_responseHandler;
|
||||
bool m_interim;
|
||||
bool m_finished;
|
||||
bool m_connected;
|
||||
bool m_connecting;
|
||||
bool m_stopped;
|
||||
SimpleBuffer m_audioBuffer;
|
||||
};
|
||||
|
||||
static void reaper(struct cap_cb *cb) {
|
||||
std::shared_ptr<GStreamer> pStreamer;
|
||||
pStreamer.reset((GStreamer *)cb->streamer);
|
||||
cb->streamer = nullptr;
|
||||
|
||||
std::thread t([pStreamer]{
|
||||
pStreamer->finish();
|
||||
});
|
||||
t.detach();
|
||||
}
|
||||
|
||||
static void killcb(struct cap_cb* cb) {
|
||||
if (cb) {
|
||||
if (cb->streamer) {
|
||||
GStreamer* p = (GStreamer *) cb->streamer;
|
||||
delete p;
|
||||
cb->streamer = NULL;
|
||||
}
|
||||
if (cb->resampler) {
|
||||
speex_resampler_destroy(cb->resampler);
|
||||
cb->resampler = NULL;
|
||||
}
|
||||
if (cb->vad) {
|
||||
switch_vad_destroy(&cb->vad);
|
||||
cb->vad = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
/**
 * One-time module initialization: detect default (environment-supplied)
 * azure credentials.  hasDefaultCredentials is set when the subscription
 * key is present, matching the original behavior; a missing region is now
 * also diagnosed (the azure speech SDK needs both — see session_init,
 * which requires both env vars before using them).
 */
switch_status_t azure_transcribe_init() {
  const char* subscriptionKey = std::getenv("AZURE_SUBSCRIPTION_KEY");
  const char* region = std::getenv("AZURE_REGION");
  if (NULL == subscriptionKey) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE,
      "\"AZURE_SUBSCRIPTION_KEY\" env var not set; authentication will expect channel variables of same names to be set\n");
  }
  else {
    if (NULL == region) {
      /* previously 'region' was read but never checked; warn so misconfiguration is visible */
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE,
        "\"AZURE_REGION\" env var not set; authentication will expect channel variables of same names to be set\n");
    }
    hasDefaultCredentials = true;
  }
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// One-time module teardown; nothing global to release.
switch_status_t azure_transcribe_cleanup() {
return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// start transcribe on a channel
|
||||
switch_status_t azure_transcribe_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
|
||||
uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char* bugname, void **ppUserData
|
||||
) {
|
||||
GStreamer *streamer = NULL;
|
||||
switch_status_t status = SWITCH_STATUS_SUCCESS;
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
int err;
|
||||
switch_threadattr_t *thd_attr = NULL;
|
||||
switch_memory_pool_t *pool = switch_core_session_get_pool(session);
|
||||
auto read_codec = switch_core_session_get_read_codec(session);
|
||||
uint32_t sampleRate = read_codec->implementation->actual_samples_per_second;
|
||||
const char* sessionId = switch_core_session_get_uuid(session);
|
||||
struct cap_cb* cb = (struct cap_cb *) switch_core_session_alloc(session, sizeof(*cb));
|
||||
memset(cb, sizeof(cb), 0);
|
||||
const char* subscriptionKey = switch_channel_get_variable(channel, "AZURE_SUBSCRIPTION_KEY");
|
||||
const char* region = switch_channel_get_variable(channel, "AZURE_REGION");
|
||||
cb->channels = channels;
|
||||
strncpy(cb->sessionId, sessionId, MAX_SESSION_ID);
|
||||
strncpy(cb->bugname, bugname, MAX_BUG_LEN);
|
||||
|
||||
if (subscriptionKey && region) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Using channel vars for azure authentication\n");
|
||||
strncpy(cb->subscriptionKey, subscriptionKey, MAX_SUBSCRIPTION_KEY_LEN);
|
||||
strncpy(cb->region, region, MAX_REGION);
|
||||
}
|
||||
else if (std::getenv("AZURE_SUBSCRIPTION_KEY") && std::getenv("AZURE_REGION")) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Using env vars for azure authentication\n");
|
||||
strncpy(cb->subscriptionKey, std::getenv("AZURE_SUBSCRIPTION_KEY"), MAX_SUBSCRIPTION_KEY_LEN);
|
||||
strncpy(cb->region, std::getenv("AZURE_REGION"), MAX_REGION);
|
||||
}
|
||||
else {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "No channel vars or env vars for azure authentication..will use default profile if found\n");
|
||||
}
|
||||
|
||||
cb->responseHandler = responseHandler;
|
||||
|
||||
if (switch_mutex_init(&cb->mutex, SWITCH_MUTEX_NESTED, pool) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing mutex\n");
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
|
||||
cb->interim = interim;
|
||||
strncpy(cb->lang, lang, MAX_LANG);
|
||||
|
||||
/* determine if we need to resample the audio to 16-bit 8khz */
|
||||
if (sampleRate != 8000) {
|
||||
cb->resampler = speex_resampler_init(1, sampleRate, 8000, SWITCH_RESAMPLE_QUALITY, &err);
|
||||
if (0 != err) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing resampler: %s.\n",
|
||||
switch_channel_get_name(channel), speex_resampler_strerror(err));
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
// allocate vad if we are delaying connecting to the recognizer until we detect speech
|
||||
if (switch_channel_var_true(channel, "START_RECOGNIZING_ON_VAD")) {
|
||||
cb->vad = switch_vad_init(sampleRate, 1);
|
||||
if (cb->vad) {
|
||||
const char* var;
|
||||
int mode = 2;
|
||||
int silence_ms = 150;
|
||||
int voice_ms = 250;
|
||||
int debug = 0;
|
||||
|
||||
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_MODE")) {
|
||||
mode = atoi(var);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_SILENCE_MS")) {
|
||||
silence_ms = atoi(var);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_VOICE_MS")) {
|
||||
voice_ms = atoi(var);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_DEBUG")) {
|
||||
debug = atoi(var);
|
||||
}
|
||||
switch_vad_set_mode(cb->vad, mode);
|
||||
switch_vad_set_param(cb->vad, "silence_ms", silence_ms);
|
||||
switch_vad_set_param(cb->vad, "voice_ms", voice_ms);
|
||||
switch_vad_set_param(cb->vad, "debug", debug);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: delaying connection until vad, voice_ms %d, mode %d\n",
|
||||
switch_channel_get_name(channel), voice_ms, mode);
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: initializing gstreamer with %s\n",
|
||||
switch_channel_get_name(channel), bugname);
|
||||
streamer = new GStreamer(sessionId, bugname, channels, lang, interim, sampleRate, cb->region, subscriptionKey, responseHandler);
|
||||
cb->streamer = streamer;
|
||||
if (!cb->vad) streamer->connect();
|
||||
} catch (std::exception& e) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing gstreamer: %s.\n",
|
||||
switch_channel_get_name(channel), e.what());
|
||||
return SWITCH_STATUS_FALSE;
|
||||
}
|
||||
|
||||
|
||||
*ppUserData = cb;
|
||||
|
||||
done:
|
||||
return status;
|
||||
}
|
||||
|
||||
// Stop transcription on a channel: detach the media bug, hand the streamer to
// the reaper (which stops it asynchronously), and free the remaining cap_cb
// resources.  channelIsClosing skips bug removal (the core is tearing it down).
switch_status_t azure_transcribe_session_stop(switch_core_session_t *session, int channelIsClosing, char* bugname) {
switch_channel_t *channel = switch_core_session_get_channel(session);
switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);

if (bug) {
struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);
switch_status_t st;

// close connection and get final responses
// the mutex excludes azure_transcribe_frame while we tear down
switch_mutex_lock(cb->mutex);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "azure_transcribe_session_stop: locked session\n");

switch_channel_set_private(channel, bugname, NULL);
if (!channelIsClosing) switch_core_media_bug_remove(session, &bug);

// reaper() takes ownership of cb->streamer (sets it NULL), so killcb below
// only frees the resampler and vad
GStreamer* streamer = (GStreamer *) cb->streamer;
if (streamer) reaper(cb);
killcb(cb);
switch_mutex_unlock(cb->mutex);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "azure_transcribe_session_stop: unlocked session\n");

return SWITCH_STATUS_SUCCESS;
}

switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
return SWITCH_STATUS_FALSE;
}
|
||||
|
||||
// Media-bug read callback: drain available frames from the bug, run optional
// VAD gating (connecting to azure on first detected speech), resample to 8khz
// when needed, and push the audio into the streamer.
switch_bool_t azure_transcribe_frame(switch_media_bug_t *bug, void* user_data) {
switch_core_session_t *session = switch_core_media_bug_get_session(bug);
uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
switch_frame_t frame = {};
struct cap_cb *cb = (struct cap_cb *) user_data;

frame.data = data;
frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;

// trylock: if session_stop holds the mutex we simply skip this batch of
// frames rather than block the media thread
if (switch_mutex_trylock(cb->mutex) == SWITCH_STATUS_SUCCESS) {
GStreamer* streamer = (GStreamer *) cb->streamer;
if (streamer) {
while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS && !switch_test_flag((&frame), SFF_CNG)) {
if (frame.datalen) {
// VAD gating: only until connect() has been kicked off
if (cb->vad && !streamer->isConnecting()) {
switch_vad_state_t state = switch_vad_process(cb->vad, (int16_t*) frame.data, frame.samples);
if (state == SWITCH_VAD_STATE_START_TALKING) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "detected speech, connect to azure speech now\n");
streamer->connect();
cb->responseHandler(session, TRANSCRIBE_EVENT_VAD_DETECTED, NULL, cb->bugname, 0);
}
}

if (cb->resampler) {
spx_int16_t out[SWITCH_RECOMMENDED_BUFFER_SIZE];
spx_uint32_t out_len = SWITCH_RECOMMENDED_BUFFER_SIZE;
spx_uint32_t in_len = frame.samples;
size_t written; /* NOTE(review): unused */

speex_resampler_process_interleaved_int(
cb->resampler,
(const spx_int16_t *) frame.data,
(spx_uint32_t *) &in_len,
&out[0],
&out_len);
// out_len is samples produced; convert to bytes for write()
streamer->write( &out[0], sizeof(spx_int16_t) * out_len);
}
else {
streamer->write( frame.data, frame.datalen);
}
}
}
}
switch_mutex_unlock(cb->mutex);
}
return SWITCH_TRUE;
}
|
||||
}
|
||||
11
mod_azure_transcribe/azure_transcribe_glue.h
Normal file
11
mod_azure_transcribe/azure_transcribe_glue.h
Normal file
@@ -0,0 +1,11 @@
|
||||
/* NOTE(review): identifiers starting with double underscore are reserved for
 * the implementation; consider renaming the guard to AZURE_GLUE_H__ */
#ifndef __AZURE_GLUE_H__
#define __AZURE_GLUE_H__

/* Interface of the azure transcribe glue layer (azure_transcribe_glue.cpp). */

/* one-time module init: detects default env-var credentials */
switch_status_t azure_transcribe_init();
/* one-time module teardown */
switch_status_t azure_transcribe_cleanup();
/* allocate per-channel state and (unless VAD-gated) connect to azure;
 * *ppUserData receives a struct cap_cb* */
switch_status_t azure_transcribe_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char* bugname, void **ppUserData);
/* stop recognition and free per-channel state; channelIsClosing skips media-bug removal */
switch_status_t azure_transcribe_session_stop(switch_core_session_t *session, int channelIsClosing, char* bugname);
/* media-bug read callback: feeds audio frames to the recognizer */
switch_bool_t azure_transcribe_frame(switch_media_bug_t *bug, void* user_data);

#endif
|
||||
226
mod_azure_transcribe/mod_azure_transcribe.c
Normal file
226
mod_azure_transcribe/mod_azure_transcribe.c
Normal file
@@ -0,0 +1,226 @@
|
||||
/*
|
||||
*
|
||||
* mod_azure_transcribe.c -- Freeswitch module for using azure streaming transcribe api
|
||||
*
|
||||
*/
|
||||
#include "mod_azure_transcribe.h"
|
||||
#include "azure_transcribe_glue.h"
|
||||
|
||||
/* Prototypes */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_azure_transcribe_shutdown);
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_azure_transcribe_load);
|
||||
|
||||
SWITCH_MODULE_DEFINITION(mod_azure_transcribe, mod_azure_transcribe_load, mod_azure_transcribe_shutdown, NULL);
|
||||
|
||||
static switch_status_t do_stop(switch_core_session_t *session, char* bugname);
|
||||
|
||||
/**
 * Fire a custom freeswitch event carrying a transcription result or error.
 * eventName selects the subclass; json (optional) becomes the event body;
 * bugname (optional) is attached as the media-bugname header.
 */
static void responseHandler(switch_core_session_t* session, const char* eventName, const char * json, const char* bugname, int finished) {
	switch_event_t *event;
	switch_channel_t *channel = switch_core_session_get_channel(session);
	/* bugfix: json may be NULL (e.g. VAD_DETECTED); don't pass NULL to %s */
	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "responseHandler event %s, body %s.\n", eventName, json ? json : "(none)");
	/* bugfix: event creation was unchecked; populating a failed event is unsafe */
	if (switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, eventName) != SWITCH_STATUS_SUCCESS) {
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "responseHandler: failed to create event %s\n", eventName);
		return;
	}
	switch_channel_event_set_data(channel, event);
	switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "microsoft");
	switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-session-finished", finished ? "true" : "false");
	if (finished) {
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "responseHandler returning event %s, from finished recognition session\n", eventName);
	}
	if (json) switch_event_add_body(event, "%s", json);
	if (bugname) switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "media-bugname", bugname);
	switch_event_fire(&event);
}
|
||||
|
||||
/* Media-bug lifecycle callback: READ frames are forwarded to the glue layer;
 * CLOSE stops the transcription session (channelIsClosing = 1 so the bug is
 * not removed while the core is tearing it down). */
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
{
switch_core_session_t *session = switch_core_media_bug_get_session(bug);

switch (type) {
case SWITCH_ABC_TYPE_INIT:
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_INIT.\n");
break;

case SWITCH_ABC_TYPE_CLOSE:
{
struct cap_cb* cb = (struct cap_cb*) switch_core_media_bug_get_user_data(bug);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_CLOSE.\n");

azure_transcribe_session_stop(session, 1, cb->bugname);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Finished SWITCH_ABC_TYPE_CLOSE.\n");
}
break;

case SWITCH_ABC_TYPE_READ:

return azure_transcribe_frame(bug, user_data);
break;

case SWITCH_ABC_TYPE_WRITE:
default:
break;
}

return SWITCH_TRUE;
}
|
||||
|
||||
/* Start transcription on a channel: tear down any previous bug of the same
 * name, initialize the azure session, and attach a media bug that streams
 * read (and optionally write, for stereo) audio to it. */
static switch_status_t start_capture(switch_core_session_t *session, switch_media_bug_flag_t flags,
char* lang, int interim, char* bugname)
{
switch_channel_t *channel = switch_core_session_get_channel(session);
switch_media_bug_t *bug;
switch_status_t status;
switch_codec_implementation_t read_impl = { 0 };
void *pUserData;
uint32_t samples_per_second;


if (switch_channel_get_private(channel, bugname)) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing bug from previous transcribe\n");
do_stop(session, bugname);
}

switch_core_session_get_read_impl(session, &read_impl);

if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
return SWITCH_STATUS_FALSE;
}

/* g722 reports 8k in samples_per_second but actually runs at 16k */
samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;

if (SWITCH_STATUS_FALSE == azure_transcribe_session_init(session, responseHandler,
samples_per_second, flags & SMBF_STEREO ? 2 : 1, lang, interim, bugname, &pUserData)) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing azure speech session.\n");
return SWITCH_STATUS_FALSE;
}
/* NOTE(review): if bug_add fails, the session state created above (including
 * the streamer) is not released here -- confirm and add cleanup */
if ((status = switch_core_media_bug_add(session, bugname, NULL, capture_callback, pUserData, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
return status;
}
switch_channel_set_private(channel, bugname, bug);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "added media bug for azure transcribe\n");

return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/* Stop a running transcription identified by bugname; a no-op (success) when
 * no bug of that name is attached. */
static switch_status_t do_stop(switch_core_session_t *session, char* bugname)
{
	switch_channel_t *channel = switch_core_session_get_channel(session);
	switch_media_bug_t *bug = switch_channel_get_private(channel, bugname);
	switch_status_t rc = SWITCH_STATUS_SUCCESS;

	if (!bug) {
		return rc;
	}

	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "do_stop: Received user command command to stop transcribe.\n");
	rc = azure_transcribe_session_stop(session, 0, bugname);
	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "do_stop: stopped transcribe.\n");

	return rc;
}
|
||||
|
||||
#define TRANSCRIBE_API_SYNTAX "<uuid> [start|stop] lang-code [interim] [stereo|mono] [bugname]"
|
||||
SWITCH_STANDARD_API(azure_transcribe_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[6] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
switch_media_bug_flag_t flags = SMBF_READ_STREAM /* | SMBF_WRITE_STREAM | SMBF_READ_PING */;
|
||||
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) ||
|
||||
(!strcasecmp(argv[1], "stop") && argc < 2) ||
|
||||
(!strcasecmp(argv[1], "start") && argc < 3) ||
|
||||
zstr(argv[0])) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
if (!strcasecmp(argv[1], "stop")) {
|
||||
char *bugname = argc > 2 ? argv[2] : MY_BUG_NAME;
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "stop transcribing %s\n", bugname);
|
||||
status = do_stop(lsession, bugname);
|
||||
} else if (!strcasecmp(argv[1], "start")) {
|
||||
char* lang = argv[2];
|
||||
int interim = argc > 3 && !strcmp(argv[3], "interim");
|
||||
char *bugname = argc > 5 ? argv[5] : MY_BUG_NAME;
|
||||
if (argc > 4 && !strcmp(argv[4], "stereo")) {
|
||||
flags |= SMBF_WRITE_STREAM ;
|
||||
flags |= SMBF_STEREO;
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "start transcribing %s %s %s\n", lang, interim ? "interim": "complete", bugname);
|
||||
status = start_capture(lsession, flags, lang, interim, bugname);
|
||||
}
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* Module load: reserve the custom event subclasses, create the module
 * interface, run the one-time glue init (non-fatal on failure -- channel
 * vars can still supply credentials), and register the API command. */
SWITCH_MODULE_LOAD_FUNCTION(mod_azure_transcribe_load)
{
switch_api_interface_t *api_interface;

/* create/register custom event message type */
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_RESULTS) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_RESULTS);
return SWITCH_STATUS_TERM;
}
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_START_OF_UTTERANCE) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_START_OF_UTTERANCE);
return SWITCH_STATUS_TERM;
}
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_END_OF_UTTERANCE) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_END_OF_UTTERANCE);
return SWITCH_STATUS_TERM;
}
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_NO_SPEECH_DETECTED) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_NO_SPEECH_DETECTED);
return SWITCH_STATUS_TERM;
}

/* connect my internal structure to the blank pointer passed to me */
*module_interface = switch_loadable_module_create_module_interface(pool, modname);

switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "azure Speech Transcription API loading..\n");

if (SWITCH_STATUS_FALSE == azure_transcribe_init()) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Failed initializing azure speech interface\n");
}

switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "azure Speech Transcription API successfully loaded\n");

SWITCH_ADD_API(api_interface, "uuid_azure_transcribe", "azure Speech Transcription API", azure_transcribe_function, TRANSCRIBE_API_SYNTAX);
switch_console_set_complete("add uuid_azure_transcribe start lang-code [interim|final] [stereo|mono] [bugname]");
switch_console_set_complete("add uuid_azure_transcribe stop ");

/* indicate that the module should continue to be loaded */
return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/*
  Called when the system shuts down.
  Macro expands to: switch_status_t mod_azure_transcribe_shutdown()
  Releases the reserved event subclasses and runs the glue cleanup. */
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_azure_transcribe_shutdown)
{
azure_transcribe_cleanup();
switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS);
switch_event_free_subclass(TRANSCRIBE_EVENT_START_OF_UTTERANCE);
switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_UTTERANCE);
switch_event_free_subclass(TRANSCRIBE_EVENT_NO_SPEECH_DETECTED);
return SWITCH_STATUS_SUCCESS;
}
|
||||
43
mod_azure_transcribe/mod_azure_transcribe.h
Normal file
43
mod_azure_transcribe/mod_azure_transcribe.h
Normal file
@@ -0,0 +1,43 @@
|
||||
/* NOTE(review): double-underscore identifiers are reserved; consider renaming
 * the guard to MOD_AZURE_TRANSCRIBE_H__ */
#ifndef __MOD_AZURE_TRANSCRIBE_H__
#define __MOD_AZURE_TRANSCRIBE_H__

#include <switch.h>
#include <speex/speex_resampler.h>

#include <unistd.h>

#define MY_BUG_NAME "azure_transcribe"
#define MAX_BUG_LEN (64)
#define MAX_SESSION_ID (256)
/* custom event subclasses fired by responseHandler */
#define TRANSCRIBE_EVENT_RESULTS "azure_transcribe::transcription"
#define TRANSCRIBE_EVENT_START_OF_UTTERANCE "azure_transcribe::start_of_utterance"
#define TRANSCRIBE_EVENT_END_OF_UTTERANCE "azure_transcribe::end_of_utterance"
#define TRANSCRIBE_EVENT_NO_SPEECH_DETECTED "azure_transcribe::no_speech_detected"
#define TRANSCRIBE_EVENT_VAD_DETECTED "azure_transcribe::vad_detected"
/* NOTE(review): uses the jambonz_ prefix unlike the others -- confirm intentional */
#define TRANSCRIBE_EVENT_ERROR "jambonz_transcribe::error"

#define MAX_LANG (12)
#define MAX_REGION (32)
#define MAX_SUBSCRIPTION_KEY_LEN (256)

/* per-channel data */
/* callback invoked by the glue layer to surface transcription events */
typedef void (*responseHandler_t)(switch_core_session_t* session, const char* event, const char * json, const char* bugname, int finished);

struct cap_cb {
switch_mutex_t *mutex;
char sessionId[MAX_SESSION_ID+1];
char bugname[MAX_BUG_LEN+1];
/* NOTE(review): no +1 slack here (nor on lang/region below); writers must
 * guarantee NUL termination themselves */
char subscriptionKey[MAX_SUBSCRIPTION_KEY_LEN];
uint32_t channels;
SpeexResamplerState *resampler;
void* streamer;   /* opaque GStreamer*, owned until reaper() takes it */
responseHandler_t responseHandler;
int interim;

char lang[MAX_LANG];
char region[MAX_REGION];

switch_vad_t * vad;  /* non-NULL when connection is deferred until speech */
};

#endif
|
||||
51
mod_azure_transcribe/simple_buffer.h
Normal file
51
mod_azure_transcribe/simple_buffer.h
Normal file
@@ -0,0 +1,51 @@
|
||||
/**
|
||||
* (very) simple and limited circular buffer,
|
||||
* supporting only the use case of doing all of the adds
|
||||
* and then subsequently retrieves.
|
||||
*
|
||||
*/
|
||||
/**
 * (very) simple and limited circular buffer, supporting only the use case of
 * doing all of the adds and then subsequently retrieving them.
 *
 * Fixes vs the original:
 *  - getNextChunk() no longer post-decrements numItems when the buffer is
 *    empty (the uint32_t counter wrapped to UINT32_MAX, corrupting getNumItems()).
 *  - retrieval now starts at the oldest buffered chunk; previously it started
 *    at the next *write* slot, so a partially-filled buffer returned
 *    unwritten memory and skipped real data.
 */
class SimpleBuffer {
public:
  // chunkSize: bytes per chunk; numChunks: capacity before oldest data is overwritten
  SimpleBuffer(uint32_t chunkSize, uint32_t numChunks) : numItems(0),
    m_numChunks(numChunks), m_chunkSize(chunkSize) {
    m_pData = new char[chunkSize * numChunks];
    m_pNextWrite = m_pData;
  }
  ~SimpleBuffer() {
    delete [] m_pData;
  }

  // Append datalen bytes, which must be a whole number of chunks (otherwise
  // the call is ignored).  When full, the oldest chunks are overwritten.
  void add(void *data, uint32_t datalen) {
    if (datalen % m_chunkSize != 0) return;
    uint32_t chunks = datalen / m_chunkSize;
    for (uint32_t i = 0; i < chunks; i++) {
      memcpy(m_pNextWrite, data, m_chunkSize);
      data = static_cast<char*>(data) + m_chunkSize;
      if (numItems < m_numChunks) numItems++;

      uint32_t offset = (m_pNextWrite - m_pData) / m_chunkSize;
      if (offset >= m_numChunks - 1) m_pNextWrite = m_pData;
      else m_pNextWrite += m_chunkSize;
    }
  }

  // Return a pointer to the oldest unread chunk (m_chunkSize bytes), or
  // nullptr when nothing remains.
  char* getNextChunk() {
    if (numItems == 0) return nullptr;
    uint32_t writeIdx = (m_pNextWrite - m_pData) / m_chunkSize;
    // the oldest chunk sits numItems slots behind the next write position
    uint32_t readIdx = (writeIdx + m_numChunks - numItems) % m_numChunks;
    numItems--;
    return m_pData + readIdx * m_chunkSize;
  }

  uint32_t getNumItems() { return numItems;}

private:
  char *m_pData;
  uint32_t numItems;
  uint32_t m_chunkSize;
  uint32_t m_numChunks;
  char* m_pNextWrite;
};
|
||||
BIN
mod_cobalt_transcribe/.DS_Store
vendored
Normal file
BIN
mod_cobalt_transcribe/.DS_Store
vendored
Normal file
Binary file not shown.
8
mod_cobalt_transcribe/LICENSE
Normal file
8
mod_cobalt_transcribe/LICENSE
Normal file
@@ -0,0 +1,8 @@
|
||||
Copyright 2023, Drachtio Communications Services, LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
11
mod_cobalt_transcribe/Makefile.am
Normal file
11
mod_cobalt_transcribe/Makefile.am
Normal file
@@ -0,0 +1,11 @@
|
||||
include $(top_srcdir)/build/modmake.rulesam
|
||||
MODNAME=mod_cobalt_transcribe
|
||||
|
||||
mod_LTLIBRARIES = mod_cobalt_transcribe.la
|
||||
mod_cobalt_transcribe_la_SOURCES = mod_cobalt_transcribe.c cobalt_glue.cpp
|
||||
mod_cobalt_transcribe_la_CFLAGS = $(AM_CFLAGS)
|
||||
mod_cobalt_transcribe_la_CXXFLAGS = -I $(top_srcdir)/libs/googleapis/gens -I $(top_srcdir)/libs/cobalt-asr-grpc-api/stubs $(AM_CXXFLAGS) -std=c++17
|
||||
|
||||
mod_cobalt_transcribe_la_LIBADD = $(switch_builddir)/libfreeswitch.la
|
||||
mod_cobalt_transcribe_la_LDFLAGS = -avoid-version -module -no-undefined -shared `pkg-config --libs grpc++ grpc`
|
||||
|
||||
53
mod_cobalt_transcribe/README.md
Normal file
53
mod_cobalt_transcribe/README.md
Normal file
@@ -0,0 +1,53 @@
|
||||
# mod_cobalt_transcribe
|
||||
|
||||
A Freeswitch module that generates real-time transcriptions on a Freeswitch channel by using the [streaming transcription API](https://docs-v2.cobaltspeech.com/docs/asr/) from [Cobalt Speech](https://www.cobaltspeech.com/). Cobalt Speech provides a speech recognition product that can be run on-prem on a Linux server.
|
||||
|
||||
## API
|
||||
|
||||
### Commands
|
||||
The freeswitch module exposes the following API commands:
|
||||
|
||||
```
|
||||
uuid_cobalt_get_version <uuid> <hostport>
|
||||
```
|
||||
Returns version information about the Cobalt server listening at the specified ip address and port
|
||||
|
||||
```
|
||||
uuid_cobalt_list_models <uuid> <hostport>
|
||||
```
|
||||
Lists the available models for a Cobalt speech server
|
||||
|
||||
```
|
||||
uuid_cobalt_compile_context <uuid> <hostport> <model> <token> <phrases>
|
||||
```
|
||||
Compiles a list of hint phrases into a context string that can later be used in a transcribe command. The context string is returned as a base64-encoded string. Hints must be compiled within the context of a single model, thus it is required to provide the model name. Hints must also be associated with a "token"; the default token that you may generally use is "unk:default". See [here](https://docs-v2.cobaltspeech.com/docs/asr/transcribe/recognition_context/) for more details.
|
||||
|
||||
```
|
||||
uuid_cobalt_transcribe <uuid> hostport start model [interim|full] [stereo|mono] [bug-name]
|
||||
```
|
||||
Attaches media bug to channel and performs streaming recognize request.
|
||||
|
||||
```
|
||||
uuid_cobalt_transcribe <uuid> hostport stop model
|
||||
```
|
||||
Stop transcription on a channel.
|
||||
|
||||
|
||||
### Channel Variables
|
||||
|
||||
| variable | Description |
|
||||
| --- | ----------- |
|
||||
| COBALT_ENABLE_CONFUSION_NETWORK | if true, enable [confusion network](https://docs-v2.cobaltspeech.com/docs/asr/transcribe/#confusion-network) |
|
||||
| COBALT_METADATA | custom metadata to send with a transcribe request |
|
||||
| COBALT_COMPILED_CONTEXT_DATA | base64-encoded compiled context hints to include with the transcribe request |
|
||||
|
||||
|
||||
### Events
|
||||
`cobalt_speech::transcription` - returns an interim or final transcription. The event contains a JSON body describing the transcription result.
|
||||
|
||||
`cobalt_speech::version_response` - returns the response to a `uuid_cobalt_get_version` request. The event contains a JSON body describing the version.
|
||||
|
||||
`cobalt_speech::model_list_response` - returns the response to a `uuid_cobalt_list_models` request. The event contains a JSON body describing the available models.
|
||||
|
||||
`cobalt_speech::compile_context_response` - returns the response to a uuid_cobalt_compile_context request. The event contains a JSON body containing the base64-encoded context.
|
||||
|
||||
775
mod_cobalt_transcribe/cobalt_glue.cpp
Normal file
775
mod_cobalt_transcribe/cobalt_glue.cpp
Normal file
@@ -0,0 +1,775 @@
|
||||
#include <cstdlib>
|
||||
#include <algorithm>
|
||||
#include <future>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
|
||||
#include <switch.h>
|
||||
#include <switch_json.h>
|
||||
#include <grpc++/grpc++.h>
|
||||
|
||||
#include "cobaltspeech/transcribe/v5/transcribe.grpc.pb.h"
|
||||
|
||||
namespace cobalt_asr = cobaltspeech::transcribe::v5;
|
||||
|
||||
#include "mod_cobalt_transcribe.h"
|
||||
#include "simple_buffer.h"
|
||||
|
||||
#define CHUNKSIZE (320)
|
||||
#define DEFAULT_CONTEXT_TOKEN "unk:default"
|
||||
|
||||
namespace {
|
||||
// Compare two strings case-insensitively.
// Returns 1 when they match, 0 otherwise.
int case_insensitive_match(std::string s1, std::string s2) {
  // Lowercase both working copies. Cast through unsigned char before calling
  // ::tolower: passing a plain char holding a negative value is undefined
  // behavior per the C standard (relevant for non-ASCII bytes).
  std::transform(s1.begin(), s1.end(), s1.begin(),
                 [](unsigned char c) { return static_cast<char>(::tolower(c)); });
  std::transform(s2.begin(), s2.end(), s2.begin(),
                 [](unsigned char c) { return static_cast<char>(::tolower(c)); });
  if (s1.compare(s2) == 0)
    return 1; // the strings are the same
  return 0;   // not matched
}
|
||||
// Return a copy of str with leading and trailing whitespace
// (spaces, tabs, newlines, carriage returns) removed.
// An all-whitespace or empty input yields "".
std::string trim(const std::string& str) {
  static const char* kWhitespace = " \t\n\r";
  const size_t first = str.find_first_not_of(kWhitespace);
  if (first == std::string::npos) {
    return ""; // nothing but whitespace
  }
  const size_t last = str.find_last_not_of(kWhitespace);
  return str.substr(first, last - first + 1);
}
|
||||
|
||||
// Split input on delimiter and strip leading/trailing whitespace from each
// piece. Empty pieces (e.g. from "a,,b") are preserved as empty strings,
// matching std::getline's tokenization.
std::vector<std::string> splitAndTrim(const char* input, char delimiter) {
  std::vector<std::string> pieces;
  std::istringstream stream{std::string(input)};
  std::string piece;

  while (std::getline(stream, piece, delimiter)) {
    // inline whitespace strip (same as trim())
    const size_t first = piece.find_first_not_of(" \t\n\r");
    if (first == std::string::npos) {
      pieces.push_back("");
    } else {
      const size_t last = piece.find_last_not_of(" \t\n\r");
      pieces.push_back(piece.substr(first, last - first + 1));
    }
  }

  return pieces;
}
|
||||
// Base64-encode input per the standard alphabet (RFC 4648), including '='
// padding so the output length is always a multiple of four.
std::string base64_encode(const std::string &input) {
  static const std::string alphabet =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  std::string out;
  int accum = 0;  // rolling bit accumulator
  int bits = -6;  // bits held in excess of one 6-bit output group

  for (unsigned char ch : input) {
    accum = (accum << 8) + ch;
    bits += 8;
    while (bits >= 0) {
      out.push_back(alphabet[(accum >> bits) & 0x3F]);
      bits -= 6;
    }
  }

  // Flush any leftover bits as one final, zero-padded symbol.
  if (bits > -6) {
    out.push_back(alphabet[((accum << 8) >> (bits + 8)) & 0x3F]);
  }
  // Pad with '=' until the length is a multiple of four.
  while (out.size() % 4) {
    out.push_back('=');
  }
  return out;
}
|
||||
|
||||
// Decode a base64 string (standard RFC 4648 alphabet). Decoding stops at the
// first byte outside the alphabet, which also handles trailing '=' padding.
std::string base64_decode(const std::string &input) {
  static const std::string alphabet =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  // Reverse lookup table: byte value -> 6-bit symbol index, -1 if not in alphabet.
  std::vector<int> lookup(256, -1);
  for (int i = 0; i < 64; i++) {
    lookup[static_cast<unsigned char>(alphabet[i])] = i;
  }

  std::string out;
  int accum = 0;  // rolling bit accumulator
  int bits = -8;  // bits held in excess of one output byte

  for (unsigned char ch : input) {
    if (lookup[ch] == -1) break; // padding or invalid byte ends the data
    accum = (accum << 6) + lookup[ch];
    bits += 6;
    while (bits >= 0) {
      out.push_back(char((accum >> bits) & 0xFF));
      bits -= 8;
    }
  }
  return out;
}
|
||||
|
||||
/**
 * Compile a set of hint phrases into a recognition context on the cobalt
 * server, fire a TRANSCRIBE_EVENT_COMPILE_CONTEXT_RESPONSE event containing
 * the base64-encoded compiled context, and return non-NULL on success.
 *
 * phrases is either a JSON array of {"phrase": ..., "boost": ...} objects or
 * a simple comma-separated list of phrases.
 *
 * Returns the caller-owned phrases pointer on success, NULL on failure.
 * NOTE: callers must treat the return strictly as a success flag.
 */
const char* compile_context_phrases(switch_core_session_t *session, const char* hostport, const char* model, const char* token, const char* phrases) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_event_t *event;

  grpc::ClientContext context;
  std::shared_ptr<grpc::Channel> grpcChannel = grpc::CreateChannel(hostport, grpc::InsecureChannelCredentials());

  if (!grpcChannel) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "failed creating grpc channel\n");
    return nullptr;
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "compile context, model: %s, token: %s, phrases: %s\n", model, token, phrases);

  std::unique_ptr<cobalt_asr::TranscribeService::Stub> stub = cobalt_asr::TranscribeService::NewStub(grpcChannel);

  cobalt_asr::CompileContextRequest request;
  cobalt_asr::CompileContextResponse response;

  request.set_model_id(model);
  request.set_token(token);
  request.clear_phrases();

  /* hints are either a JSON array of objects containing a "phrase" and an
   * optional "boost" value, or a simple comma-separated list of phrases */
  cJSON *jPhrases = cJSON_Parse(phrases);
  if (jPhrases && cJSON_IsArray(jPhrases)) {
    int count = 0;
    cJSON *jPhrase = NULL;
    /* BUG FIX: the loop previously iterated jPhrase over itself
     * (cJSON_ArrayForEach(jPhrase, jPhrase)) so no hints were ever added */
    cJSON_ArrayForEach(jPhrase, jPhrases) {
      cJSON *jItem = cJSON_GetObjectItem(jPhrase, "phrase");
      if (jItem) {
        /* only add a proto phrase when the entry actually has text
         * (previously an empty phrase was added for malformed entries) */
        auto* contextPhrase = request.add_phrases();
        const char* text = cJSON_GetStringValue(jItem);
        contextPhrase->set_text(text);
        /* BUG FIX: format specifier was %f for a string argument (UB) */
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "added text: %s\n", text);
        cJSON *jBoost = cJSON_GetObjectItem(jPhrase, "boost");
        if (jBoost) {
          float boost = (float) jBoost->valuedouble;
          contextPhrase->set_boost(boost);
          switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "added boost value: %f\n", boost);
        }
        count++;
      }
    }
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "added %d hints\n", count);
  }
  else {
    /* renamed from `token`, which shadowed the function parameter */
    std::vector<std::string> hints = splitAndTrim(phrases, ',');
    for (const std::string& hint : hints) {
      auto* contextPhrase = request.add_phrases();
      contextPhrase->set_text(hint);
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "added: %s\n", hint.c_str());
    }
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "added %d hints\n", request.phrases_size());
  }
  if (jPhrases) cJSON_Delete(jPhrases);

  grpc::Status status = stub->CompileContext(&context, request, &response);
  if (!status.ok()) {
    /* log but still fire the event below so listeners always get a response */
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "CompileContext failed: %s (%d)\n",
      status.error_message().c_str(), status.error_code());
  }

  cJSON * jResult = cJSON_CreateObject();
  cJSON_AddBoolToObject(jResult, "has_context", response.has_context());
  auto& c = response.context();
  auto data = base64_encode(c.data());
  cJSON_AddItemToObject(jResult, "compiled_context", cJSON_CreateString(data.c_str()));
  cJSON_AddItemToObject(jResult, "phrases", cJSON_CreateString(phrases));

  char* json = cJSON_PrintUnformatted(jResult);

  switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_COMPILE_CONTEXT_RESPONSE);
  switch_channel_event_set_data(channel, event);
  switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "cobalt");
  switch_event_add_body(event, "%s", json);
  switch_event_fire(&event);

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "compile context response for cobalt speech: %s\n", json);

  free(json);
  cJSON_Delete(jResult);

  /* BUG FIX: previously returned response.context().data().c_str(), a pointer
   * into a protobuf local to this function — dangling on return. The sole
   * caller only null-checks the result, so return the caller-owned phrases
   * pointer as the success flag instead. (Also removed an unused strdup of
   * phrases that was allocated and immediately freed.) */
  return response.has_context() ? phrases : nullptr;
}
|
||||
|
||||
}
|
||||
|
||||
class GStreamer {
|
||||
public:
|
||||
GStreamer(
|
||||
switch_core_session_t *session, const char* hostport, const char* model, uint32_t channels, int interim) :
|
||||
m_session(session),
|
||||
m_writesDone(false),
|
||||
m_connected(false),
|
||||
m_interim(interim),
|
||||
m_hostport(hostport),
|
||||
m_model(model),
|
||||
m_channelCount(channels),
|
||||
m_audioBuffer(CHUNKSIZE, 15) {
|
||||
|
||||
const char* var;
|
||||
char sessionId[256];
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
strncpy(m_sessionId, switch_core_session_get_uuid(session), 256);
|
||||
}
|
||||
|
||||
~GStreamer() {
|
||||
//switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_INFO, "GStreamer::~GStreamer - deleting channel and stub: %p\n", (void*)this);
|
||||
}
|
||||
|
||||
std::shared_ptr<grpc::Channel> createGrpcConnection() {
|
||||
switch_channel_t *channel = switch_core_session_get_channel(m_session);
|
||||
|
||||
std::shared_ptr<grpc::Channel> grpcChannel ;
|
||||
grpcChannel = grpc::CreateChannel(m_hostport, grpc::InsecureChannelCredentials());
|
||||
|
||||
if (!grpcChannel) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "GStreamer %p failed creating grpc channel\n", this);
|
||||
throw std::runtime_error(std::string("Error creating grpc channel"));
|
||||
}
|
||||
|
||||
m_stub = std::move(cobalt_asr::TranscribeService::NewStub(grpcChannel));
|
||||
return grpcChannel;
|
||||
}
|
||||
|
||||
void connect() {
|
||||
const char* var;
|
||||
switch_channel_t *channel = switch_core_session_get_channel(m_session);
|
||||
|
||||
assert(!m_connected);
|
||||
// Begin a stream.
|
||||
|
||||
std::shared_ptr<grpc::Channel> grpcChannel = createGrpcConnection();
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p creating streamer\n", this);
|
||||
m_streamer = m_stub->StreamingRecognize(&m_context);
|
||||
m_connected = true;
|
||||
|
||||
/* set configuration parameters which are carried in the RecognitionInitMessage */
|
||||
auto config = m_request.mutable_config();
|
||||
auto format = config->mutable_audio_format_raw();
|
||||
config->set_model_id(m_model);
|
||||
format->set_encoding(cobalt_asr::AudioEncoding::AUDIO_ENCODING_SIGNED);
|
||||
format->set_bit_depth(16);
|
||||
format->set_sample_rate(8000);
|
||||
format->set_channels(m_channelCount);
|
||||
format->set_byte_order(cobalt_asr::ByteOrder::BYTE_ORDER_LITTLE_ENDIAN);
|
||||
|
||||
// confusion network
|
||||
if (switch_true(switch_channel_get_variable(channel, "COBALT_ENABLE_CONFUSION_NETWORK"))) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p set_enable_confusion_network true\n", this);
|
||||
config->set_enable_confusion_network(true);
|
||||
}
|
||||
// metadata
|
||||
if (var = switch_channel_get_variable(channel, "COBALT_METADATA")) {
|
||||
auto metadata = config->mutable_metadata();
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p cobalt metadata %s\n", this, var);
|
||||
metadata->set_custom_metadata(var);
|
||||
}
|
||||
|
||||
// set_enable_word_details
|
||||
if (switch_true(switch_channel_get_variable(channel, "COBALT_ENABLE_WORD_TIME_OFFSETS"))) {
|
||||
config->set_enable_word_details(true);
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p enable word-level details\n", this);
|
||||
}
|
||||
|
||||
// compiled context data
|
||||
if (var = switch_channel_get_variable(channel, "COBALT_COMPILED_CONTEXT_DATA")) {
|
||||
auto data = base64_decode(var);
|
||||
config->mutable_context()->add_compiled()->set_data(data);
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p set compiled context %s\n", this, var);
|
||||
}
|
||||
|
||||
// read thread is waiting on this
|
||||
m_promise.set_value();
|
||||
|
||||
// Write the first request, containing the config only.
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p sending initial message\n", this);
|
||||
m_streamer->Write(m_request);
|
||||
m_request.clear_config();
|
||||
|
||||
// send any buffered audio
|
||||
int nFrames = m_audioBuffer.getNumItems();
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p got stream ready, %d buffered frames\n", this, nFrames);
|
||||
if (nFrames) {
|
||||
char *p;
|
||||
do {
|
||||
p = m_audioBuffer.getNextChunk();
|
||||
if (p) {
|
||||
write(p, CHUNKSIZE);
|
||||
}
|
||||
} while (p);
|
||||
}
|
||||
}
|
||||
|
||||
bool write(void* data, uint32_t datalen) {
|
||||
if (!m_connected) {
|
||||
if (datalen % CHUNKSIZE == 0) {
|
||||
m_audioBuffer.add(data, datalen);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
m_request.clear_audio();
|
||||
m_request.mutable_audio()->set_data(data, datalen);
|
||||
bool ok = m_streamer->Write(m_request);
|
||||
return ok;
|
||||
}
|
||||
|
||||
uint32_t nextMessageSize(void) {
|
||||
uint32_t size = 0;
|
||||
m_streamer->NextMessageSize(&size);
|
||||
return size;
|
||||
}
|
||||
|
||||
bool read(cobalt_asr::StreamingRecognizeResponse* response) {
|
||||
return m_streamer->Read(response);
|
||||
}
|
||||
|
||||
grpc::Status finish() {
|
||||
return m_streamer->Finish();
|
||||
}
|
||||
|
||||
void writesDone() {
|
||||
// grpc crashes if we call this twice on a stream
|
||||
if (!m_connected) {
|
||||
cancelConnect();
|
||||
}
|
||||
else if (!m_writesDone) {
|
||||
m_streamer->WritesDone();
|
||||
m_writesDone = true;
|
||||
}
|
||||
}
|
||||
|
||||
bool waitForConnect() {
|
||||
std::shared_future<void> sf(m_promise.get_future());
|
||||
sf.wait();
|
||||
return m_connected;
|
||||
}
|
||||
|
||||
void cancelConnect() {
|
||||
assert(!m_connected);
|
||||
m_promise.set_value();
|
||||
}
|
||||
|
||||
bool isConnected() {
|
||||
return m_connected;
|
||||
}
|
||||
|
||||
private:
|
||||
switch_core_session_t* m_session;
|
||||
grpc::ClientContext m_context;
|
||||
std::shared_ptr<grpc::Channel> m_channel;
|
||||
std::unique_ptr<cobalt_asr::TranscribeService::Stub> m_stub;
|
||||
cobalt_asr::StreamingRecognizeRequest m_request;
|
||||
std::unique_ptr< grpc::ClientReaderWriterInterface<cobalt_asr::StreamingRecognizeRequest, cobalt_asr::StreamingRecognizeResponse> > m_streamer;
|
||||
bool m_writesDone;
|
||||
bool m_connected;
|
||||
bool m_interim;
|
||||
std::string m_hostport;
|
||||
std::string m_model;
|
||||
std::promise<void> m_promise;
|
||||
SimpleBuffer m_audioBuffer;
|
||||
uint32_t m_channelCount;
|
||||
char m_sessionId[256];
|
||||
};
|
||||
|
||||
/**
 * Reader thread: waits for the stream to connect, then drains streaming
 * recognize responses, converting each result to JSON and delivering it via
 * cb->responseHandler. Exits when the stream ends or the session disappears.
 */
static void *SWITCH_THREAD_FUNC grpc_read_thread(switch_thread_t *thread, void *obj) {
  struct cap_cb *cb = (struct cap_cb *) obj;
  GStreamer* streamer = (GStreamer *) cb->streamer;

  /* released by GStreamer::connect() or cancelConnect() */
  bool connected = streamer->waitForConnect();
  if (!connected) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "cobalt transcribe grpc read thread exiting since we didnt connect\n") ;
    return nullptr;
  }

  // Read responses.
  cobalt_asr::StreamingRecognizeResponse response;
  while (streamer->read(&response)) {  // Returns false when no more to read.
    /* take a read lock on the session for each response so it cannot be
     * destroyed while we deliver the result */
    switch_core_session_t* session = switch_core_session_locate(cb->sessionId);
    if (!session) {
      /* NOTE(review): exiting here skips streamer->finish(); session teardown
       * (cobalt_speech_session_cleanup) owns the remaining cleanup */
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "grpc_read_thread: session %s is gone!\n", cb->sessionId) ;
      return nullptr;
    }
    if (response.has_error()) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "grpc_read_thread: error: %s\n", response.error().message().c_str()) ;
    }
    if (!response.has_result()) {
      switch_core_session_rwunlock(session);
      continue;
    }

    const auto& result = response.result();
    auto is_final = !result.is_partial();
    auto audio_channel = result.audio_channel();

    /* build the JSON event body: { is_final, channel, alternatives: [...] } */
    cJSON * jResult = cJSON_CreateObject();
    cJSON * jAlternatives = cJSON_CreateArray();
    cJSON_AddItemToObject(jResult, "is_final", cJSON_CreateBool(is_final));
    cJSON_AddItemToObject(jResult, "channel", cJSON_CreateNumber(audio_channel));
    cJSON_AddItemToObject(jResult, "alternatives", jAlternatives);

    for (int a = 0; a < result.alternatives_size(); ++a) {
      auto alternative = result.alternatives(a);
      cJSON* jAlt = cJSON_CreateObject();

      cJSON_AddItemToObject(jAlt, "confidence", cJSON_CreateNumber(alternative.confidence()));
      cJSON_AddItemToObject(jAlt, "transcript_formatted", cJSON_CreateString(alternative.transcript_formatted().c_str()));
      cJSON_AddItemToObject(jAlt, "transcript_raw", cJSON_CreateString(alternative.transcript_raw().c_str()));
      cJSON_AddItemToObject(jAlt, "start_time_ms", cJSON_CreateNumber(alternative.start_time_ms()));
      cJSON_AddItemToObject(jAlt, "duration_ms", cJSON_CreateNumber(alternative.duration_ms()));

      /* word-level timing/confidence, present when COBALT_ENABLE_WORD_TIME_OFFSETS was set */
      if (alternative.has_word_details()) {
        cJSON * jWords = cJSON_CreateArray();
        cJSON * jWordsRaw = cJSON_CreateArray();
        auto& word_details = alternative.word_details();
        for (int b = 0; b < word_details.formatted_size(); ++b) {
          cJSON* jWord = cJSON_CreateObject();
          auto& word_info = word_details.formatted(b);
          cJSON_AddItemToObject(jWord, "word", cJSON_CreateString(word_info.word().c_str()));
          cJSON_AddItemToObject(jWord, "confidence", cJSON_CreateNumber(word_info.confidence()));
          cJSON_AddItemToObject(jWord, "start_time_ms", cJSON_CreateNumber(word_info.start_time_ms()));
          cJSON_AddItemToObject(jWord, "duration_ms", cJSON_CreateNumber(word_info.duration_ms()));

          cJSON_AddItemToArray(jWords, jWord);
        }
        cJSON_AddItemToObject(jAlt, "formatted_words", jWords);

        for (int c = 0; c < word_details.raw_size(); ++c) {
          cJSON* jWord = cJSON_CreateObject();
          auto& word_info = word_details.raw(c);
          cJSON_AddItemToObject(jWord, "word", cJSON_CreateString(word_info.word().c_str()));
          cJSON_AddItemToObject(jWord, "confidence", cJSON_CreateNumber(word_info.confidence()));
          cJSON_AddItemToObject(jWord, "start_time_ms", cJSON_CreateNumber(word_info.start_time_ms()));
          cJSON_AddItemToObject(jWord, "duration_ms", cJSON_CreateNumber(word_info.duration_ms()));

          cJSON_AddItemToArray(jWordsRaw, jWord);
        }
        cJSON_AddItemToObject(jAlt, "raw_words", jWordsRaw);

      }
      cJSON_AddItemToArray(jAlternatives, jAlt);
    }
    char* json = cJSON_PrintUnformatted(jResult);
    /* BUG FIX: log message previously said "cobalt models" (copy-paste from list_models) */
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "cobalt transcription: %s\n", json) ;
    cb->responseHandler(session, (const char *) json, cb->bugname, NULL);
    free(json);

    cJSON_Delete(jResult);

    switch_core_session_rwunlock(session);
  }

  /* the stream ended normally; collect the terminal status */
  {
    switch_core_session_t* session = switch_core_session_locate(cb->sessionId);
    if (session) {
      grpc::Status status = streamer->finish();
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: finish() status %s (%d)\n", status.error_message().c_str(), status.error_code()) ;
      switch_core_session_rwunlock(session);
    }
  }

  return nullptr;
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
/**
 * Query the cobalt server at hostport for its version and fire a
 * TRANSCRIBE_EVENT_VERSION_RESPONSE event with a JSON body {"version": ...}.
 */
switch_status_t cobalt_speech_get_version(switch_core_session_t *session, char* hostport) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_event_t *event;

  grpc::ClientContext context;
  std::shared_ptr<grpc::Channel> grpcChannel = grpc::CreateChannel(hostport, grpc::InsecureChannelCredentials());

  if (!grpcChannel) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "failed creating grpc channel\n");
    return SWITCH_STATUS_FALSE;
  }

  std::unique_ptr<cobalt_asr::TranscribeService::Stub> stub = cobalt_asr::TranscribeService::NewStub(grpcChannel);

  cobalt_asr::VersionResponse response;
  grpc::Status status = stub->Version(&context, cobalt_asr::VersionRequest(), &response);
  if (!status.ok()) {
    /* previously the status was silently ignored; log it, but still fire the
     * event so listeners always receive a (possibly empty) response */
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Version request failed: %s (%d)\n",
      status.error_message().c_str(), status.error_code());
  }

  cJSON * jResult = cJSON_CreateObject();
  cJSON_AddItemToObject(jResult, "version", cJSON_CreateString(response.version().c_str()));

  char* json = cJSON_PrintUnformatted(jResult);

  switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_VERSION_RESPONSE);
  switch_channel_event_set_data(channel, event);
  switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "cobalt");
  switch_event_add_body(event, "%s", json);
  switch_event_fire(&event);
  /* NOTE(review): switch_event_fire consumes the event; this destroy is a
   * no-op safety net kept from the original */
  switch_event_destroy(&event);

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "retrieved version for cobalt speech: %s\n", json);

  free(json);
  cJSON_Delete(jResult);

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/**
 * Public entry point for uuid_cobalt_compile_context: compile hint phrases
 * into a recognition context (the event with the base64 result is fired by
 * compile_context_phrases). Returns SWITCH_STATUS_SUCCESS on success.
 */
switch_status_t cobalt_speech_compile_context(switch_core_session_t *session, char* hostport, char* model, char* token, char* phrases) {
  /* removed unused channel/event locals from the original */
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "compile context, model: %s, token: %s, phrases: %s\n", model, token, phrases);

  return compile_context_phrases(session, hostport, model, token, phrases) != nullptr ?
    SWITCH_STATUS_SUCCESS :
    SWITCH_STATUS_FALSE;
}
|
||||
|
||||
|
||||
/**
 * Query the cobalt server at hostport for its available models and fire a
 * TRANSCRIBE_EVENT_MODEL_LIST_RESPONSE event whose JSON body is an array of
 * model descriptions (id, name, attributes).
 */
switch_status_t cobalt_speech_list_models(switch_core_session_t *session, char* hostport) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_event_t *event;

  grpc::ClientContext context;
  std::shared_ptr<grpc::Channel> grpcChannel = grpc::CreateChannel(hostport, grpc::InsecureChannelCredentials());

  if (!grpcChannel) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "failed creating grpc channel\n");
    return SWITCH_STATUS_FALSE;
  }

  std::unique_ptr<cobalt_asr::TranscribeService::Stub> stub = cobalt_asr::TranscribeService::NewStub(grpcChannel);

  cobalt_asr::ListModelsResponse response;
  grpc::Status status = stub->ListModels(&context, cobalt_asr::ListModelsRequest(), &response);
  if (!status.ok()) {
    /* previously ignored; log it, but still fire the (possibly empty) event */
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "ListModels request failed: %s (%d)\n",
      status.error_message().c_str(), status.error_code());
  }

  cJSON * jModels = cJSON_CreateArray();
  for (int i = 0; i < response.models_size(); i++) {
    auto model = response.models(i);
    cJSON* jModel = cJSON_CreateObject();
    cJSON * jAttributes = cJSON_CreateArray();

    cJSON_AddItemToArray(jModels, jModel);
    cJSON_AddItemToObject(jModel, "attributes", jAttributes);
    cJSON_AddItemToObject(jModel, "id", cJSON_CreateString(model.id().c_str()));
    cJSON_AddItemToObject(jModel, "name", cJSON_CreateString(model.name().c_str()));

    if (model.has_attributes()) {
      auto& attributes = model.attributes();
      cJSON* jAttr = cJSON_CreateObject();
      cJSON_AddItemToArray(jAttributes, jAttr);

      /* supported sample rates */
      cJSON * jSupportedSampleRates = cJSON_CreateArray();
      cJSON_AddItemToObject(jAttr, "supported_sample_rates", jSupportedSampleRates);
      for (int j = 0; j < attributes.supported_sample_rates_size(); j++) {
        /* BUG FIX: was cJSON_AddItemToObject on an array, which attaches a
         * spurious key; arrays take cJSON_AddItemToArray */
        cJSON_AddItemToArray(jSupportedSampleRates, cJSON_CreateNumber(attributes.supported_sample_rates(j)));
      }

      /* sample rate */
      cJSON_AddItemToObject(jAttr, "sample_rate", cJSON_CreateNumber(attributes.sample_rate()));

      /* context info */
      auto& context_info = attributes.context_info();
      cJSON * jContextInfo = cJSON_CreateObject();
      cJSON* jAllowedContextTokens = cJSON_CreateArray();
      cJSON_AddItemToObject(jAttr, "context_info", jContextInfo);
      cJSON_AddItemToObject(jContextInfo, "allowed_context_tokens", jAllowedContextTokens);
      for (int j = 0; j < context_info.allowed_context_tokens_size(); j++) {
        cJSON_AddItemToArray(jAllowedContextTokens, cJSON_CreateString(context_info.allowed_context_tokens(j).c_str()));
      }

      cJSON_AddBoolToObject(jContextInfo, "supports_context", context_info.supports_context());
    }
  }

  char* json = cJSON_PrintUnformatted(jModels);

  switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_MODEL_LIST_RESPONSE);
  switch_channel_event_set_data(channel, event);
  switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "cobalt");
  switch_event_add_body(event, "%s", json);
  switch_event_fire(&event);
  /* NOTE(review): fire consumes the event; destroy kept as a no-op safety net */
  switch_event_destroy(&event);

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "retrieved %d models for cobalt speech: %s\n", response.models_size(), json);

  free(json);
  cJSON_Delete(jModels);

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
|
||||
/* Module-load hook: no global state to set up for the cobalt connector. */
switch_status_t cobalt_speech_init() {
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/* Module-shutdown hook: no global state to tear down for the cobalt connector. */
switch_status_t cobalt_speech_cleanup() {
  return SWITCH_STATUS_SUCCESS;
}
|
||||
/**
 * Set up real-time transcription for a session: allocate the per-session
 * cap_cb from the session pool (pool memory is released with the session),
 * create a resampler when the codec rate is not 8 kHz, optionally arm VAD-
 * deferred connection, construct the GStreamer, and start the gRPC read
 * thread. *ppUserData receives the cap_cb for the media bug callbacks.
 */
switch_status_t cobalt_speech_session_init(switch_core_session_t *session, responseHandler_t responseHandler, char* hostport,
  uint32_t samples_per_second, uint32_t channels, char* model, int interim, char *bugname, void **ppUserData) {

  switch_channel_t *channel = switch_core_session_get_channel(session);
  auto read_codec = switch_core_session_get_read_codec(session);
  uint32_t sampleRate = read_codec->implementation->actual_samples_per_second;
  struct cap_cb *cb;
  int err;

  cb = (struct cap_cb *) switch_core_session_alloc(session, sizeof(*cb));
  /* strncpy does not guarantee NUL termination; terminate explicitly */
  strncpy(cb->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
  cb->sessionId[MAX_SESSION_ID - 1] = '\0';
  strncpy(cb->bugname, bugname, MAX_BUG_LEN);
  cb->bugname[MAX_BUG_LEN - 1] = '\0';
  cb->end_of_utterance = 0;

  switch_mutex_init(&cb->mutex, SWITCH_MUTEX_NESTED, switch_core_session_get_pool(session));
  if (sampleRate != 8000) {
    /* cobalt stream is configured for 8 kHz; downsample the codec audio */
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "cobalt_speech_session_init: initializing resampler\n");
    cb->resampler = speex_resampler_init(channels, sampleRate, 8000, SWITCH_RESAMPLE_QUALITY, &err);
    if (0 != err) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing resampler: %s.\n",
        switch_channel_get_name(channel), speex_resampler_strerror(err));
      return SWITCH_STATUS_FALSE;
    }
  } else {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: no resampling needed for this call (bug: %s)\n", switch_channel_get_name(channel), bugname);
  }
  cb->responseHandler = responseHandler;

  // allocate vad if we are delaying connecting to the recognizer until we detect speech
  if (switch_channel_var_true(channel, "START_RECOGNIZING_ON_VAD")) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "cobalt_speech_session_init: initializing vad\n");
    cb->vad = switch_vad_init(sampleRate, channels);
    if (cb->vad) {
      const char* var;
      int mode = 2;
      int silence_ms = 150;
      int voice_ms = 250;
      int debug = 0;

      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_MODE"))) {
        mode = atoi(var);
      }
      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_SILENCE_MS"))) {
        silence_ms = atoi(var);
      }
      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_VOICE_MS"))) {
        voice_ms = atoi(var);
      }
      /* BUG FIX: this block previously re-read RECOGNIZER_VAD_VOICE_MS; the
       * declared-but-never-assigned `debug` indicates the debug var was
       * intended here (matches sibling jambonz transcribe modules) */
      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_DEBUG"))) {
        debug = atoi(var);
      }
      switch_vad_set_mode(cb->vad, mode);
      switch_vad_set_param(cb->vad, "silence_ms", silence_ms);
      switch_vad_set_param(cb->vad, "voice_ms", voice_ms);
      switch_vad_set_param(cb->vad, "debug", debug);
    }
  }

  GStreamer *streamer = NULL;
  try {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "cobalt_speech_session_init: allocating streamer\n");
    streamer = new GStreamer(session, hostport, model, channels, interim);
    cb->streamer = streamer;
  } catch (std::exception& e) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing gstreamer: %s.\n",
      switch_channel_get_name(channel), e.what());
    return SWITCH_STATUS_FALSE;
  }

  if (!cb->vad) {
    /* no VAD gating: open the recognize stream right away */
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "cobalt_speech_session_init: no vad so connecting to cobalt immediately\n");
    streamer->connect();
  }

  // create the read thread
  switch_threadattr_t *thd_attr = NULL;
  switch_memory_pool_t *pool = switch_core_session_get_pool(session);

  switch_threadattr_create(&thd_attr, pool);
  switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE);
  switch_thread_create(&cb->thread, thd_attr, grpc_read_thread, cb, pool);

  *ppUserData = cb;
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/**
 * Tear down transcription for a session: detach the bug's private data,
 * half-close the gRPC stream, join the read thread, and free the streamer,
 * resampler, and VAD. Safe against the race where the bug was already
 * detached. cb itself is pool-allocated and freed with the session.
 */
switch_status_t cobalt_speech_session_cleanup(switch_core_session_t *session, int channelIsClosing, switch_media_bug_t *bug) {
  switch_channel_t *channel = switch_core_session_get_channel(session);

  if (bug) {
    struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);
    switch_mutex_lock(cb->mutex);

    if (!switch_channel_get_private(channel, cb->bugname)) {
      // race condition: another path already cleaned up
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug %s is not attached (race).\n", switch_channel_get_name(channel), cb->bugname);
      switch_mutex_unlock(cb->mutex);
      return SWITCH_STATUS_FALSE;
    }
    switch_channel_set_private(channel, cb->bugname, NULL);

    // close connection and get final responses
    GStreamer* streamer = (GStreamer *) cb->streamer;

    if (streamer) {
      /* half-close the write side; the read thread drains remaining
       * responses, calls finish(), and exits */
      streamer->writesDone();

      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "cobalt_speech_session_cleanup: GStreamer (%p) waiting for read thread to complete\n", (void*)streamer);
      switch_status_t st;
      switch_thread_join(&st, cb->thread);
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "cobalt_speech_session_cleanup: GStreamer (%p) read thread completed\n", (void*)streamer);

      delete streamer;
      cb->streamer = NULL;
    }

    if (cb->resampler) {
      speex_resampler_destroy(cb->resampler);
      /* BUG FIX: pointer was left dangling after destroy (vad below was
       * nulled but resampler was not) */
      cb->resampler = NULL;
    }
    if (cb->vad) {
      switch_vad_destroy(&cb->vad);
      cb->vad = nullptr;
    }
    if (!channelIsClosing) {
      switch_core_media_bug_remove(session, &bug);
    }

    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "cobalt_speech_session_cleanup: Closed stream\n");

    switch_mutex_unlock(cb->mutex);

    return SWITCH_STATUS_SUCCESS;
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
  return SWITCH_STATUS_FALSE;
}
|
||||
|
||||
/**
 * Media-bug read callback: drain available audio frames, run optional VAD
 * gating (connecting the stream on first detected speech), resample to 8 kHz
 * when needed, and write the audio to the gRPC stream. Always returns
 * SWITCH_TRUE so the bug stays attached.
 */
switch_bool_t cobalt_speech_frame(switch_media_bug_t *bug, void* user_data) {
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);
  struct cap_cb *cb = (struct cap_cb *) user_data;
  if (cb->streamer && !cb->end_of_utterance) {
    GStreamer* streamer = (GStreamer *) cb->streamer;
    uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
    switch_frame_t frame = {};
    frame.data = data;
    frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;

    /* trylock: never block the media thread; if cleanup holds the lock we
     * simply skip this callback */
    if (switch_mutex_trylock(cb->mutex) == SWITCH_STATUS_SUCCESS) {
      while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS && !switch_test_flag((&frame), SFF_CNG)) {
        if (frame.datalen) {
          if (cb->vad && !streamer->isConnected()) {
            switch_vad_state_t state = switch_vad_process(cb->vad, (int16_t*) frame.data, frame.samples);
            if (state == SWITCH_VAD_STATE_START_TALKING) {
              /* BUG FIX: log previously said "google speech" (copy-paste) */
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "detected speech, connect to cobalt speech now\n");
              streamer->connect();
              cb->responseHandler(session, "vad_detected", cb->bugname, NULL);
            }
          }

          if (cb->resampler) {
            spx_int16_t out[SWITCH_RECOMMENDED_BUFFER_SIZE];
            spx_uint32_t out_len = SWITCH_RECOMMENDED_BUFFER_SIZE;
            spx_uint32_t in_len = frame.samples;

            speex_resampler_process_interleaved_int(cb->resampler,
              (const spx_int16_t *) frame.data,
              (spx_uint32_t *) &in_len,
              &out[0],
              &out_len);
            streamer->write( &out[0], sizeof(spx_int16_t) * out_len);
          }
          else {
            streamer->write( frame.data, sizeof(spx_int16_t) * frame.samples);
          }
        }
      }
      switch_mutex_unlock(cb->mutex);
    }
  }
  return SWITCH_TRUE;
}
|
||||
}
|
||||
14
mod_cobalt_transcribe/cobalt_glue.h
Normal file
14
mod_cobalt_transcribe/cobalt_glue.h
Normal file
@@ -0,0 +1,14 @@
|
||||
#ifndef __COBALT_GLUE_H__
#define __COBALT_GLUE_H__

/* Glue-layer interface between mod_cobalt_transcribe.c and the C++ gRPC
 * implementation.
 * NOTE(review): this header uses switch_* types and responseHandler_t
 * (declared in mod_cobalt_transcribe.h) without including anything, so it
 * is not self-contained; include mod_cobalt_transcribe.h first.
 */

/* one-time module init / teardown */
switch_status_t cobalt_speech_init();
switch_status_t cobalt_speech_cleanup();
/* create per-channel transcription state; *ppUserData receives the cap_cb
 * that is later handed to the media bug */
switch_status_t cobalt_speech_session_init(switch_core_session_t *session, responseHandler_t responseHandler, char* hostport,
  uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char *bugname, void **ppUserData);
/* tear down per-channel state; channelIsClosing distinguishes hangup from
 * an explicit stop command */
switch_status_t cobalt_speech_session_cleanup(switch_core_session_t *session, int channelIsClosing, switch_media_bug_t *bug);
/* media-bug read callback: feeds audio frames to the recognizer */
switch_bool_t cobalt_speech_frame(switch_media_bug_t *bug, void* user_data);
/* ancillary server queries */
switch_status_t cobalt_speech_list_models(switch_core_session_t *session, char* hostport);
switch_status_t cobalt_speech_get_version(switch_core_session_t *session, char* hostport);
switch_status_t cobalt_speech_compile_context(switch_core_session_t *session, char* hostport, char* model, char* token, char* phrases);

#endif
|
||||
368
mod_cobalt_transcribe/mod_cobalt_transcribe.c
Normal file
368
mod_cobalt_transcribe/mod_cobalt_transcribe.c
Normal file
@@ -0,0 +1,368 @@
|
||||
/*
|
||||
*
|
||||
* mod_cobalt_transcribe.c -- Freeswitch module for real-time transcription using cobalt's gRPC interface
|
||||
*
|
||||
*/
|
||||
#include "mod_cobalt_transcribe.h"
|
||||
#include "cobalt_glue.h"
|
||||
#include <stdlib.h>
|
||||
#include <switch.h>
|
||||
#include <switch_curl.h>
|
||||
|
||||
|
||||
/* Prototypes */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_transcribe_shutdown);
|
||||
SWITCH_MODULE_RUNTIME_FUNCTION(mod_transcribe_runtime);
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_transcribe_load);
|
||||
|
||||
SWITCH_MODULE_DEFINITION(mod_cobalt_transcribe, mod_transcribe_load, mod_transcribe_shutdown, NULL);
|
||||
|
||||
|
||||
static switch_status_t do_stop(switch_core_session_t *session, char* bugname);
|
||||
|
||||
/* Fire a custom FreeSWITCH event carrying a recognizer result.  The json
   argument doubles as a discriminator: the literal strings "vad_detected"
   and "error" select special event subclasses; anything else is treated as
   a transcription payload. */
static void responseHandler(switch_core_session_t* session, const char * json, const char* bugname,
  const char* details) {
  switch_event_t *event;
  switch_channel_t *channel = switch_core_session_get_channel(session);
  int is_vad = (0 == strcmp("vad_detected", json));
  int is_error = !is_vad && (0 == strcmp("error", json));

  if (is_vad) {
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_VAD_DETECTED);
    switch_channel_event_set_data(channel, event);
    switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "cobalt");
  }
  else if (is_error) {
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_ERROR);
    switch_channel_event_set_data(channel, event);
    switch_event_add_body(event, "%s", details);
    switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "cobalt");
  }
  else {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s json payload: %s.\n", bugname ? bugname : "cobalt_transcribe", json);

    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_RESULTS);
    switch_channel_event_set_data(channel, event);
    switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "cobalt");
    switch_event_add_body(event, "%s", json);
  }

  /* tag which media bug produced the result, when known */
  if (bugname) switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "media-bugname", bugname);
  switch_event_fire(&event);
}
|
||||
|
||||
/* Media-bug lifecycle callback: dispatches INIT/CLOSE/READ events.
   READ frames are forwarded to cobalt_speech_frame; CLOSE tears down the
   per-channel speech session. */
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
{
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);

  if (SWITCH_ABC_TYPE_INIT == type) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Got SWITCH_ABC_TYPE_INIT.\n");
  }
  else if (SWITCH_ABC_TYPE_CLOSE == type) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Got SWITCH_ABC_TYPE_CLOSE, calling cobalt_speech_session_cleanup.\n");
    cobalt_speech_session_cleanup(session, 1, bug);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Finished SWITCH_ABC_TYPE_CLOSE.\n");
  }
  else if (SWITCH_ABC_TYPE_READ == type) {
    return cobalt_speech_frame(bug, user_data);
  }
  /* SWITCH_ABC_TYPE_WRITE and all other events are ignored */

  return SWITCH_TRUE;
}
|
||||
|
||||
/* Stop any transcription bug registered on the channel under bugname.
   Succeeds (returns SWITCH_STATUS_SUCCESS) when no bug is present. */
static switch_status_t do_stop(switch_core_session_t *session, char *bugname)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = switch_channel_get_private(channel, bugname);
  switch_status_t status;

  if (!bug) {
    return SWITCH_STATUS_SUCCESS;
  }

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Received user command command, calling cobalt_speech_session_cleanup (possibly to stop prev transcribe)\n");
  status = cobalt_speech_session_cleanup(session, 0, bug);
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "stopped transcription.\n");

  return status;
}
|
||||
|
||||
/* Begin transcription on a channel: initializes the cobalt speech session,
 * attaches a media bug whose READ callback streams audio, and stores the
 * bug as channel private data under bugname so do_stop can find it later. */
static switch_status_t start_capture(switch_core_session_t *session, switch_media_bug_flag_t flags,
  char* hostport, char* model, int interim, char* bugname)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug;
  switch_status_t status;
  switch_codec_implementation_t read_impl = { 0 };
  void *pUserData;
  uint32_t samples_per_second;

  /* tear down any bug left over from a previous transcribe on this channel */
  if (switch_channel_get_private(channel, bugname)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing bug from previous transcribe\n");
    do_stop(session, bugname);
  }

  switch_core_session_get_read_impl(session, &read_impl);

  /* media must be flowing before a bug can read audio */
  if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
    return SWITCH_STATUS_FALSE;
  }

  /* g722 advertises 8k but actually carries 16k samples, so use the actual rate */
  samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;

  if (SWITCH_STATUS_FALSE == cobalt_speech_session_init(session, responseHandler, hostport, samples_per_second, flags & SMBF_STEREO ? 2 : 1, model, interim, bugname, &pUserData)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing cobalt speech session.\n");
    return SWITCH_STATUS_FALSE;
  }

  /* NOTE(review): if bug_add fails here, the speech session created above is
     never cleaned up — possible resource leak; confirm whether
     cobalt_speech_session_cleanup can be safely invoked on this path. */
  if ((status = switch_core_media_bug_add(session, bugname, NULL, capture_callback, pUserData, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
    return status;
  }

  /* remember the bug so a later stop command can locate it */
  switch_channel_set_private(channel, bugname, bug);

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
#define TRANSCRIBE_API_SYNTAX "<uuid> hostport [start|stop] [model] [interim|full] [stereo|mono] [bug-name]"
|
||||
SWITCH_STANDARD_API(transcribe_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[7] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
switch_media_bug_flag_t flags = SMBF_READ_STREAM;
|
||||
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) ||
|
||||
(!strcasecmp(argv[1], "stop") && argc < 2) ||
|
||||
(!strcasecmp(argv[1], "start") && argc < 3) ||
|
||||
zstr(argv[0])) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
if (!strcasecmp(argv[1], "stop")) {
|
||||
char *bugname = argc > 2 ? argv[2] : MY_BUG_NAME;
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "stop transcribing (bug=%s)\n", bugname);
|
||||
status = do_stop(lsession, bugname);
|
||||
}
|
||||
else if (!strcasecmp(argv[1], "start")) {
|
||||
char* hostport = argv[2];
|
||||
char* model = argv[3];
|
||||
int interim = argc > 4 && !strcmp(argv[4], "interim");
|
||||
char *bugname = argc > 6 ? argv[6] : MY_BUG_NAME;
|
||||
if (argc > 5 && !strcmp(argv[5], "stereo")) {
|
||||
flags |= SMBF_WRITE_STREAM ;
|
||||
flags |= SMBF_STEREO;
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(bug=%s) (hostport=%s) start transcribing %s %s\n", bugname, hostport, model, interim ? "interim": "complete");
|
||||
status = start_capture(lsession, flags, hostport, model, interim, bugname);
|
||||
}
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
#define TRANSCRIBE_API_MODELS_SYNTAX "<uuid> hostport"
|
||||
SWITCH_STANDARD_API(list_models_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[2] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) || argc < 2) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s\n", cmd);
|
||||
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_MODELS_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
char* hostport = argv[1];
|
||||
status = cobalt_speech_list_models(lsession, hostport);
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
#define TRANSCRIBE_API_VERSION_SYNTAX "<uuid> hostport"
|
||||
SWITCH_STANDARD_API(version_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[2] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) || argc < 2) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s\n", cmd);
|
||||
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_VERSION_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
char* hostport = argv[1];
|
||||
status = cobalt_speech_get_version(lsession, hostport);
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
#define TRANSCRIBE_API_COMPILE_CONTEXT_SYNTAX "<uuid> hostport model token phrases"
|
||||
SWITCH_STANDARD_API(compile_context_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[5] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) || argc < 5) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s\n", cmd);
|
||||
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_COMPILE_CONTEXT_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
char* hostport = argv[1];
|
||||
status = cobalt_speech_compile_context(lsession, hostport, argv[2], argv[3], argv[4]);
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_transcribe_load)
|
||||
{
|
||||
switch_api_interface_t *api_interface;
|
||||
|
||||
/* create/register custom event message type */
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_RESULTS) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_RESULTS);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
switch_event_reserve_subclass(TRANSCRIBE_EVENT_ERROR);
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_VAD_DETECTED) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_VAD_DETECTED);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_VERSION_RESPONSE) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_VERSION_RESPONSE);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_MODEL_LIST_RESPONSE) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_MODEL_LIST_RESPONSE);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_COMPILE_CONTEXT_RESPONSE) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_COMPILE_CONTEXT_RESPONSE);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
|
||||
/* connect my internal structure to the blank pointer passed to me */
|
||||
*module_interface = switch_loadable_module_create_module_interface(pool, modname);
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Soniox Speech Transcription API loading..\n");
|
||||
|
||||
if (SWITCH_STATUS_FALSE == cobalt_speech_init()) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Failed initializing cobalt speech interface\n");
|
||||
}
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Soniox Speech Transcription API successfully loaded\n");
|
||||
|
||||
SWITCH_ADD_API(api_interface, "uuid_cobalt_transcribe", "Soniox Speech Transcription API", transcribe_function, TRANSCRIBE_API_SYNTAX);
|
||||
switch_console_set_complete("add uuid_cobalt_transcribe hostport start model");
|
||||
switch_console_set_complete("add uuid_cobalt_transcribe hostport stop ");
|
||||
|
||||
SWITCH_ADD_API(api_interface, "uuid_cobalt_list_models", "Soniox Speech Transcription API", list_models_function, TRANSCRIBE_API_MODELS_SYNTAX);
|
||||
switch_console_set_complete("add uuid_cobalt_list_models hostport");
|
||||
|
||||
SWITCH_ADD_API(api_interface, "uuid_cobalt_compile_context", "Soniox Speech Transcription API", compile_context_function, TRANSCRIBE_API_COMPILE_CONTEXT_SYNTAX);
|
||||
switch_console_set_complete("add uuid_cobalt_compile_context hostport token phrases");
|
||||
|
||||
SWITCH_ADD_API(api_interface, "uuid_cobalt_get_version", "Soniox Speech Transcription API", version_function, TRANSCRIBE_API_VERSION_SYNTAX);
|
||||
switch_console_set_complete("add uuid_cobalt_get_version hostport");
|
||||
|
||||
/* indicate that the module should continue to be loaded */
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Called when the system shuts down
|
||||
Macro expands to: switch_status_t mod_cobalt_transcribe_shutdown() */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_transcribe_shutdown)
|
||||
{
|
||||
cobalt_speech_cleanup();
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_VAD_DETECTED);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_VERSION_RESPONSE);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_MODEL_LIST_RESPONSE);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_COMPILE_CONTEXT_RESPONSE);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
39
mod_cobalt_transcribe/mod_cobalt_transcribe.h
Normal file
39
mod_cobalt_transcribe/mod_cobalt_transcribe.h
Normal file
@@ -0,0 +1,39 @@
|
||||
#ifndef __MOD_COBALT_TRANSCRIBE_H__
#define __MOD_COBALT_TRANSCRIBE_H__

#include <switch.h>
#include <speex/speex_resampler.h>

#include <unistd.h>

/* sizes of the fixed buffers in struct cap_cb */
#define MAX_SESSION_ID (256)
#define MAX_BUG_LEN (64)
/* default media-bug name used when the API caller does not supply one */
#define MY_BUG_NAME "cobalt_speech"
/* custom event subclasses fired by this module */
#define TRANSCRIBE_EVENT_RESULTS "cobalt_speech::transcription"
/* NOTE(review): uses the jambonz-wide error subclass rather than a
   cobalt_speech one — presumably intentional for cross-module error
   handling; confirm */
#define TRANSCRIBE_EVENT_ERROR "jambonz::error"
#define TRANSCRIBE_EVENT_VAD_DETECTED "cobalt_speech::vad_detected"
#define TRANSCRIBE_EVENT_MODEL_LIST_RESPONSE "cobalt_speech::model_list_response"
#define TRANSCRIBE_EVENT_VERSION_RESPONSE "cobalt_speech::version_response"
#define TRANSCRIBE_EVENT_COMPILE_CONTEXT_RESPONSE "cobalt_speech::compile_context_response"


/* per-channel data */
/* callback invoked by the glue layer when the recognizer produces a result;
   json is either a result payload or one of the sentinel strings
   "vad_detected" / "error" */
typedef void (*responseHandler_t)(switch_core_session_t* session,
  const char* json, const char* bugname,
  const char* details);

/* per-channel capture state, installed as the media bug's user_data */
struct cap_cb {
  switch_mutex_t *mutex;              /* serializes media-bug reads against session cleanup */
  char bugname[MAX_BUG_LEN+1];
  char sessionId[MAX_SESSION_ID+1];
  char *base;                         /* NOTE(review): purpose not visible in this file — confirm */
  SpeexResamplerState *resampler;     /* non-NULL when channel rate must be converted for the recognizer */
  void* streamer;                     /* opaque GStreamer* owned by the glue layer */
  responseHandler_t responseHandler;
  switch_thread_t* thread;            /* NOTE(review): usage not visible in this file — confirm */
  int end_of_utterance;               /* once set, audio frames are no longer forwarded */
  switch_vad_t * vad;                 /* non-NULL when connect-on-speech (VAD gating) is enabled */
  uint32_t samples_per_second;
};

#endif
|
||||
51
mod_cobalt_transcribe/simple_buffer.h
Normal file
51
mod_cobalt_transcribe/simple_buffer.h
Normal file
@@ -0,0 +1,51 @@
|
||||
/**
|
||||
* (very) simple and limited circular buffer,
|
||||
* supporting only the use case of doing all of the adds
|
||||
 * and then subsequently retrieves.
|
||||
*
|
||||
*/
|
||||
class SimpleBuffer {
public:
  /**
   * Ring buffer of numChunks fixed-size chunks.  Intended use: perform all
   * add() calls first, then drain with getNextChunk().
   * NOTE(review): copy/assignment are not disabled, so copying an instance
   * would double-delete m_pData — avoid copying; confirm whether callers do.
   */
  SimpleBuffer(uint32_t chunkSize, uint32_t numChunks) : numItems(0),
    m_chunkSize(chunkSize), m_numChunks(numChunks) {
    /* init list reordered to match declaration order (silences -Wreorder) */
    m_pData = new char[chunkSize * numChunks];
    m_pNextWrite = m_pData;
  }
  ~SimpleBuffer() {
    delete [] m_pData;
  }

  /* Append datalen bytes; datalen must be a whole number of chunks or the
     call is silently ignored.  Oldest chunks are overwritten once full. */
  void add(void *data, uint32_t datalen) {
    if (datalen % m_chunkSize != 0) return;
    int numChunks = datalen / m_chunkSize;
    for (int i = 0; i < numChunks; i++) {
      memcpy(m_pNextWrite, data, m_chunkSize);
      data = static_cast<char*>(data) + m_chunkSize;
      if (numItems < m_numChunks) numItems++;

      uint32_t offset = (m_pNextWrite - m_pData) / m_chunkSize;
      if (offset >= m_numChunks - 1) m_pNextWrite = m_pData;
      else m_pNextWrite += m_chunkSize;
    }
  }

  /* Return the next chunk, or nullptr when empty.
     Fixed: the original `if (numItems--)` post-decremented the unsigned
     counter even when it was already 0, underflowing to UINT32_MAX and
     corrupting getNumItems()/subsequent reads. */
  char* getNextChunk() {
    if (numItems == 0) return nullptr;
    numItems--;
    char *p = m_pNextWrite;
    uint32_t offset = (m_pNextWrite - m_pData) / m_chunkSize;
    if (offset >= m_numChunks - 1) m_pNextWrite = m_pData;
    else m_pNextWrite += m_chunkSize;
    return p;
  }

  uint32_t getNumItems() { return numItems;}

private:
  char *m_pData;          /* backing storage: m_chunkSize * m_numChunks bytes */
  uint32_t numItems;      /* chunks currently held (saturates at m_numChunks) */
  uint32_t m_chunkSize;
  uint32_t m_numChunks;
  char* m_pNextWrite;     /* next write position; also the read cursor when draining */
};
|
||||
8
mod_deepgram_transcribe/LICENSE
Normal file
8
mod_deepgram_transcribe/LICENSE
Normal file
@@ -0,0 +1,8 @@
|
||||
Copyright 2023, Drachtio Communications Services, LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
9
mod_deepgram_transcribe/Makefile.am
Normal file
9
mod_deepgram_transcribe/Makefile.am
Normal file
@@ -0,0 +1,9 @@
|
||||
include $(top_srcdir)/build/modmake.rulesam
|
||||
MODNAME=mod_deepgram_transcribe
|
||||
|
||||
mod_LTLIBRARIES = mod_deepgram_transcribe.la
|
||||
mod_deepgram_transcribe_la_SOURCES = mod_deepgram_transcribe.c dg_transcribe_glue.cpp audio_pipe.cpp parser.cpp
|
||||
mod_deepgram_transcribe_la_CFLAGS = $(AM_CFLAGS)
|
||||
mod_deepgram_transcribe_la_CXXFLAGS = $(AM_CXXFLAGS) -std=c++11
|
||||
mod_deepgram_transcribe_la_LIBADD = $(switch_builddir)/libfreeswitch.la
|
||||
mod_deepgram_transcribe_la_LDFLAGS = -avoid-version -module -no-undefined -shared `pkg-config --libs libwebsockets`
|
||||
96
mod_deepgram_transcribe/README.md
Normal file
96
mod_deepgram_transcribe/README.md
Normal file
@@ -0,0 +1,96 @@
|
||||
# mod_deepgram_transcribe
|
||||
|
||||
A Freeswitch module that generates real-time transcriptions on a Freeswitch channel by using Deepgram's streaming transcription API
|
||||
|
||||
## API
|
||||
|
||||
### Commands
|
||||
The freeswitch module exposes the following API commands:
|
||||
|
||||
```
|
||||
uuid_deepgram_transcribe <uuid> start <lang-code> [interim]
|
||||
```
|
||||
Attaches media bug to channel and performs streaming recognize request.
|
||||
- `uuid` - unique identifier of Freeswitch channel
|
||||
- `lang-code` - a valid Deepgram [language code](https://developers.deepgram.com/documentation/features/language/) that is supported for streaming transcription
|
||||
- `interim` - If the 'interim' keyword is present then both interim and final transcription results will be returned; otherwise only final transcriptions will be returned
|
||||
|
||||
```
|
||||
uuid_deepgram_transcribe <uuid> stop
|
||||
```
|
||||
Stop transcription on the channel.
|
||||
|
||||
### Channel Variables
|
||||
|
||||
| variable | Description |
|
||||
| --- | ----------- |
|
||||
| DEEPGRAM_API_KEY | Deepgram API key used to authenticate |
|
||||
| DEEPGRAM_SPEECH_TIER | https://developers.deepgram.com/documentation/features/tier/ |
|
||||
| DEEPGRAM_SPEECH_CUSTOM_MODEL | custom model id |
|
||||
| DEEPGRAM_SPEECH_MODEL | https://developers.deepgram.com/documentation/features/model/ |
|
||||
| DEEPGRAM_SPEECH_MODEL_VERSION | https://developers.deepgram.com/documentation/features/version/ |
|
||||
| DEEPGRAM_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION | https://developers.deepgram.com/documentation/features/punctuate/ |
|
||||
| DEEPGRAM_SPEECH_PROFANITY_FILTER | https://developers.deepgram.com/documentation/features/profanity-filter/ |
|
||||
| DEEPGRAM_SPEECH_REDACT | https://developers.deepgram.com/documentation/features/redact/ |
|
||||
| DEEPGRAM_SPEECH_DIARIZE | https://developers.deepgram.com/documentation/features/diarize/ |
|
||||
| DEEPGRAM_SPEECH_DIARIZE_VERSION | https://developers.deepgram.com/documentation/features/diarize/ |
|
||||
| DEEPGRAM_SPEECH_NER | https://developers.deepgram.com/documentation/features/named-entity-recognition/ |
|
||||
| DEEPGRAM_SPEECH_ALTERNATIVES | number of alternative hypotheses to return (default: 1) |
|
||||
| DEEPGRAM_SPEECH_NUMERALS | https://developers.deepgram.com/documentation/features/numerals/ |
|
||||
| DEEPGRAM_SPEECH_SEARCH | https://developers.deepgram.com/documentation/features/search/ |
|
||||
| DEEPGRAM_SPEECH_KEYWORDS | https://developers.deepgram.com/documentation/features/keywords/ |
|
||||
| DEEPGRAM_SPEECH_REPLACE | https://developers.deepgram.com/documentation/features/replace/ |
|
||||
| DEEPGRAM_SPEECH_TAG | https://developers.deepgram.com/documentation/features/tag/ |
|
||||
| DEEPGRAM_SPEECH_ENDPOINTING | https://developers.deepgram.com/documentation/features/endpointing/ |
|
||||
| DEEPGRAM_SPEECH_VAD_TURNOFF | https://developers.deepgram.com/documentation/features/voice-activity-detection/ |
|
||||
|
||||
|
||||
### Events
|
||||
`deepgram_transcribe::transcription` - returns an interim or final transcription. The event contains a JSON body describing the transcription result:
|
||||
```js
|
||||
{
|
||||
"channel_index": [0, 1],
|
||||
"duration": 4.59,
|
||||
"start": 0.0,
|
||||
"is_final": true,
|
||||
"speech_final": true,
|
||||
"channel": {
|
||||
"alternatives": [{
|
||||
"transcript": "hello hello hello",
|
||||
"confidence": 0.98583984,
|
||||
"words": [{
|
||||
"word": "hello",
|
||||
"start": 3.0865219,
|
||||
"end": 3.206,
|
||||
"confidence": 0.99902344
|
||||
}, {
|
||||
"word": "hello",
|
||||
"start": 3.5644348,
|
||||
"end": 3.644087,
|
||||
"confidence": 0.9741211
|
||||
}, {
|
||||
"word": "hello",
|
||||
"start": 4.042348,
|
||||
"end": 4.3609567,
|
||||
"confidence": 0.98583984
|
||||
}]
|
||||
}]
|
||||
},
|
||||
"metadata": {
|
||||
"request_id": "37835678-5d3b-4c77-910e-f8914c882cec",
|
||||
"model_info": {
|
||||
"name": "conversationalai",
|
||||
"version": "2021-11-10.1",
|
||||
"tier": "base"
|
||||
},
|
||||
"model_uuid": "6b28e919-8427-4f32-9847-492e2efd7daf"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Usage
|
||||
When using [drachtio-fsrmf](https://www.npmjs.com/package/drachtio-fsmrf), you can access this API command via the api method on the 'endpoint' object.
|
||||
```js
|
||||
ep.api('uuid_deepgram_transcribe', `${ep.uuid} start en-US interim`);
|
||||
```
|
||||
|
||||
512
mod_deepgram_transcribe/audio_pipe.cpp
Normal file
512
mod_deepgram_transcribe/audio_pipe.cpp
Normal file
@@ -0,0 +1,512 @@
|
||||
#include "audio_pipe.hpp"
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
/* discard incoming text messages over the socket that are longer than this */
|
||||
#define MAX_RECV_BUF_SIZE (65 * 1024 * 10)
|
||||
#define RECV_BUF_REALLOC_SIZE (8 * 1024)
|
||||
|
||||
using namespace deepgram;
|
||||
|
||||
namespace {
  /* TCP keepalive interval for the websocket connection, overridable via
     environment variable; defaults to 55 seconds.
     NOTE(review): the variable name says MOD_AUDIO_FORK — presumably carried
     over from mod_audio_fork so the modules share one setting; confirm. */
  static const char *requestedTcpKeepaliveSecs = std::getenv("MOD_AUDIO_FORK_TCP_KEEPALIVE_SECS");
  static int nTcpKeepaliveSecs = requestedTcpKeepaliveSecs ? ::atoi(requestedTcpKeepaliveSecs) : 55;
}
|
||||
|
||||
/* Build the Deepgram authorization header value ("Token <apiKey>") into buf.
   Returns 0 on success, 1 when buf (capacity len) cannot hold the result
   plus its NUL terminator. */
static int dch_lws_http_basic_auth_gen(const char *apiKey, char *buf, size_t len) {
  size_t keyLen = strlen(apiKey);

  /* 6 bytes for "Token ", keyLen for the key, 1 for the terminator */
  if (len < keyLen + 7)
    return 1;

  snprintf(buf, len, "Token %s", apiKey);
  return 0;
}
|
||||
|
||||
int AudioPipe::lws_callback(struct lws *wsi,
|
||||
enum lws_callback_reasons reason,
|
||||
void *user, void *in, size_t len) {
|
||||
|
||||
struct AudioPipe::lws_per_vhost_data *vhd =
|
||||
(struct AudioPipe::lws_per_vhost_data *) lws_protocol_vh_priv_get(lws_get_vhost(wsi), lws_get_protocol(wsi));
|
||||
|
||||
struct lws_vhost* vhost = lws_get_vhost(wsi);
|
||||
AudioPipe ** ppAp = (AudioPipe **) user;
|
||||
|
||||
switch (reason) {
|
||||
case LWS_CALLBACK_PROTOCOL_INIT:
|
||||
vhd = (struct AudioPipe::lws_per_vhost_data *) lws_protocol_vh_priv_zalloc(lws_get_vhost(wsi), lws_get_protocol(wsi), sizeof(struct AudioPipe::lws_per_vhost_data));
|
||||
vhd->context = lws_get_context(wsi);
|
||||
vhd->protocol = lws_get_protocol(wsi);
|
||||
vhd->vhost = lws_get_vhost(wsi);
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_APPEND_HANDSHAKE_HEADER:
|
||||
{
|
||||
AudioPipe* ap = findPendingConnect(wsi);
|
||||
if (ap) {
|
||||
std::string apiKey = ap->getApiKey();
|
||||
unsigned char **p = (unsigned char **)in, *end = (*p) + len;
|
||||
char b[256];
|
||||
memset(b, 0, sizeof(b));
|
||||
strcpy(b,"Token ");
|
||||
strcpy(b + 6, apiKey.c_str());
|
||||
|
||||
if (lws_add_http_header_by_token(wsi, WSI_TOKEN_HTTP_AUTHORIZATION, (unsigned char *)b, strlen(b), p, end)) return -1;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_EVENT_WAIT_CANCELLED:
|
||||
processPendingConnects(vhd);
|
||||
processPendingDisconnects(vhd);
|
||||
processPendingWrites();
|
||||
break;
|
||||
case LWS_CALLBACK_CLIENT_CONNECTION_ERROR:
|
||||
{
|
||||
AudioPipe* ap = findAndRemovePendingConnect(wsi);
|
||||
int rc = lws_http_client_http_response(wsi);
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CONNECTION_ERROR: %s, response status %d\n", in ? (char *)in : "(null)", rc);
|
||||
if (ap) {
|
||||
ap->m_state = LWS_CLIENT_FAILED;
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECT_FAIL, (char *) in, ap->isFinished());
|
||||
}
|
||||
else {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CONNECTION_ERROR unable to find wsi %p..\n", wsi);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_ESTABLISHED:
|
||||
{
|
||||
AudioPipe* ap = findAndRemovePendingConnect(wsi);
|
||||
if (ap) {
|
||||
*ppAp = ap;
|
||||
ap->m_vhd = vhd;
|
||||
ap->m_state = LWS_CLIENT_CONNECTED;
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECT_SUCCESS, NULL, ap->isFinished());
|
||||
}
|
||||
else {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_ESTABLISHED %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case LWS_CALLBACK_CLIENT_CLOSED:
|
||||
{
|
||||
AudioPipe* ap = *ppAp;
|
||||
if (!ap) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CLOSED %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
return 0;
|
||||
}
|
||||
if (ap->m_state == LWS_CLIENT_DISCONNECTING) {
|
||||
// closed by us
|
||||
|
||||
lwsl_debug("%s socket closed by us\n", ap->m_uuid.c_str());
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECTION_CLOSED_GRACEFULLY, NULL, ap->isFinished());
|
||||
}
|
||||
else if (ap->m_state == LWS_CLIENT_CONNECTED) {
|
||||
// closed by far end
|
||||
lwsl_info("%s socket closed by far end\n", ap->m_uuid.c_str());
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECTION_DROPPED, NULL, ap->isFinished());
|
||||
}
|
||||
ap->m_state = LWS_CLIENT_DISCONNECTED;
|
||||
ap->setClosed();
|
||||
|
||||
//NB: after receiving any of the events above, any holder of a
|
||||
//pointer or reference to this object must treat is as no longer valid
|
||||
|
||||
//*ppAp = NULL;
|
||||
//delete ap;
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_RECEIVE:
|
||||
{
|
||||
AudioPipe* ap = *ppAp;
|
||||
if (!ap) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (lws_frame_is_binary(wsi)) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE received binary frame, discarding.\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (lws_is_first_fragment(wsi)) {
|
||||
// allocate a buffer for the entire chunk of memory needed
|
||||
assert(nullptr == ap->m_recv_buf);
|
||||
ap->m_recv_buf_len = len + lws_remaining_packet_payload(wsi);
|
||||
ap->m_recv_buf = (uint8_t*) malloc(ap->m_recv_buf_len);
|
||||
ap->m_recv_buf_ptr = ap->m_recv_buf;
|
||||
}
|
||||
|
||||
size_t write_offset = ap->m_recv_buf_ptr - ap->m_recv_buf;
|
||||
size_t remaining_space = ap->m_recv_buf_len - write_offset;
|
||||
if (remaining_space < len) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE buffer realloc needed.\n");
|
||||
size_t newlen = ap->m_recv_buf_len + RECV_BUF_REALLOC_SIZE;
|
||||
if (newlen > MAX_RECV_BUF_SIZE) {
|
||||
free(ap->m_recv_buf);
|
||||
ap->m_recv_buf = ap->m_recv_buf_ptr = nullptr;
|
||||
ap->m_recv_buf_len = 0;
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE max buffer exceeded, truncating message.\n");
|
||||
}
|
||||
else {
|
||||
ap->m_recv_buf = (uint8_t*) realloc(ap->m_recv_buf, newlen);
|
||||
if (nullptr != ap->m_recv_buf) {
|
||||
ap->m_recv_buf_len = newlen;
|
||||
ap->m_recv_buf_ptr = ap->m_recv_buf + write_offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nullptr != ap->m_recv_buf) {
|
||||
if (len > 0) {
|
||||
memcpy(ap->m_recv_buf_ptr, in, len);
|
||||
ap->m_recv_buf_ptr += len;
|
||||
}
|
||||
if (lws_is_final_fragment(wsi)) {
|
||||
if (nullptr != ap->m_recv_buf) {
|
||||
std::string msg((char *)ap->m_recv_buf, ap->m_recv_buf_ptr - ap->m_recv_buf);
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::MESSAGE, msg.c_str(), ap->isFinished());
|
||||
if (nullptr != ap->m_recv_buf) free(ap->m_recv_buf);
|
||||
}
|
||||
ap->m_recv_buf = ap->m_recv_buf_ptr = nullptr;
|
||||
ap->m_recv_buf_len = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_WRITEABLE:
|
||||
{
|
||||
AudioPipe* ap = *ppAp;
|
||||
if (!ap) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_WRITEABLE %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// check for text frames to send
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(ap->m_text_mutex);
|
||||
if (ap->m_metadata.length() > 0) {
|
||||
uint8_t buf[ap->m_metadata.length() + LWS_PRE];
|
||||
memcpy(buf + LWS_PRE, ap->m_metadata.c_str(), ap->m_metadata.length());
|
||||
int n = ap->m_metadata.length();
|
||||
int m = lws_write(wsi, buf + LWS_PRE, n, LWS_WRITE_TEXT);
|
||||
ap->m_metadata.clear();
|
||||
if (m < n) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// there may be audio data, but only one write per writeable event
|
||||
// get it next time
|
||||
lws_callback_on_writable(wsi);
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (ap->m_state == LWS_CLIENT_DISCONNECTING) {
|
||||
lws_close_reason(wsi, LWS_CLOSE_STATUS_NORMAL, NULL, 0);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// check for audio packets
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(ap->m_audio_mutex);
|
||||
if (ap->m_audio_buffer_write_offset > LWS_PRE) {
|
||||
size_t datalen = ap->m_audio_buffer_write_offset - LWS_PRE;
|
||||
int sent = lws_write(wsi, (unsigned char *) ap->m_audio_buffer + LWS_PRE, datalen, LWS_WRITE_BINARY);
|
||||
if (sent < datalen) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_WRITEABLE %s attemped to send %lu only sent %d wsi %p..\n",
|
||||
ap->m_uuid.c_str(), datalen, sent, wsi);
|
||||
}
|
||||
ap->m_audio_buffer_write_offset = LWS_PRE;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return lws_callback_http_dummy(wsi, reason, user, in, len);
|
||||
}
|
||||
|
||||
|
||||
// static members
|
||||
// lws retry / keep-validity policy: no automatic reconnect schedule, and the
// ping/hangup validity intervals are set to UINT16_MAX so lws effectively
// never declares the connection invalid on its own.
static const lws_retry_bo_t retry = {
  nullptr,     // retry_ms_table
  0,           // retry_ms_table_count
  0,           // conceal_count
  UINT16_MAX,  // secs_since_valid_ping
  UINT16_MAX,  // secs_since_valid_hangup
  0            // jitter_percent
};
|
||||
|
||||
// static member definitions (declared in audio_pipe.hpp); there is a single
// lws context serviced by one thread (multi-thread support was removed)
struct lws_context *AudioPipe::context = nullptr;
std::string AudioPipe::protocolName;
std::mutex AudioPipe::mutex_connects;     // guards pendingConnects
std::mutex AudioPipe::mutex_disconnects;  // guards pendingDisconnects
std::mutex AudioPipe::mutex_writes;       // guards pendingWrites
std::list<AudioPipe*> AudioPipe::pendingConnects;
std::list<AudioPipe*> AudioPipe::pendingDisconnects;
std::list<AudioPipe*> AudioPipe::pendingWrites;
AudioPipe::log_emit_function AudioPipe::logger;
std::mutex AudioPipe::mapMutex;
// NOTE(review): stopFlag is written by deinitialize() and read by the service
// thread without atomics — looks like a benign data race but consider
// std::atomic<bool>; confirm before relying on it.
bool AudioPipe::stopFlag;
|
||||
|
||||
// Move every IDLE entry on the pending-connect list into the CONNECTING state
// and initiate its websocket connection.  The snapshot is taken under the
// lock, but connect_client runs outside it so lws work never holds the mutex.
void AudioPipe::processPendingConnects(lws_per_vhost_data *vhd) {
  std::list<AudioPipe*> ready;
  {
    std::lock_guard<std::mutex> guard(mutex_connects);
    for (AudioPipe* pipe : pendingConnects) {
      if (pipe->m_state == LWS_CLIENT_IDLE) {
        pipe->m_state = LWS_CLIENT_CONNECTING;
        ready.push_back(pipe);
      }
    }
  }
  for (AudioPipe* pipe : ready) {
    pipe->connect_client(vhd);
  }
}
|
||||
|
||||
// Drain the pending-disconnect list; each DISCONNECTING pipe is scheduled a
// writeable callback, where the actual close is performed.
void AudioPipe::processPendingDisconnects(lws_per_vhost_data *vhd) {
  std::list<AudioPipe*> closing;
  {
    std::lock_guard<std::mutex> guard(mutex_disconnects);
    for (AudioPipe* pipe : pendingDisconnects) {
      if (pipe->m_state == LWS_CLIENT_DISCONNECTING) closing.push_back(pipe);
    }
    pendingDisconnects.clear();
  }
  for (AudioPipe* pipe : closing) {
    lws_callback_on_writable(pipe->m_wsi);
  }
}
|
||||
|
||||
void AudioPipe::processPendingWrites() {
|
||||
std::list<AudioPipe*> writes;
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(mutex_writes);
|
||||
for (auto it = pendingWrites.begin(); it != pendingWrites.end(); ++it) {
|
||||
if ((*it)->m_state == LWS_CLIENT_CONNECTED) writes.push_back(*it);
|
||||
}
|
||||
pendingWrites.clear();
|
||||
}
|
||||
for (auto it = writes.begin(); it != writes.end(); ++it) {
|
||||
AudioPipe* ap = *it;
|
||||
lws_callback_on_writable(ap->m_wsi);
|
||||
}
|
||||
}
|
||||
|
||||
// Locate the CONNECTING pipe that owns this wsi and remove it from the
// pending-connect list.  Entries whose wsi was never assigned (a failed
// connect attempt) are purged along the way; the scan stops at the first
// match, exactly as before, so stale entries after the match are left alone.
AudioPipe* AudioPipe::findAndRemovePendingConnect(struct lws *wsi) {
  std::lock_guard<std::mutex> guard(mutex_connects);
  AudioPipe* match = NULL;
  std::list<AudioPipe*> stale;

  for (auto it = pendingConnects.begin(); it != pendingConnects.end() && !match; ++it) {
    AudioPipe* candidate = *it;
    if (candidate->m_wsi == nullptr) stale.push_back(candidate);
    if (candidate->m_state == LWS_CLIENT_CONNECTING && candidate->m_wsi == wsi) {
      match = candidate;
    }
  }

  for (AudioPipe* dead : stale) pendingConnects.remove(dead);
  if (match) pendingConnects.remove(match);

  return match;
}
|
||||
|
||||
// Non-destructive lookup: return the CONNECTING pipe that owns this wsi,
// or NULL if none is pending.
AudioPipe* AudioPipe::findPendingConnect(struct lws *wsi) {
  std::lock_guard<std::mutex> guard(mutex_connects);
  for (AudioPipe* candidate : pendingConnects) {
    if (candidate->m_state == LWS_CLIENT_CONNECTING && candidate->m_wsi == wsi) {
      return candidate;
    }
  }
  return NULL;
}
|
||||
|
||||
// Queue a pipe for connection and kick the lws event loop so the service
// thread picks it up promptly.
void AudioPipe::addPendingConnect(AudioPipe* ap) {
  {
    std::lock_guard<std::mutex> lock(mutex_connects);
    pendingConnects.push_back(ap);
    lwsl_debug("%s after adding connect there are %lu pending connects\n",
      ap->m_uuid.c_str(), pendingConnects.size());
  }
  // wake lws_service() out of its poll wait
  lws_cancel_service(context);
}
|
||||
// Mark a pipe for teardown and queue it; the actual close happens on the
// service thread during the next writeable callback.
void AudioPipe::addPendingDisconnect(AudioPipe* ap) {
  ap->m_state = LWS_CLIENT_DISCONNECTING;
  {
    std::lock_guard<std::mutex> lock(mutex_disconnects);
    pendingDisconnects.push_back(ap);
    lwsl_debug("%s after adding disconnect there are %lu pending disconnects\n",
      ap->m_uuid.c_str(), pendingDisconnects.size());
  }
  // wake lws_service() out of its poll wait
  lws_cancel_service(ap->m_vhd->context);
}
|
||||
// Queue a pipe that has buffered data to flush, then wake the event loop.
void AudioPipe::addPendingWrite(AudioPipe* ap) {
  {
    std::lock_guard<std::mutex> lock(mutex_writes);
    pendingWrites.push_back(ap);
  }
  lws_cancel_service(ap->m_vhd->context);
}
|
||||
|
||||
bool AudioPipe::lws_service_thread() {
|
||||
struct lws_context_creation_info info;
|
||||
std::thread::id this_id = std::this_thread::get_id();
|
||||
|
||||
const struct lws_protocols protocols[] = {
|
||||
{
|
||||
"",
|
||||
AudioPipe::lws_callback,
|
||||
sizeof(void *),
|
||||
1024,
|
||||
},
|
||||
{ NULL, NULL, 0, 0 }
|
||||
};
|
||||
|
||||
memset(&info, 0, sizeof info);
|
||||
info.port = CONTEXT_PORT_NO_LISTEN;
|
||||
info.options = LWS_SERVER_OPTION_DO_SSL_GLOBAL_INIT;
|
||||
info.protocols = protocols;
|
||||
info.ka_time = nTcpKeepaliveSecs; // tcp keep-alive timer
|
||||
info.ka_probes = 4; // number of times to try ka before closing connection
|
||||
info.ka_interval = 5; // time between ka's
|
||||
info.timeout_secs = 10; // doc says timeout for "various processes involving network roundtrips"
|
||||
info.keepalive_timeout = 5; // seconds to allow remote client to hold on to an idle HTTP/1.1 connection
|
||||
info.timeout_secs_ah_idle = 10; // secs to allow a client to hold an ah without using it
|
||||
info.retry_and_idle_policy = &retry;
|
||||
|
||||
lwsl_notice("AudioPipe::lws_service_thread creating context\n");
|
||||
|
||||
context = lws_create_context(&info);
|
||||
if (!context) {
|
||||
lwsl_err("AudioPipe::lws_service_thread failed creating context\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
int n;
|
||||
do {
|
||||
n = lws_service(context, 0);
|
||||
} while (n >= 0 && !stopFlag);
|
||||
|
||||
lwsl_notice("AudioPipe::lws_service_thread ending\n");
|
||||
lws_context_destroy(context);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void AudioPipe::initialize(int loglevel, log_emit_function logger) {
|
||||
|
||||
lws_set_log_level(loglevel, logger);
|
||||
|
||||
lwsl_notice("AudioPipe::initialize starting\n");
|
||||
std::lock_guard<std::mutex> lock(mapMutex);
|
||||
std::thread t(&AudioPipe::lws_service_thread);
|
||||
stopFlag = false;
|
||||
t.detach();
|
||||
}
|
||||
|
||||
bool AudioPipe::deinitialize() {
|
||||
lwsl_notice("AudioPipe::deinitialize\n");
|
||||
std::lock_guard<std::mutex> lock(mapMutex);
|
||||
stopFlag = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
// instance members
|
||||
// Construct an idle pipe; the websocket connection is established later via
// connect().  The audio buffer reserves LWS_PRE bytes of lws header space at
// the front, hence the initial write offset.
//
// Fix: m_recv_buf_len was missing from the initializer list and was read
// uninitialized on the first received frame (buffer-size arithmetic in the
// LWS_CALLBACK_CLIENT_RECEIVE path); it is now zero-initialized.
AudioPipe::AudioPipe(const char* uuid, const char* host, unsigned int port, const char* path,
  size_t bufLen, size_t minFreespace, const char* apiKey, notifyHandler_t callback) :
  m_uuid(uuid), m_host(host), m_port(port), m_path(path), m_finished(false),
  m_audio_buffer_min_freespace(minFreespace), m_audio_buffer_max_len(bufLen), m_gracefulShutdown(false),
  m_audio_buffer_write_offset(LWS_PRE), m_recv_buf(nullptr), m_recv_buf_ptr(nullptr), m_recv_buf_len(0),
  m_state(LWS_CLIENT_IDLE), m_wsi(nullptr), m_vhd(nullptr), m_apiKey(apiKey), m_callback(callback) {

  m_audio_buffer = new uint8_t[m_audio_buffer_max_len];
}
|
||||
AudioPipe::~AudioPipe() {
  // m_audio_buffer is allocated with new[] in the constructor
  delete [] m_audio_buffer;
  // Fix: m_recv_buf is allocated with malloc()/realloc() in the receive
  // callback, so it must be released with free(), not delete[] (mismatched
  // allocator/deallocator is undefined behavior).  free(NULL) is a no-op,
  // so no guard is needed.
  free(m_recv_buf);
}
|
||||
|
||||
// Begin connecting: connections are established on the lws service thread,
// so this just queues the pipe and wakes the event loop.
void AudioPipe::connect(void) {
  addPendingConnect(this);
}
|
||||
|
||||
bool AudioPipe::connect_client(struct lws_per_vhost_data *vhd) {
|
||||
assert(m_audio_buffer != nullptr);
|
||||
assert(m_vhd == nullptr);
|
||||
struct lws_client_connect_info i;
|
||||
|
||||
memset(&i, 0, sizeof(i));
|
||||
i.context = vhd->context;
|
||||
i.port = m_port;
|
||||
i.address = m_host.c_str();
|
||||
i.path = m_path.c_str();
|
||||
i.host = i.address;
|
||||
i.origin = i.address;
|
||||
i.ssl_connection = LCCSCF_USE_SSL;
|
||||
//i.protocol = protocolName.c_str();
|
||||
i.pwsi = &(m_wsi);
|
||||
|
||||
m_state = LWS_CLIENT_CONNECTING;
|
||||
m_vhd = vhd;
|
||||
|
||||
m_wsi = lws_client_connect_via_info(&i);
|
||||
lwsl_debug("%s attempting connection, wsi is %p\n", m_uuid.c_str(), m_wsi);
|
||||
|
||||
return nullptr != m_wsi;
|
||||
}
|
||||
|
||||
void AudioPipe::bufferForSending(const char* text) {
|
||||
if (m_state != LWS_CLIENT_CONNECTED) return;
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(m_text_mutex);
|
||||
m_metadata.append(text);
|
||||
}
|
||||
addPendingWrite(this);
|
||||
}
|
||||
|
||||
void AudioPipe::unlockAudioBuffer() {
|
||||
if (m_audio_buffer_write_offset > LWS_PRE) addPendingWrite(this);
|
||||
m_audio_mutex.unlock();
|
||||
}
|
||||
|
||||
void AudioPipe::close() {
|
||||
if (m_state != LWS_CLIENT_CONNECTED) return;
|
||||
addPendingDisconnect(this);
|
||||
}
|
||||
|
||||
void AudioPipe::finish() {
|
||||
if (m_finished || m_state != LWS_CLIENT_CONNECTED) return;
|
||||
m_finished = true;
|
||||
bufferForSending("{\"type\": \"CloseStream\"}");
|
||||
}
|
||||
|
||||
void AudioPipe::waitForClose() {
|
||||
std::shared_future<void> sf(m_promise.get_future());
|
||||
sf.wait();
|
||||
return;
|
||||
}
|
||||
145
mod_deepgram_transcribe/audio_pipe.hpp
Normal file
145
mod_deepgram_transcribe/audio_pipe.hpp
Normal file
@@ -0,0 +1,145 @@
|
||||
#ifndef __DG_AUDIO_PIPE_HPP__
#define __DG_AUDIO_PIPE_HPP__

#include <string>
#include <list>
#include <mutex>
#include <future>
#include <queue>
#include <unordered_map>
#include <thread>

#include <libwebsockets.h>

namespace deepgram {

// AudioPipe: a websocket client connection carrying streamed audio (binary
// frames) and metadata (text frames) to a transcription service.  All socket
// I/O happens on a single static lws service thread; application threads
// interact only through the pending-connect/disconnect/write queues and the
// audio buffer lock.  Events are delivered via the notifyHandler_t callback;
// after CONNECT_FAIL, CONNECTION_DROPPED, or CONNECTION_CLOSED_GRACEFULLY
// the callee must treat its AudioPipe pointer as invalid.
class AudioPipe {
public:
  // connection lifecycle states, driven by the lws callback
  enum LwsState_t {
    LWS_CLIENT_IDLE,
    LWS_CLIENT_CONNECTING,
    LWS_CLIENT_CONNECTED,
    LWS_CLIENT_FAILED,
    LWS_CLIENT_DISCONNECTING,
    LWS_CLIENT_DISCONNECTED
  };
  // events surfaced to the application callback
  enum NotifyEvent_t {
    CONNECT_SUCCESS,
    CONNECT_FAIL,
    CONNECTION_DROPPED,            // closed by the far end
    CONNECTION_CLOSED_GRACEFULLY,  // closed by us
    MESSAGE                        // complete text frame received
  };
  typedef void (*log_emit_function)(int level, const char *line);
  typedef void (*notifyHandler_t)(const char *sessionId, NotifyEvent_t event, const char* message, bool finished);

  // per-vhost bookkeeping handed to lws
  struct lws_per_vhost_data {
    struct lws_context *context;
    struct lws_vhost *vhost;
    const struct lws_protocols *protocol;
  };

  // process-wide setup/teardown of the single lws service thread
  static void initialize(int loglevel, log_emit_function logger);
  static bool deinitialize();
  static bool lws_service_thread();

  // constructor
  AudioPipe(const char* uuid, const char* host, unsigned int port, const char* path,
    size_t bufLen, size_t minFreespace, const char* apiKey, notifyHandler_t callback);
  ~AudioPipe();

  LwsState_t getLwsState(void) { return m_state; }
  std::string& getApiKey(void) {
    return m_apiKey;
  }
  // queue this pipe for connection on the service thread (async)
  void connect(void);
  // append a text frame for sending; no-op unless connected
  void bufferForSending(const char* text);
  // free space remaining in the audio buffer (bytes)
  size_t binarySpaceAvailable(void) {
    return m_audio_buffer_max_len - m_audio_buffer_write_offset;
  }
  size_t binaryMinSpace(void) {
    return m_audio_buffer_min_freespace;
  }
  // raw write cursor into the audio buffer; caller must hold lockAudioBuffer()
  char * binaryWritePtr(void) {
    return (char *) m_audio_buffer + m_audio_buffer_write_offset;
  }
  void binaryWritePtrAdd(size_t len) {
    m_audio_buffer_write_offset += len;
  }
  void binaryWritePtrResetToZero(void) {
    m_audio_buffer_write_offset = 0;
  }
  void lockAudioBuffer(void) {
    m_audio_mutex.lock();
  }
  // releases the lock; also schedules a flush if audio was appended
  void unlockAudioBuffer(void) ;

  // request an orderly close (async, via the service thread)
  void close() ;
  // send the CloseStream sentinel once; no-op unless connected
  void finish();
  // block until the socket close has completed (see setClosed)
  void waitForClose();
  void setClosed() { m_promise.set_value(); }
  bool isFinished() { return m_finished;}

  // no default constructor or copying
  AudioPipe() = delete;
  AudioPipe(const AudioPipe&) = delete;
  void operator=(const AudioPipe&) = delete;

private:

  static int lws_callback(struct lws *wsi, enum lws_callback_reasons reason, void *user, void *in, size_t len);
  // NOTE(review): nchild / numContexts appear to be leftovers from the
  // multi-thread design — confirm whether they are still defined/used.
  static unsigned int nchild;
  static struct lws_context *context;
  static unsigned int numContexts;
  static std::string protocolName;
  static std::mutex mutex_connects;     // guards pendingConnects
  static std::mutex mutex_disconnects;  // guards pendingDisconnects
  static std::mutex mutex_writes;       // guards pendingWrites
  static std::list<AudioPipe*> pendingConnects;
  static std::list<AudioPipe*> pendingDisconnects;
  static std::list<AudioPipe*> pendingWrites;
  static log_emit_function logger;

  static std::mutex mapMutex;
  static bool stopFlag;  // set by deinitialize(); read by the service loop

  static AudioPipe* findAndRemovePendingConnect(struct lws *wsi);
  static AudioPipe* findPendingConnect(struct lws *wsi);
  static void addPendingConnect(AudioPipe* ap);
  static void addPendingDisconnect(AudioPipe* ap);
  static void addPendingWrite(AudioPipe* ap);
  static void processPendingConnects(lws_per_vhost_data *vhd);
  static void processPendingDisconnects(lws_per_vhost_data *vhd);
  static void processPendingWrites(void);

  // runs on the service thread; initiates the lws client connection
  bool connect_client(struct lws_per_vhost_data *vhd);

  LwsState_t m_state;
  std::string m_uuid;
  std::string m_host;
  unsigned int m_port;
  std::string m_path;
  std::string m_metadata;        // queued outgoing text frames
  std::mutex m_text_mutex;       // guards m_metadata
  std::mutex m_audio_mutex;      // guards the audio buffer / write offset
  int m_sslFlags;
  struct lws *m_wsi;
  uint8_t *m_audio_buffer;       // new[]-allocated; first LWS_PRE bytes reserved for lws
  size_t m_audio_buffer_max_len;
  size_t m_audio_buffer_write_offset;
  size_t m_audio_buffer_min_freespace;
  uint8_t* m_recv_buf;           // malloc/realloc'd reassembly buffer for fragmented frames
  uint8_t* m_recv_buf_ptr;
  size_t m_recv_buf_len;
  struct lws_per_vhost_data* m_vhd;
  notifyHandler_t m_callback;
  log_emit_function m_logger;
  std::string m_apiKey;
  bool m_gracefulShutdown;
  bool m_finished;               // set once the CloseStream sentinel is queued
  std::string m_bugname;
  std::promise<void> m_promise;  // fulfilled when the socket close completes
};

} // namespace deepgram
#endif
|
||||
577
mod_deepgram_transcribe/dg_transcribe_glue.cpp
Normal file
577
mod_deepgram_transcribe/dg_transcribe_glue.cpp
Normal file
@@ -0,0 +1,577 @@
|
||||
#include <switch.h>
|
||||
#include <switch_json.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <list>
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <regex>
|
||||
#include <iostream>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "mod_deepgram_transcribe.h"
|
||||
#include "simple_buffer.h"
|
||||
#include "parser.hpp"
|
||||
#include "audio_pipe.hpp"
|
||||
|
||||
#define RTP_PACKETIZATION_PERIOD 20
|
||||
#define FRAME_SIZE_8000 320 /*which means each 20ms frame as 320 bytes at 8 khz (1 channel only)*/
|
||||
|
||||
namespace {
// true once a module-wide default API key has been configured
static bool hasDefaultCredentials = false;
static const char* defaultApiKey = nullptr;
// audio buffering window in seconds, clamped to [1, 5]
// (env MOD_AUDIO_FORK_BUFFER_SECS, default 2)
static const char *requestedBufferSecs = std::getenv("MOD_AUDIO_FORK_BUFFER_SECS");
static int nAudioBufferSecs = std::max(1, std::min(requestedBufferSecs ? ::atoi(requestedBufferSecs) : 2, 5));
// NOTE(review): appears unused now that multiple lws service threads were
// eliminated — confirm before removing
static const char *requestedNumServiceThreads = std::getenv("MOD_AUDIO_FORK_SERVICE_THREADS");
// monotonically increasing id assigned to each transcribe session
static unsigned int idxCallCount = 0;
static uint32_t playCount = 0;
|
||||
|
||||
/* deepgram model / tier defaults by language */
struct LanguageInfo {
  std::string tier;   // deepgram pricing/quality tier, e.g. "base", "enhanced", "nova"
  std::string model;  // deepgram model name, e.g. "general", "phonecall"
};

// Best default tier/model per BCP-47 language tag, used when the channel
// does not explicitly set DEEPGRAM_SPEECH_TIER / DEEPGRAM_SPEECH_MODEL.
static const std::unordered_map<std::string, LanguageInfo> languageLookupTable = {
  {"zh", {"base", "general"}},
  {"zh-CN", {"base", "general"}},
  {"zh-TW", {"base", "general"}},
  {"da", {"enhanced", "general"}},
  {"en", {"nova", "phonecall"}},
  {"en-US", {"nova", "phonecall"}},
  {"en-AU", {"nova", "general"}},
  {"en-GB", {"nova", "general"}},
  {"en-IN", {"nova", "general"}},
  {"en-NZ", {"nova", "general"}},
  {"nl", {"enhanced", "general"}},
  {"fr", {"enhanced", "general"}},
  {"fr-CA", {"base", "general"}},
  {"de", {"enhanced", "general"}},
  {"hi", {"enhanced", "general"}},
  {"hi-Latn", {"base", "general"}},
  {"id", {"base", "general"}},
  {"ja", {"enhanced", "general"}},
  {"ko", {"enhanced", "general"}},
  {"no", {"enhanced", "general"}},
  {"pl", {"enhanced", "general"}},
  {"pt", {"enhanced", "general"}},
  {"pt-BR", {"enhanced", "general"}},
  {"pt-PT", {"enhanced", "general"}},
  {"ru", {"base", "general"}},
  {"es", {"nova", "general"}},
  {"es-419", {"nova", "general"}},
  {"sv", {"enhanced", "general"}},
  {"ta", {"enhanced", "general"}},
  {"tr", {"base", "general"}},
  {"uk", {"base", "general"}}
};
|
||||
|
||||
static bool getLanguageInfo(const std::string& language, LanguageInfo& info) {
|
||||
auto it = languageLookupTable.find(language);
|
||||
if (it != languageLookupTable.end()) {
|
||||
info = it->second;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// deepgram result fragment representing an empty transcript; messages
// containing this substring are filtered out (see eventCallback)
static const char* emptyTranscript = "{\"alternatives\":[{\"transcript\":\"\",\"confidence\":0.0,\"words\":[]}]}";
|
||||
|
||||
// Take ownership of the session's AudioPipe and retire it on a detached
// thread (finish() + waitForClose() can block on the remote close).
//
// Fix: the detached lambda previously captured the raw tech_pvt pointer and
// dereferenced it after waitForClose() — by then the session (and tech_pvt)
// may already be destroyed.  The fields needed for logging are now copied by
// value before the thread is launched.
static void reaper(private_t *tech_pvt) {
  std::shared_ptr<deepgram::AudioPipe> pAp;
  pAp.reset((deepgram::AudioPipe *)tech_pvt->pAudioPipe);
  tech_pvt->pAudioPipe = nullptr;

  std::string sessionId(tech_pvt->sessionId);
  auto id = tech_pvt->id;

  std::thread t([pAp, sessionId, id]{
    pAp->finish();
    pAp->waitForClose();
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "%s (%u) got remote close\n", sessionId.c_str(), id);
  });
  t.detach();
}
|
||||
|
||||
// Release the per-session resources owned by tech_pvt (AudioPipe, resampler).
// Safe to call with NULL.
//
// Fix: the entry log line dereferenced tech_pvt->sessionId *before* the
// `if (tech_pvt)` null check — a null-pointer dereference whenever the guard
// would have mattered.  The guard now comes first.
static void destroy_tech_pvt(private_t *tech_pvt) {
  if (!tech_pvt) return;
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s (%u) destroy_tech_pvt\n", tech_pvt->sessionId, tech_pvt->id);
  if (tech_pvt->pAudioPipe) {
    deepgram::AudioPipe* p = (deepgram::AudioPipe *) tech_pvt->pAudioPipe;
    delete p;
    tech_pvt->pAudioPipe = nullptr;
  }
  if (tech_pvt->resampler) {
    speex_resampler_destroy(tech_pvt->resampler);
    tech_pvt->resampler = NULL;
  }

  /*
  if (tech_pvt->vad) {
    switch_vad_destroy(&tech_pvt->vad);
    tech_pvt->vad = nullptr;
  }
  */
}
|
||||
|
||||
// Percent-encode a string for use in a URL query component.  Characters
// matching the allow-list regex (alphanumerics plus !'()*+,-._~: — note the
// *-. range also passes '+' and ',') are emitted verbatim; everything else
// becomes %XX with uppercase hex.
//
// Fixes: (1) the previous "<< std::hex << (0xff & c)" dropped the leading
// zero for bytes < 0x10, so '\n' encoded as "%A" instead of "%0A";
// (2) replaced the non-standard C compound literal (std::string){c} with
// std::string(1, c).
std::string encodeURIComponent(std::string decoded)
{
  std::ostringstream oss;
  std::regex r("[!'\\(\\)*-.0-9A-Za-z_~:]");
  static const char hexDigits[] = "0123456789ABCDEF";

  for (char &c : decoded)
  {
    if (std::regex_match(std::string(1, c), r))
    {
      oss << c;
    }
    else
    {
      unsigned char uc = static_cast<unsigned char>(c);
      oss << '%' << hexDigits[uc >> 4] << hexDigits[uc & 0x0F];
    }
  }
  return oss.str();
}
|
||||
|
||||
// Build the deepgram /v1/listen websocket path from channel variables.
// Fills and returns `path`.
//
// Fix: the DEEPGRAM_SPEECH_MODEL_VERSION branch wrote "&version" with no
// '=', producing e.g. "&version2" instead of "&version=2".  Also wrapped the
// intentional assignments-in-condition in double parentheses.
std::string& constructPath(switch_core_session_t* session, std::string& path,
  int sampleRate, int channels, const char* language, int interim) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  const char *var ;
  const char *model = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_MODEL");
  const char *customModel = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_CUSTOM_MODEL");
  const char *tier = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_TIER") ;
  std::ostringstream oss;
  LanguageInfo info;

  oss << "/v1/listen?";

  if (!tier && !model && !customModel) {
    /* make best choice by language */
    if (getLanguageInfo(language, info)) {
      oss << "tier=" << info.tier << "&model=" << info.model;
    }
    else {
      oss << "tier=base&model=general"; // most widely supported, though not ideal
    }
  }
  else {
    if (tier) oss << "tier=" << tier;
    if (model) oss << "&model=" << model;
    if (customModel) oss << "&model=" << customModel;
  }

  if ((var = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_MODEL_VERSION"))) {
    oss << "&version=";   // was "&version" — missing '=' mangled the query string
    oss << var;
  }
  oss << "&language=";
  oss << language;

  if (channels == 2) {
    oss << "&multichannel=true";
    oss << "&channels=2";
  }

  if ((var = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_ENABLE_SMART_FORMAT"))) {
    oss << "&smart_format=true";
    oss << "&no_delay=true";
    /**
     * see: https://github.com/orgs/deepgram/discussions/384
     *
     */
  }
  if ((var = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION"))) {
    oss << "&punctuate=true";
  }
  if (switch_true(switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_PROFANITY_FILTER"))) {
    oss << "&profanity_filter=true";
  }
  if ((var = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_REDACT"))) {
    oss << "&redact=";
    oss << var;
  }
  if (switch_true(switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_DIARIZE"))) {
    oss << "&diarize=true";
    if ((var = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_DIARIZE_VERSION"))) {
      oss << "&diarize_version=";
      oss << var;
    }
  }
  if (switch_true(switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_NER"))) {
    oss << "&ner=true";
  }
  if ((var = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_ALTERNATIVES"))) {
    oss << "&alternatives=";
    oss << var;
  }
  if (switch_true(switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_NUMERALS"))) {
    oss << "&numerals=true";
  }

  // comma-separated lists expand to one query parameter per entry
  const char* hints = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_SEARCH");
  if (hints) {
    char *phrases[500] = { 0 };
    int argc = switch_separate_string((char *)hints, ',', phrases, 500);
    for (int i = 0; i < argc; i++) {
      oss << "&search=";
      oss << encodeURIComponent(phrases[i]);
    }
  }
  const char* keywords = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_KEYWORDS");
  if (keywords) {
    char *phrases[500] = { 0 };
    int argc = switch_separate_string((char *)keywords, ',', phrases, 500);
    for (int i = 0; i < argc; i++) {
      oss << "&keywords=";
      oss << encodeURIComponent(phrases[i]);
    }
  }
  const char* replace = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_REPLACE");
  if (replace) {
    char *phrases[500] = { 0 };
    int argc = switch_separate_string((char *)replace, ',', phrases, 500);
    for (int i = 0; i < argc; i++) {
      oss << "&replace=";
      oss << encodeURIComponent(phrases[i]);
    }
  }
  if ((var = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_TAG"))) {
    oss << "&tag=";
    oss << var;
  }
  if (interim) {
    oss << "&interim_results=true";
  }
  if ((var = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_ENDPOINTING"))) {
    oss << "&endpointing=";
    oss << var;
  }
  if ((var = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_UTTERANCE_END_MS"))) {
    oss << "&utterance_end_ms=";
    oss << var;
  }
  if ((var = switch_channel_get_variable(channel, "DEEPGRAM_SPEECH_VAD_TURNOFF"))) {
    oss << "&vad_turnoff=";
    oss << var;
  }
  oss << "&encoding=linear16";
  // NOTE(review): sample rate is hard-coded to 8000 and the sampleRate
  // parameter is unused — confirm this matches the audio actually sent.
  oss << "&sample_rate=8000";
  path = oss.str();
  return path;
}
|
||||
|
||||
// Callback invoked by the AudioPipe (on the lws service thread) for
// connection lifecycle events and received transcripts.  Looks the session
// up by uuid, finds the media bug's private data, and forwards the event to
// the module's responseHandler.  Ordering is critical: for CONNECT_FAIL /
// CONNECTION_DROPPED / CONNECTION_CLOSED_GRACEFULLY, pAudioPipe is cleared
// before anything else because the AudioPipe object is no longer valid after
// those events.
static void eventCallback(const char* sessionId, deepgram::AudioPipe::NotifyEvent_t event, const char* message, bool finished) {
  switch_core_session_t* session = switch_core_session_locate(sessionId);
  if (session) {
    switch_channel_t *channel = switch_core_session_get_channel(session);
    switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, MY_BUG_NAME);
    if (bug) {
      private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
      if (tech_pvt) {
        switch (event) {
          case deepgram::AudioPipe::CONNECT_SUCCESS:
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "connection successful\n");
            tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_CONNECT_SUCCESS, NULL, tech_pvt->bugname, finished);
          break;
          case deepgram::AudioPipe::CONNECT_FAIL:
          {
            // first thing: we can no longer access the AudioPipe
            std::stringstream json;
            json << "{\"reason\":\"" << message << "\"}";
            tech_pvt->pAudioPipe = nullptr;
            tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_CONNECT_FAIL, (char *) json.str().c_str(), tech_pvt->bugname, finished);
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "connection failed: %s\n", message);
          }
          break;
          case deepgram::AudioPipe::CONNECTION_DROPPED:
            // first thing: we can no longer access the AudioPipe
            tech_pvt->pAudioPipe = nullptr;
            tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_DISCONNECT, NULL, tech_pvt->bugname, finished);
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection dropped from far end\n");
          break;
          case deepgram::AudioPipe::CONNECTION_CLOSED_GRACEFULLY:
            // first thing: we can no longer access the AudioPipe
            tech_pvt->pAudioPipe = nullptr;
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection closed gracefully\n");
          break;
          case deepgram::AudioPipe::MESSAGE:
            // suppress deepgram results whose transcript is empty
            if( strstr(message, emptyTranscript)) {
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "discarding empty deepgram transcript\n");
            }
            else {
              tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_RESULTS, message, tech_pvt->bugname, finished);
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "deepgram message: %s\n", message);
            }
          break;

          default:
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "got unexpected msg from deepgram %d:%s\n", event, message);
            break;
        }
      }
    }
    switch_core_session_rwunlock(session);
  }
}
|
||||
// Initialize the per-session private data for a transcribe session: builds
// the deepgram request path, creates the AudioPipe (sized to buffer
// nAudioBufferSecs of audio plus LWS_PRE header space), and sets up a
// resampler if the session's rate differs from the desired rate.
// Returns SWITCH_STATUS_FALSE if no API key is available or resampler
// creation fails.
switch_status_t fork_data_init(private_t *tech_pvt, switch_core_session_t *session,
  int sampling, int desiredSampling, int channels, char *lang, int interim,
  char* bugname, responseHandler_t responseHandler) {

  int err;
  switch_codec_implementation_t read_impl;
  switch_channel_t *channel = switch_core_session_get_channel(session);

  switch_core_session_get_read_impl(session, &read_impl);

  // zero the whole struct first, so the strncpy calls below start from
  // zero-filled buffers
  memset(tech_pvt, 0, sizeof(private_t));

  std::string path;
  constructPath(session, path, desiredSampling, channels, lang, interim);
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "path: %s\n", path.c_str());

  // NOTE(review): strncpy with the full buffer length leaves the string
  // unterminated if the source is >= the limit — assumes MAX_* leave room
  // for the NUL; confirm the buffer declarations.
  strncpy(tech_pvt->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
  strncpy(tech_pvt->host, "api.deepgram.com", MAX_WS_URL_LEN);
  tech_pvt->port = 443;
  strncpy(tech_pvt->path, path.c_str(), MAX_PATH_LEN);
  tech_pvt->sampling = desiredSampling;
  tech_pvt->responseHandler = responseHandler;
  tech_pvt->channels = channels;
  tech_pvt->id = ++idxCallCount;
  tech_pvt->buffer_overrun_notified = 0;

  // bytes needed to hold nAudioBufferSecs of 20ms linear16 frames at the
  // desired rate/channels, plus lws's required pre-header space
  size_t buflen = LWS_PRE + (FRAME_SIZE_8000 * desiredSampling / 8000 * channels * 1000 / RTP_PACKETIZATION_PERIOD * nAudioBufferSecs);

  // per-channel key wins; otherwise fall back to the module-wide default
  const char* apiKey = switch_channel_get_variable(channel, "DEEPGRAM_API_KEY");
  if (!apiKey && defaultApiKey) apiKey = defaultApiKey;
  else if (!apiKey) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "no deepgram api key provided\n");
    return SWITCH_STATUS_FALSE;
  }

  deepgram::AudioPipe* ap = new deepgram::AudioPipe(tech_pvt->sessionId, tech_pvt->host, tech_pvt->port, tech_pvt->path,
    buflen, read_impl.decoded_bytes_per_packet, apiKey, eventCallback);
  if (!ap) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error allocating AudioPipe\n");
    return SWITCH_STATUS_FALSE;
  }

  tech_pvt->pAudioPipe = static_cast<void *>(ap);

  switch_mutex_init(&tech_pvt->mutex, SWITCH_MUTEX_NESTED, switch_core_session_get_pool(session));

  if (desiredSampling != sampling) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) resampling from %u to %u\n", tech_pvt->id, sampling, desiredSampling);
    tech_pvt->resampler = speex_resampler_init(channels, sampling, desiredSampling, SWITCH_RESAMPLE_QUALITY, &err);
    if (0 != err) {
      // NOTE(review): the AudioPipe allocated above is not freed on this
      // path — presumably the caller invokes destroy_tech_pvt on failure;
      // confirm.
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing resampler: %s.\n", speex_resampler_strerror(err));
      return SWITCH_STATUS_FALSE;
    }
  }
  else {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) no resampling needed for this call\n", tech_pvt->id);
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) fork_data_init\n", tech_pvt->id);

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
void lws_logger(int level, const char *line) {
|
||||
switch_log_level_t llevel = SWITCH_LOG_DEBUG;
|
||||
|
||||
switch (level) {
|
||||
case LLL_ERR: llevel = SWITCH_LOG_ERROR; break;
|
||||
case LLL_WARN: llevel = SWITCH_LOG_WARNING; break;
|
||||
case LLL_NOTICE: llevel = SWITCH_LOG_NOTICE; break;
|
||||
case LLL_INFO: llevel = SWITCH_LOG_INFO; break;
|
||||
break;
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "%s\n", line);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern "C" {
|
||||
/**
 * One-time module initialization: configures libwebsockets logging, starts the
 * AudioPipe lws service thread, and captures a default deepgram api key from
 * the DEEPGRAM_API_KEY environment variable when present.
 */
switch_status_t dg_transcribe_init() {
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_deepgram_transcribe: audio buffer (in secs): %d secs\n", nAudioBufferSecs);

  /* bug fix: the original used logical || between LLL_NOTICE and LLL_INFO,
   * which evaluated to the integer 1 and silently dropped most log categories;
   * the lws log mask must be built with bitwise OR */
  int logs = LLL_ERR | LLL_WARN | LLL_NOTICE | LLL_INFO | LLL_PARSER | LLL_HEADER | LLL_EXT | LLL_CLIENT | LLL_LATENCY | LLL_DEBUG;

  deepgram::AudioPipe::initialize(logs, lws_logger);
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "AudioPipe::initialize completed\n");

  const char* apiKey = std::getenv("DEEPGRAM_API_KEY");
  if (NULL == apiKey) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE,
      "\"DEEPGRAM_API_KEY\" env var not set; authentication will expect channel variables of same names to be set\n");
  }
  else {
    hasDefaultCredentials = true;
    defaultApiKey = apiKey;
  }
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/**
 * Module teardown: stops the shared AudioPipe/lws machinery and reports
 * whether the shutdown completed cleanly.
 */
switch_status_t dg_transcribe_cleanup() {
  return deepgram::AudioPipe::deinitialize() ? SWITCH_STATUS_SUCCESS : SWITCH_STATUS_FALSE;
}
|
||||
|
||||
/**
 * Start a deepgram transcription session on a channel.
 *
 * Allocates the per-session private_t from the session memory pool (so it is
 * freed with the session), initializes it via fork_data_init, hands it back to
 * the caller through ppUserData, and kicks off the websocket connection.
 *
 * Returns SWITCH_STATUS_FALSE on allocation or init failure (after releasing
 * any partially-built state via destroy_tech_pvt).
 */
switch_status_t dg_transcribe_session_init(switch_core_session_t *session,
      responseHandler_t responseHandler, uint32_t samples_per_second, uint32_t channels,
      char* lang, int interim, char* bugname, void **ppUserData)
{
  /* allocate per-session data structure from the session pool */
  private_t* tech_pvt = (private_t *) switch_core_session_alloc(session, sizeof(private_t));
  if (!tech_pvt) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "error allocating memory!\n");
    return SWITCH_STATUS_FALSE;
  }

  /* deepgram is asked for 8kHz audio regardless of the channel's native rate */
  if (SWITCH_STATUS_SUCCESS != fork_data_init(tech_pvt, session, samples_per_second, 8000, channels, lang, interim, bugname, responseHandler)) {
    destroy_tech_pvt(tech_pvt);
    return SWITCH_STATUS_FALSE;
  }

  *ppUserData = tech_pvt;

  deepgram::AudioPipe *pAudioPipe = static_cast<deepgram::AudioPipe *>(tech_pvt->pAudioPipe);
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connecting now\n");
  pAudioPipe->connect();  /* asynchronous; completion reported via eventCallback */
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection in progress\n");
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/**
 * Stop a transcription session: detach the media bug (unless the channel is
 * already closing), drain final responses from deepgram via reaper(), and
 * destroy the per-session state.
 */
switch_status_t dg_transcribe_session_stop(switch_core_session_t *session,int channelIsClosing, char* bugname) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, MY_BUG_NAME);
  if (!bug) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "dg_transcribe_session_stop: no bug - websocket conection already closed\n");
    return SWITCH_STATUS_FALSE;
  }
  private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);

  /* bug fix: the null check previously came after tech_pvt->id was read */
  if (!tech_pvt) return SWITCH_STATUS_FALSE;

  uint32_t id = tech_pvt->id;

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) dg_transcribe_session_stop\n", id);

  /* close connection and get final responses; the mutex serializes against
   * dg_transcribe_frame running on the media thread */
  switch_mutex_lock(tech_pvt->mutex);
  /* NOTE(review): the bug was looked up under MY_BUG_NAME but is cleared under
   * the caller-supplied bugname - confirm these always match */
  switch_channel_set_private(channel, bugname, NULL);
  if (!channelIsClosing) switch_core_media_bug_remove(session, &bug);

  deepgram::AudioPipe *pAudioPipe = static_cast<deepgram::AudioPipe *>(tech_pvt->pAudioPipe);
  if (pAudioPipe) reaper(tech_pvt);
  destroy_tech_pvt(tech_pvt);
  switch_mutex_unlock(tech_pvt->mutex);
  switch_mutex_destroy(tech_pvt->mutex);
  tech_pvt->mutex = nullptr;
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) dg_transcribe_session_stop\n", id);
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/**
 * Media-bug read callback body: drains all queued audio frames from the bug
 * and appends them to the AudioPipe's binary send buffer, resampling when the
 * codec rate differs from the rate requested of deepgram.
 *
 * Uses trylock so the media thread never blocks behind the session-stop path;
 * if the lock is contended the frames are simply picked up on the next tick.
 * Always returns SWITCH_TRUE to keep the bug alive.
 */
switch_bool_t dg_transcribe_frame(switch_core_session_t *session, switch_media_bug_t *bug) {
  private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
  size_t inuse = 0;     /* NOTE(review): unused local */
  bool dirty = false;   /* NOTE(review): set below but never read */
  char *p = (char *) "{\"msg\": \"buffer overrun\"}";  /* NOTE(review): unused local */

  if (!tech_pvt) return SWITCH_TRUE;

  if (switch_mutex_trylock(tech_pvt->mutex) == SWITCH_STATUS_SUCCESS) {
    /* session may be mid-teardown; bail if the pipe is gone or not yet connected */
    if (!tech_pvt->pAudioPipe) {
      switch_mutex_unlock(tech_pvt->mutex);
      return SWITCH_TRUE;
    }
    deepgram::AudioPipe *pAudioPipe = static_cast<deepgram::AudioPipe *>(tech_pvt->pAudioPipe);
    if (pAudioPipe->getLwsState() != deepgram::AudioPipe::LWS_CLIENT_CONNECTED) {
      switch_mutex_unlock(tech_pvt->mutex);
      return SWITCH_TRUE;
    }

    pAudioPipe->lockAudioBuffer();
    size_t available = pAudioPipe->binarySpaceAvailable();
    if (NULL == tech_pvt->resampler) {
      /* no resampling: read frames directly into the pipe's write pointer */
      switch_frame_t frame = { 0 };
      frame.data = pAudioPipe->binaryWritePtr();
      frame.buflen = available;
      while (true) {

        // check if buffer would be overwritten; dump packets if so
        if (available < pAudioPipe->binaryMinSpace()) {
          if (!tech_pvt->buffer_overrun_notified) {
            /* notify the app once per overrun episode */
            tech_pvt->buffer_overrun_notified = 1;
            tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_BUFFER_OVERRUN, NULL, tech_pvt->bugname, 0);
          }
          switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%u) dropping packets!\n",
            tech_pvt->id);
          pAudioPipe->binaryWritePtrResetToZero();

          frame.data = pAudioPipe->binaryWritePtr();
          frame.buflen = available = pAudioPipe->binarySpaceAvailable();
        }

        switch_status_t rv = switch_core_media_bug_read(bug, &frame, SWITCH_TRUE);
        if (rv != SWITCH_STATUS_SUCCESS) break;  /* bug queue drained */
        if (frame.datalen) {
          /* advance the pipe's write pointer past the bytes just read in place */
          pAudioPipe->binaryWritePtrAdd(frame.datalen);
          frame.buflen = available = pAudioPipe->binarySpaceAvailable();
          frame.data = pAudioPipe->binaryWritePtr();
          dirty = true;
        }
      }
    }
    else {
      /* resampling path: read into a stack buffer, resample into the pipe */
      uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
      switch_frame_t frame = { 0 };
      frame.data = data;
      frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;
      while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS) {
        if (frame.datalen) {
          spx_uint32_t out_len = available >> 1; // space for samples which are 2 bytes
          spx_uint32_t in_len = frame.samples;

          speex_resampler_process_interleaved_int(tech_pvt->resampler,
            (const spx_int16_t *) frame.data,
            (spx_uint32_t *) &in_len,
            (spx_int16_t *) ((char *) pAudioPipe->binaryWritePtr()),
            &out_len);

          if (out_len > 0) {
            // bytes written = num samples * 2 * num channels
            /* NOTE(review): out_len << channels equals out_len * 2 * channels only
             * for channels == 1 or 2 - confirm channel count is capped at 2 */
            size_t bytes_written = out_len << tech_pvt->channels;
            pAudioPipe->binaryWritePtrAdd(bytes_written);
            available = pAudioPipe->binarySpaceAvailable();
            dirty = true;
          }
          if (available < pAudioPipe->binaryMinSpace()) {
            if (!tech_pvt->buffer_overrun_notified) {
              tech_pvt->buffer_overrun_notified = 1;
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%u) dropping packets!\n",
                tech_pvt->id);
              tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_BUFFER_OVERRUN, NULL, tech_pvt->bugname, 0);
            }
            break;
          }
        }
      }
    }

    pAudioPipe->unlockAudioBuffer();
    switch_mutex_unlock(tech_pvt->mutex);
  }
  return SWITCH_TRUE;
}
|
||||
}
|
||||
11
mod_deepgram_transcribe/dg_transcribe_glue.h
Normal file
11
mod_deepgram_transcribe/dg_transcribe_glue.h
Normal file
@@ -0,0 +1,11 @@
|
||||
#ifndef __DG_GLUE_H__
#define __DG_GLUE_H__

/* One-time module init/teardown of the shared deepgram/lws machinery. */
switch_status_t dg_transcribe_init();
switch_status_t dg_transcribe_cleanup();
/* Start transcription on a session; hands back per-session state via ppUserData. */
switch_status_t dg_transcribe_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
  uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char* bugname, void **ppUserData);
/* Stop transcription; channelIsClosing != 0 means the channel teardown owns the bug. */
switch_status_t dg_transcribe_session_stop(switch_core_session_t *session, int channelIsClosing, char* bugname);
/* Media-bug read callback body; always returns SWITCH_TRUE to keep the bug alive. */
switch_bool_t dg_transcribe_frame(switch_core_session_t *session, switch_media_bug_t *bug);

#endif
|
||||
211
mod_deepgram_transcribe/mod_deepgram_transcribe.c
Normal file
211
mod_deepgram_transcribe/mod_deepgram_transcribe.c
Normal file
@@ -0,0 +1,211 @@
|
||||
/*
|
||||
*
|
||||
* mod_deepgram_transcribe.c -- Freeswitch module for using dg streaming transcribe api
|
||||
*
|
||||
*/
|
||||
#include "mod_deepgram_transcribe.h"
|
||||
#include "dg_transcribe_glue.h"
|
||||
|
||||
/* Prototypes */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_deepgram_transcribe_shutdown);
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_deepgram_transcribe_load);
|
||||
|
||||
SWITCH_MODULE_DEFINITION(mod_deepgram_transcribe, mod_deepgram_transcribe_load, mod_deepgram_transcribe_shutdown, NULL);
|
||||
|
||||
static switch_status_t do_stop(switch_core_session_t *session, char* bugname);
|
||||
|
||||
/*
 * Fire a FreeSWITCH custom event carrying a deepgram response.
 * json (may be NULL) becomes the event body; bugname (may be NULL) is added
 * as the media-bugname header so consumers can tell concurrent bugs apart.
 */
static void responseHandler(switch_core_session_t* session,
  const char* eventName, const char * json, const char* bugname, int finished) {
  switch_event_t *event;
  switch_channel_t *channel = switch_core_session_get_channel(session);

  /* bug fix: check event creation before using the event pointer */
  if (switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, eventName) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "responseHandler: failed to create event %s\n", eventName);
    return;
  }
  switch_channel_event_set_data(channel, event);
  switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "deepgram");
  switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-session-finished", finished ? "true" : "false");
  if (finished) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "responseHandler returning event %s, from finished recognition session\n", eventName);
  }
  if (json) switch_event_add_body(event, "%s", json);
  if (bugname) switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "media-bugname", bugname);
  switch_event_fire(&event);
}
|
||||
|
||||
|
||||
/*
 * Media bug lifecycle callback: forwards captured audio on READ, tears the
 * deepgram session down on CLOSE, and ignores everything else.
 */
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
{
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);

  if (type == SWITCH_ABC_TYPE_READ) {
    /* hot path: hand the queued frames to the websocket glue */
    return dg_transcribe_frame(session, bug);
  }

  if (type == SWITCH_ABC_TYPE_INIT) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_INIT.\n");
  }
  else if (type == SWITCH_ABC_TYPE_CLOSE) {
    private_t *tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_CLOSE.\n");

    /* channelIsClosing = 1: the channel teardown owns removal of the bug */
    dg_transcribe_session_stop(session, 1, tech_pvt->bugname);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Finished SWITCH_ABC_TYPE_CLOSE.\n");
  }
  /* SWITCH_ABC_TYPE_WRITE and all other events: nothing to do */

  return SWITCH_TRUE;
}
|
||||
|
||||
/*
 * Begin transcription on a channel: tears down any previous transcribe bug,
 * initializes the deepgram session, and attaches a read-stream media bug that
 * feeds audio to it.  Returns SWITCH_STATUS_FALSE / the bug-add status on error.
 */
static switch_status_t start_capture(switch_core_session_t *session, switch_media_bug_flag_t flags,
  char* lang, int interim, char* bugname)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug;
  switch_status_t status;
  switch_codec_implementation_t read_impl = { 0 };
  void *pUserData;
  uint32_t samples_per_second;

  /* only one transcribe bug per channel under MY_BUG_NAME; stop any prior one */
  if (switch_channel_get_private(channel, MY_BUG_NAME)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing bug from previous transcribe\n");
    do_stop(session, bugname);
  }

  switch_core_session_get_read_impl(session, &read_impl);

  /* media must be flowing before we can tap it */
  if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
    return SWITCH_STATUS_FALSE;
  }

  /* g722 reports 8kHz in samples_per_second but actually decodes at 16kHz */
  samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;

  if (SWITCH_STATUS_FALSE == dg_transcribe_session_init(session, responseHandler, samples_per_second, flags & SMBF_STEREO ? 2 : 1, lang, interim, bugname, &pUserData)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing dg speech session.\n");
    return SWITCH_STATUS_FALSE;
  }
  /* NOTE(review): if the bug cannot be added the deepgram session started above
   * is not stopped here - confirm whether that websocket leaks on this path */
  if ((status = switch_core_media_bug_add(session, "dg_transcribe", NULL, capture_callback, pUserData, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
    return status;
  }
  switch_channel_set_private(channel, MY_BUG_NAME, bug);
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "added media bug for dg transcribe\n");

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/*
 * Stop transcription on a session if a transcribe bug is attached.
 * Returns SWITCH_STATUS_SUCCESS when no bug exists (nothing to do).
 */
static switch_status_t do_stop(switch_core_session_t *session, char* bugname)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = switch_channel_get_private(channel, MY_BUG_NAME);
  switch_status_t rc = SWITCH_STATUS_SUCCESS;

  if (bug) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Received user command command to stop transcribe.\n");
    rc = dg_transcribe_session_stop(session, 0, bugname);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "stopped transcribe.\n");
  }

  return rc;
}
|
||||
|
||||
#define TRANSCRIBE_API_SYNTAX "<uuid> [start|stop] lang-code [interim] [stereo|mono]"
|
||||
SWITCH_STANDARD_API(dg_transcribe_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[6] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
switch_media_bug_flag_t flags = SMBF_READ_STREAM /* | SMBF_WRITE_STREAM | SMBF_READ_PING */;
|
||||
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) ||
|
||||
(!strcasecmp(argv[1], "stop") && argc < 2) ||
|
||||
(!strcasecmp(argv[1], "start") && argc < 3) ||
|
||||
zstr(argv[0])) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
if (!strcasecmp(argv[1], "stop")) {
|
||||
char *bugname = argc > 2 ? argv[2] : MY_BUG_NAME;
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "stop transcribing\n");
|
||||
status = do_stop(lsession, bugname);
|
||||
} else if (!strcasecmp(argv[1], "start")) {
|
||||
char* lang = argv[2];
|
||||
int interim = argc > 3 && !strcmp(argv[3], "interim");
|
||||
char *bugname = argc > 5 ? argv[5] : MY_BUG_NAME;
|
||||
if (argc > 4 && !strcmp(argv[4], "stereo")) {
|
||||
flags |= SMBF_WRITE_STREAM ;
|
||||
flags |= SMBF_STEREO;
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "start transcribing %s %s\n", lang, interim ? "interim": "complete");
|
||||
status = start_capture(lsession, flags, lang, interim, bugname);
|
||||
}
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Module load: registers the custom event subclass, initializes the deepgram
 * glue layer, and exposes the uuid_deepgram_transcribe API command.
 * Macro expands to: switch_status_t mod_deepgram_transcribe_load(...)
 */
SWITCH_MODULE_LOAD_FUNCTION(mod_deepgram_transcribe_load)
{
  switch_api_interface_t *api_interface;

  /* create/register custom event message type */
  if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_RESULTS) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_RESULTS);
    return SWITCH_STATUS_TERM;
  }

  /* connect my internal structure to the blank pointer passed to me */
  *module_interface = switch_loadable_module_create_module_interface(pool, modname);

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Deepgram Speech Transcription API loading..\n");

  /* NOTE(review): load continues even if glue init fails - confirm intentional */
  if (SWITCH_STATUS_FALSE == dg_transcribe_init()) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Failed initializing dg speech interface\n");
  }

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Deepgram Speech Transcription API successfully loaded\n");

  SWITCH_ADD_API(api_interface, "uuid_deepgram_transcribe", "Deepgram Speech Transcription API", dg_transcribe_function, TRANSCRIBE_API_SYNTAX);
  switch_console_set_complete("add uuid_deepgram_transcribe start lang-code [interim|final] [stereo|mono]");
  switch_console_set_complete("add uuid_deepgram_transcribe stop ");

  /* indicate that the module should continue to be loaded */
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/*
  Called when the system shuts down: tears down the deepgram glue layer,
  then releases the custom event subclass registered at load time.
  Macro expands to: switch_status_t mod_deepgram_transcribe_shutdown() */
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_deepgram_transcribe_shutdown)
{
  dg_transcribe_cleanup();
  switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS);
  return SWITCH_STATUS_SUCCESS;
}
|
||||
46
mod_deepgram_transcribe/mod_deepgram_transcribe.h
Normal file
46
mod_deepgram_transcribe/mod_deepgram_transcribe.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/* fix: guard was __MOD_AWS_TRANSCRIBE_H__, copy-pasted from the AWS module and
 * using a reserved identifier (leading double underscore); it would also clash
 * if both modules' headers were ever included together */
#ifndef MOD_DEEPGRAM_TRANSCRIBE_H
#define MOD_DEEPGRAM_TRANSCRIBE_H

#include <switch.h>
#include <speex/speex_resampler.h>

#include <unistd.h>

/* key under which the media bug is stored as channel private data */
#define MY_BUG_NAME "deepgram_transcribe"

/* custom event subclasses fired toward the application */
#define TRANSCRIBE_EVENT_RESULTS "deepgram_transcribe::transcription"
#define TRANSCRIBE_EVENT_NO_AUDIO_DETECTED "deepgram_transcribe::no_audio_detected"
#define TRANSCRIBE_EVENT_VAD_DETECTED "deepgram_transcribe::vad_detected"
#define TRANSCRIBE_EVENT_CONNECT_SUCCESS "deepgram_transcribe::connect"
#define TRANSCRIBE_EVENT_CONNECT_FAIL "deepgram_transcribe::connect_failed"
#define TRANSCRIBE_EVENT_BUFFER_OVERRUN "deepgram_transcribe::buffer_overrun"
#define TRANSCRIBE_EVENT_DISCONNECT "deepgram_transcribe::disconnect"

/* fixed-size buffer limits for the per-session state below */
#define MAX_LANG (12)
#define MAX_SESSION_ID (256)
#define MAX_WS_URL_LEN (512)
#define MAX_PATH_LEN (4096)
#define MAX_BUG_LEN (64)

/* callback through which transcription results/events reach the module layer */
typedef void (*responseHandler_t)(switch_core_session_t* session, const char* eventName, const char* json, const char* bugname, int finished);

/* per-session transcription state, allocated from the session memory pool */
struct private_data {
  switch_mutex_t *mutex;             /* serializes media-thread frames vs. stop */
  char sessionId[MAX_SESSION_ID];    /* freeswitch session uuid */
  SpeexResamplerState *resampler;    /* NULL when no rate conversion is needed */
  responseHandler_t responseHandler;
  void *pAudioPipe;                  /* deepgram::AudioPipe*, opaque to the C side */
  int ws_state;
  char host[MAX_WS_URL_LEN];         /* websocket host (api.deepgram.com) */
  unsigned int port;
  char path[MAX_PATH_LEN];           /* websocket request path incl. query args */
  char bugname[MAX_BUG_LEN+1];
  int sampling;                      /* rate requested of deepgram */
  int channels;
  unsigned int id;                   /* monotonically increasing call counter, for logs */
  /* fix: 1-bit bitfields must be unsigned - a signed 1-bit field can only
   * hold 0 and -1, making `= 1` implementation-defined */
  unsigned int buffer_overrun_notified:1;
  unsigned int is_finished:1;
};

typedef struct private_data private_t;

#endif
|
||||
21
mod_deepgram_transcribe/parser.cpp
Normal file
21
mod_deepgram_transcribe/parser.cpp
Normal file
@@ -0,0 +1,21 @@
|
||||
#include "parser.hpp"
|
||||
#include <switch.h>
|
||||
|
||||
/**
 * Parse an incoming websocket message as JSON and report its "type" field.
 * On success returns the parsed document (caller must cJSON_Delete it) and
 * sets `type` to the message's "type" string, defaulting to "json" when the
 * field is absent.  Returns NULL (after logging) when parsing fails.
 */
cJSON* parse_json(switch_core_session_t* session, const std::string& data, std::string& type) {
  cJSON* doc = cJSON_Parse(data.c_str());
  if (nullptr == doc) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "parse - failed parsing incoming msg as JSON: %s\n", data.c_str());
    return nullptr;
  }

  const char* szType = cJSON_GetObjectCstr(doc, "type");
  type.assign(szType ? szType : "json");
  return doc;
}
|
||||
9
mod_deepgram_transcribe/parser.hpp
Normal file
9
mod_deepgram_transcribe/parser.hpp
Normal file
@@ -0,0 +1,9 @@
|
||||
#ifndef __PARSER_H__
#define __PARSER_H__

#include <string>
#include <switch_json.h>

/* Parse `data` as JSON; sets `type` from the message's "type" field ("json"
 * when absent).  Returns NULL on parse failure; caller owns the returned doc. */
cJSON* parse_json(switch_core_session_t* session, const std::string& data, std::string& type) ;

#endif
|
||||
51
mod_deepgram_transcribe/simple_buffer.h
Normal file
51
mod_deepgram_transcribe/simple_buffer.h
Normal file
@@ -0,0 +1,51 @@
|
||||
/**
 * (very) simple and limited circular buffer of fixed-size chunks,
 * supporting only the use case of doing all of the adds
 * and then subsequently retrieves.
 *
 * Fixes over the original:
 *  - getNextChunk() on an empty buffer no longer underflows the unsigned
 *    item count (the old `if (numItems--)` wrapped 0 to UINT32_MAX);
 *  - reads start at the OLDEST stored chunk even when the buffer has not
 *    wrapped (the old code read from the next write position, which for a
 *    partially-filled buffer returned unwritten memory);
 *  - member initializers follow declaration order.
 *
 * NOTE: owns a raw heap array; copying an instance would double-free.
 * Do not copy.
 */
class SimpleBuffer {
public:
  SimpleBuffer(uint32_t chunkSize, uint32_t numChunks) :
    m_pData(new char[chunkSize * numChunks]), numItems(0),
    m_chunkSize(chunkSize), m_numChunks(numChunks), m_writeIdx(0) {
  }
  ~SimpleBuffer() {
    delete [] m_pData;
  }

  /* Append whole chunks; datalen must be a multiple of the chunk size
   * (otherwise the call is silently ignored, matching the original contract).
   * When full, the oldest chunks are overwritten. */
  void add(void *data, uint32_t datalen) {
    if (datalen % m_chunkSize != 0) return;
    uint32_t chunks = datalen / m_chunkSize;
    for (uint32_t i = 0; i < chunks; i++) {
      memcpy(m_pData + m_writeIdx * m_chunkSize, data, m_chunkSize);
      data = static_cast<char*>(data) + m_chunkSize;
      m_writeIdx = (m_writeIdx + 1) % m_numChunks;
      if (numItems < m_numChunks) numItems++;
    }
  }

  /* Return the oldest unread chunk, or nullptr when empty. */
  char* getNextChunk() {
    if (numItems == 0) return nullptr;  /* guard BEFORE decrementing (unsigned) */
    uint32_t readIdx = (m_writeIdx + m_numChunks - numItems) % m_numChunks;
    numItems--;
    return m_pData + readIdx * m_chunkSize;
  }

  uint32_t getNumItems() { return numItems;}

private:
  char *m_pData;        /* owned backing store: m_chunkSize * m_numChunks bytes */
  uint32_t numItems;    /* count of stored-but-unread chunks */
  uint32_t m_chunkSize;
  uint32_t m_numChunks;
  uint32_t m_writeIdx;  /* index of the next chunk slot to write */
};
|
||||
8
mod_dialogflow/LICENSE
Normal file
8
mod_dialogflow/LICENSE
Normal file
@@ -0,0 +1,8 @@
|
||||
Copyright 2023, Drachtio Communications Services, LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
10
mod_dialogflow/Makefile.am
Normal file
10
mod_dialogflow/Makefile.am
Normal file
@@ -0,0 +1,10 @@
|
||||
include $(top_srcdir)/build/modmake.rulesam
|
||||
MODNAME=mod_dialogflow
|
||||
|
||||
mod_LTLIBRARIES = mod_dialogflow.la
|
||||
mod_dialogflow_la_SOURCES = mod_dialogflow.c google_glue.cpp parser.cpp
|
||||
mod_dialogflow_la_CFLAGS = $(AM_CFLAGS)
|
||||
mod_dialogflow_la_CXXFLAGS = -I $(top_srcdir)/libs/googleapis/gens $(AM_CXXFLAGS) -std=c++17
|
||||
|
||||
mod_dialogflow_la_LIBADD = $(switch_builddir)/libfreeswitch.la
|
||||
mod_dialogflow_la_LDFLAGS = -avoid-version -module -no-undefined -shared `pkg-config --libs grpc++ grpc`
|
||||
84
mod_dialogflow/README.md
Normal file
84
mod_dialogflow/README.md
Normal file
@@ -0,0 +1,84 @@
|
||||
# mod_dialogflow
|
||||
|
||||
A Freeswitch module that connects a Freeswitch channel to a [dialogflow agent](https://dialogflow.com/docs/getting-started/first-agent) so that an IVR interaction can be driven completely by dialogflow logic.
|
||||
|
||||
Once a Freeswitch channel is connected to a dialogflow agent, media is streamed to the dialogflow service, which returns information describing the "intent" that was detected, along with transcriptions and audio prompts and text to play to the caller. The handling of returned audio by the module is two-fold:
|
||||
1. If an audio clip was returned, it is *not* immediately played to the caller, but instead is written to a temporary wave file on the Freeswitch server.
|
||||
2. Next, a Freeswitch custom event is sent to the application containing the details of the dialogflow response as well as the path to the wave file.
|
||||
|
||||
This allows the application whether to decide to play the returned audio clip (via the mod_dptools 'play' command), or to use a text-to-speech service to generate audio using the returned prompt text.
|
||||
|
||||
## API
|
||||
|
||||
### Commands
|
||||
The freeswitch module exposes the following API commands:
|
||||
|
||||
#### dialogflow_start
|
||||
```
|
||||
dialogflow_start <uuid> <project-id> <lang-code> [<event>]
|
||||
```
|
||||
Attaches media bug to channel and performs streaming recognize request.
|
||||
- `uuid` - unique identifier of Freeswitch channel
|
||||
- `project-id` - the identifier of the dialogflow project to execute, which may optionally include a dialogflow environment, a region and output audio configurations (see below).
|
||||
- `lang-code` - a valid dialogflow [language tag](https://dialogflow.com/docs/reference/language) to use for speech recognition
|
||||
- `event` - name of an initial event to send to dialogflow; e.g. to trigger an initial prompt
|
||||
|
||||
When executing a dialogflow project, the environment and region will default to 'draft' and 'us', respectively.
|
||||
|
||||
To specify both an environment and a region, provide a value for project-id in the dialogflow_start command as follows:
|
||||
```
|
||||
dialogflow-project-id:environment:region, i.e myproject:production:eu-west1
|
||||
```
|
||||
To specify environment and default to the global region:
|
||||
```
|
||||
dialogflow-project-id:environment, i.e myproject:production
|
||||
```
|
||||
To specify a region and default environment:
|
||||
```
|
||||
dialogflow-project-id::region, i.e myproject::eu-west1
|
||||
```
|
||||
To simply use the defaults for both environment and region:
|
||||
```
|
||||
dialogflow-project-id, i.e myproject
|
||||
```
|
||||
|
||||
By default, [Output Audio configurations](https://cloud.google.com/dialogflow/es/docs/reference/rest/v2/OutputAudioConfig) and [Sentiment Analysis](https://cloud.google.com/dialogflow/es/docs/reference/rpc/google.cloud.dialogflow.v2beta1#google.cloud.dialogflow.v2beta1.SentimentAnalysisRequestConfig) will be ignored and the configs selected for [your agent in Dialogflow platform](https://dialogflow.cloud.google.com/) will be used, however if you wish to abstract your implementation from the platform and define them programmatically it can be done in the dialogflow_start command as follows:
|
||||
|
||||
```
|
||||
dialogflow-project-id:environment:region:speakingRate:pitch:volume:voice-name:voice-gender:effect:sentiment-analysis
|
||||
```
|
||||
|
||||
Example:
|
||||
```
|
||||
myproject:production:eu-west1:1.1:1.5:2.5:en-GB-Standard-D:F:handset-class-device:true
|
||||
```
|
||||
Speaking rate, pitch and volume should take the value of a double. Information [here](https://cloud.google.com/dialogflow/es/docs/reference/rest/v2/projects.agent.environments#synthesizespeechconfig).
|
||||
|
||||
Voice Name should take a valid Text-to-speech model name (choose available voices from https://cloud.google.com/text-to-speech/docs/voices). If not set, the Dialogflow service will choose a voice based on the other parameters such as language code and gender.
|
||||
|
||||
Voice Gender should be M for Male, F for Female, N for neutral gender or leave empty for Unspecified. If not set, the Dialogflow service will choose a voice based on the other parameters such as language code and name. Note that this is only a preference, not requirement. If a voice of the appropriate gender is not available, the synthesizer should substitute a voice with a different gender rather than failing the request.
|
||||
|
||||
Effects are applied on the text-to-speech and are used to improve the playback of an audio on different types of hardware. Available effects and information [here](https://cloud.google.com/text-to-speech/docs/audio-profiles#available_audio_profiles).
|
||||
|
||||
Sentiment Analysis uses Cloud Natural Language to provide a sentiment score for each user query. To enable send the boolean ```true```.
|
||||
|
||||
#### dialogflow_stop
|
||||
```
|
||||
dialogflow_stop <uuid>
|
||||
```
|
||||
Stops dialogflow on the channel.
|
||||
|
||||
### Events
|
||||
* `dialogflow::intent` - a dialogflow [intent](https://dialogflow.com/docs/intents) has been detected.
|
||||
* `dialogflow::transcription` - a transcription has been returned
|
||||
* `dialogflow::audio_provided` - an audio prompt has been returned from dialogflow. Dialogflow will return both an audio clip in linear 16 format, as well as the text of the prompt. The audio clip will be played out to the caller and the prompt text is returned to the application in this event.
|
||||
* `dialogflow::end_of_utterance` - dialogflow has detected the end of an utterance
|
||||
* `dialogflow::error` - dialogflow has returned an error
|
||||
## Usage
|
||||
When using [drachtio-fsmrf](https://www.npmjs.com/package/drachtio-fsmrf), you can access this API command via the api method on the 'endpoint' object.
|
||||
```js
|
||||
ep.api('dialogflow_start', `${ep.uuid} my-agent-uuxr:production en-US welcome`);
|
||||
```
|
||||
## Examples
|
||||
[drachtio-dialogflow-phone-gateway](https://github.com/davehorton/drachtio-dialogflow-phone-gateway)
|
||||
5
mod_dialogflow/conf/autoload_configs/dialogflow.xml
Normal file
5
mod_dialogflow/conf/autoload_configs/dialogflow.xml
Normal file
@@ -0,0 +1,5 @@
|
||||
<configuration name="dialogflow.conf" description="Google Dialogflow Configuration">
|
||||
<settings>
|
||||
<param name="google-application-credentials-json-file" value="/tmp/gcs_service_account_key.json"/>
|
||||
</settings>
|
||||
</configuration>
|
||||
595
mod_dialogflow/google_glue.cpp
Normal file
595
mod_dialogflow/google_glue.cpp
Normal file
@@ -0,0 +1,595 @@
|
||||
#include <cstdlib>
|
||||
|
||||
#include <switch.h>
|
||||
#include <switch_json.h>
|
||||
#include <grpc++/grpc++.h>
|
||||
#include <string.h>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
|
||||
#include <regex>
|
||||
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <map>
|
||||
|
||||
#include "google/cloud/dialogflow/v2beta1/session.grpc.pb.h"
|
||||
|
||||
#include "mod_dialogflow.h"
|
||||
#include "parser.h"
|
||||
|
||||
using google::cloud::dialogflow::v2beta1::Sessions;
|
||||
using google::cloud::dialogflow::v2beta1::StreamingDetectIntentRequest;
|
||||
using google::cloud::dialogflow::v2beta1::StreamingDetectIntentResponse;
|
||||
using google::cloud::dialogflow::v2beta1::AudioEncoding;
|
||||
using google::cloud::dialogflow::v2beta1::InputAudioConfig;
|
||||
using google::cloud::dialogflow::v2beta1::OutputAudioConfig;
|
||||
using google::cloud::dialogflow::v2beta1::SynthesizeSpeechConfig;
|
||||
using google::cloud::dialogflow::v2beta1::QueryInput;
|
||||
using google::cloud::dialogflow::v2beta1::QueryResult;
|
||||
using google::cloud::dialogflow::v2beta1::StreamingRecognitionResult;
|
||||
using google::cloud::dialogflow::v2beta1::EventInput;
|
||||
using google::rpc::Status;
|
||||
using google::protobuf::Struct;
|
||||
using google::protobuf::Value;
|
||||
using google::protobuf::MapPair;
|
||||
|
||||
static uint64_t playCount = 0;
|
||||
static std::multimap<std::string, std::string> audioFiles;
|
||||
static bool hasDefaultCredentials = false;
|
||||
|
||||
static switch_status_t hanguphook(switch_core_session_t *session) {
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
switch_channel_state_t state = switch_channel_get_state(channel);
|
||||
|
||||
if (state == CS_HANGUP || state == CS_ROUTING) {
|
||||
char * sessionId = switch_core_session_get_uuid(session);
|
||||
typedef std::multimap<std::string, std::string>::iterator MMAPIterator;
|
||||
std::pair<MMAPIterator, MMAPIterator> result = audioFiles.equal_range(sessionId);
|
||||
for (MMAPIterator it = result.first; it != result.second; it++) {
|
||||
std::string filename = it->second;
|
||||
std::remove(filename.c_str());
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG,
|
||||
"google_dialogflow_session_cleanup: removed audio file %s\n", filename.c_str());
|
||||
}
|
||||
audioFiles.erase(sessionId);
|
||||
switch_core_event_hook_remove_state_change(session, hanguphook);
|
||||
}
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static void parseEventParams(Struct* grpcParams, cJSON* json) {
|
||||
auto* map = grpcParams->mutable_fields();
|
||||
int count = cJSON_GetArraySize(json);
|
||||
for (int i = 0; i < count; i++) {
|
||||
cJSON* prop = cJSON_GetArrayItem(json, i);
|
||||
if (prop) {
|
||||
google::protobuf::Value v;
|
||||
switch (prop->type) {
|
||||
case cJSON_False:
|
||||
case cJSON_True:
|
||||
v.set_bool_value(prop->type == cJSON_True);
|
||||
break;
|
||||
|
||||
case cJSON_Number:
|
||||
v.set_number_value(prop->valuedouble);
|
||||
break;
|
||||
|
||||
case cJSON_String:
|
||||
v.set_string_value(prop->valuestring);
|
||||
break;
|
||||
|
||||
case cJSON_Array:
|
||||
case cJSON_Object:
|
||||
case cJSON_Raw:
|
||||
case cJSON_NULL:
|
||||
continue;
|
||||
}
|
||||
map->insert(MapPair<std::string, Value>(prop->string, v));
|
||||
}
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "parseEventParams: added %d event params\n", map->size());
|
||||
}
|
||||
|
||||
/**
 * Split |str| on |delim|, appending every field (including empty ones)
 * to |out|. An empty input yields a single empty field.
 */
void tokenize(std::string const &str, const char delim, std::vector<std::string> &out) {
  size_t pos = 0;
  for (;;) {
    size_t next = str.find(delim, pos);
    if (next == std::string::npos) {
      out.push_back(str.substr(pos));
      break;
    }
    out.push_back(str.substr(pos, next - pos));
    pos = next + 1;
  }
}
class GStreamer {
|
||||
public:
|
||||
GStreamer(switch_core_session_t *session, const char* lang, char* projectId, char* event, char* text) :
|
||||
m_lang(lang), m_sessionId(switch_core_session_get_uuid(session)), m_environment("draft"), m_regionId("us"),
|
||||
m_speakingRate(), m_pitch(), m_volume(), m_voiceName(""), m_voiceGender(""), m_effects(""),
|
||||
m_sentimentAnalysis(false), m_finished(false), m_packets(0) {
|
||||
const char* var;
|
||||
switch_channel_t* channel = switch_core_session_get_channel(session);
|
||||
std::vector<std::string> tokens;
|
||||
const char delim = ':';
|
||||
tokenize(projectId, delim, tokens);
|
||||
int idx = 0;
|
||||
for (auto &s: tokens) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer: token %d: '%s'\n", idx, s.c_str());
|
||||
if (0 == idx) m_projectId = s;
|
||||
else if (1 == idx && s.length() > 0) m_environment = s;
|
||||
else if (2 == idx && s.length() > 0) m_regionId = s;
|
||||
else if (3 == idx && s.length() > 0) m_speakingRate = stod(s);
|
||||
else if (4 == idx && s.length() > 0) m_pitch = stod(s);
|
||||
else if (5 == idx && s.length() > 0) m_volume = stod(s);
|
||||
else if (6 == idx && s.length() > 0) m_voiceName = s;
|
||||
else if (7 == idx && s.length() > 0) m_voiceGender = s;
|
||||
else if (8 == idx && s.length() > 0) m_effects = s;
|
||||
else if (9 == idx && s.length() > 0) m_sentimentAnalysis = (s == "true");
|
||||
idx++;
|
||||
}
|
||||
|
||||
std::string endpoint = "dialogflow.googleapis.com";
|
||||
if (0 != m_regionId.compare("us")) {
|
||||
endpoint = m_regionId;
|
||||
endpoint.append("-dialogflow.googleapis.com:443");
|
||||
}
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO,
|
||||
"GStreamer dialogflow endpoint is %s, region is %s, project is %s, environment is %s\n",
|
||||
endpoint.c_str(), m_regionId.c_str(), m_projectId.c_str(), m_environment.c_str());
|
||||
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_APPLICATION_CREDENTIALS")) {
|
||||
auto callCreds = grpc::ServiceAccountJWTAccessCredentials(var, INT64_MAX);
|
||||
auto channelCreds = grpc::SslCredentials(grpc::SslCredentialsOptions());
|
||||
auto creds = grpc::CompositeChannelCredentials(channelCreds, callCreds);
|
||||
m_channel = grpc::CreateChannel(endpoint, creds);
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer json credentials are %s\n", var);
|
||||
}
|
||||
else {
|
||||
auto creds = grpc::GoogleDefaultCredentials();
|
||||
m_channel = grpc::CreateChannel(endpoint, creds);
|
||||
}
|
||||
startStream(session, event, text);
|
||||
}
|
||||
|
||||
~GStreamer() {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::~GStreamer wrote %ld packets %p\n", m_packets, this);
|
||||
}
|
||||
|
||||
void startStream(switch_core_session_t *session, const char* event, const char* text) {
|
||||
char szSession[256];
|
||||
|
||||
m_request = std::make_shared<StreamingDetectIntentRequest>();
|
||||
m_context= std::make_shared<grpc::ClientContext>();
|
||||
m_stub = Sessions::NewStub(m_channel);
|
||||
|
||||
snprintf(szSession, 256, "projects/%s/locations/%s/agent/environments/%s/users/-/sessions/%s",
|
||||
m_projectId.c_str(), m_regionId.c_str(), m_environment.c_str(), m_sessionId.c_str());
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "GStreamer::startStream session %s, event %s, text %s %p\n", szSession, event, text, this);
|
||||
|
||||
m_request->set_session(szSession);
|
||||
auto* queryInput = m_request->mutable_query_input();
|
||||
if (event) {
|
||||
auto* eventInput = queryInput->mutable_event();
|
||||
eventInput->set_name(event);
|
||||
eventInput->set_language_code(m_lang.c_str());
|
||||
if (text) {
|
||||
cJSON* json = cJSON_Parse(text);
|
||||
if (!json) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "GStreamer::startStream ignoring event params since it is not json %s\n", text);
|
||||
}
|
||||
else {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::startStream adding event params (JSON) %s\n", text);
|
||||
auto* eventParams = eventInput->mutable_parameters();
|
||||
parseEventParams(eventParams, json);
|
||||
cJSON_Delete(json);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (text) {
|
||||
auto* textInput = queryInput->mutable_text();
|
||||
textInput->set_text(text);
|
||||
textInput->set_language_code(m_lang.c_str());
|
||||
}
|
||||
else {
|
||||
auto* audio_config = queryInput->mutable_audio_config();
|
||||
audio_config->set_sample_rate_hertz(16000);
|
||||
audio_config->set_audio_encoding(AudioEncoding::AUDIO_ENCODING_LINEAR_16);
|
||||
audio_config->set_language_code(m_lang.c_str());
|
||||
audio_config->set_single_utterance(true);
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::startStream checking OutputAudioConfig custom parameters: speaking rate %f,"
|
||||
" pitch %f, volume %f, voice name '%s' gender '%s', effects '%s'\n", m_speakingRate,
|
||||
m_pitch, m_volume, m_voiceName.c_str(), m_voiceGender.c_str(), m_effects.c_str());
|
||||
if (isAnyOutputAudioConfigChanged()) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "GStreamer::startStream adding a custom OutputAudioConfig to the request since at"
|
||||
" least one parameter was received.");
|
||||
auto* outputAudioConfig = m_request->mutable_output_audio_config();
|
||||
outputAudioConfig->set_sample_rate_hertz(16000);
|
||||
outputAudioConfig->set_audio_encoding(OutputAudioEncoding::OUTPUT_AUDIO_ENCODING_LINEAR_16);
|
||||
|
||||
auto* synthesizeSpeechConfig = outputAudioConfig->mutable_synthesize_speech_config();
|
||||
if (m_speakingRate) synthesizeSpeechConfig->set_speaking_rate(m_speakingRate);
|
||||
if (m_pitch) synthesizeSpeechConfig->set_pitch(m_pitch);
|
||||
if (m_volume) synthesizeSpeechConfig->set_volume_gain_db(m_volume);
|
||||
if (!m_effects.empty()) synthesizeSpeechConfig->add_effects_profile_id(m_effects);
|
||||
|
||||
auto* voice = synthesizeSpeechConfig->mutable_voice();
|
||||
if (!m_voiceName.empty()) voice->set_name(m_voiceName);
|
||||
if (!m_voiceGender.empty()) {
|
||||
SsmlVoiceGender gender = SsmlVoiceGender::SSML_VOICE_GENDER_UNSPECIFIED;
|
||||
switch (toupper(m_voiceGender[0]))
|
||||
{
|
||||
case 'F': gender = SsmlVoiceGender::SSML_VOICE_GENDER_MALE; break;
|
||||
case 'M': gender = SsmlVoiceGender::SSML_VOICE_GENDER_FEMALE; break;
|
||||
case 'N': gender = SsmlVoiceGender::SSML_VOICE_GENDER_NEUTRAL; break;
|
||||
}
|
||||
voice->set_ssml_gender(gender);
|
||||
}
|
||||
} else {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "GStreamer::startStream no custom parameters for OutputAudioConfig, keeping default");
|
||||
}
|
||||
|
||||
if (m_sentimentAnalysis) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "GStreamer::startStream received sentiment analysis flag as true, adding as query param");
|
||||
auto* queryParameters = m_request->mutable_query_params();
|
||||
auto* sentimentAnalysisConfig = queryParameters->mutable_sentiment_analysis_request_config();
|
||||
sentimentAnalysisConfig->set_analyze_query_text_sentiment(m_sentimentAnalysis);
|
||||
}
|
||||
|
||||
m_streamer = m_stub->StreamingDetectIntent(m_context.get());
|
||||
m_streamer->Write(*m_request);
|
||||
}
|
||||
bool write(void* data, uint32_t datalen) {
|
||||
if (m_finished) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::write not writing because we are finished, %p\n", this);
|
||||
return false;
|
||||
}
|
||||
|
||||
m_request->clear_query_input();
|
||||
m_request->clear_query_params();
|
||||
m_request->set_input_audio(data, datalen);
|
||||
|
||||
m_packets++;
|
||||
return m_streamer->Write(*m_request);
|
||||
|
||||
}
|
||||
bool read(StreamingDetectIntentResponse* response) {
|
||||
return m_streamer->Read(response);
|
||||
}
|
||||
grpc::Status finish() {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::finish %p\n", this);
|
||||
if (m_finished) {
|
||||
grpc::Status ok;
|
||||
return ok;
|
||||
}
|
||||
m_finished = true;
|
||||
return m_streamer->Finish();
|
||||
}
|
||||
void writesDone() {
|
||||
m_streamer->WritesDone();
|
||||
}
|
||||
|
||||
bool isFinished() {
|
||||
return m_finished;
|
||||
}
|
||||
|
||||
bool isAnyOutputAudioConfigChanged() {
|
||||
return m_speakingRate|| m_pitch || m_volume || !m_voiceName.empty() || !m_voiceGender.empty() || !m_effects.empty();
|
||||
}
|
||||
|
||||
private:
|
||||
std::string m_sessionId;
|
||||
std::shared_ptr<grpc::ClientContext> m_context;
|
||||
std::shared_ptr<grpc::Channel> m_channel;
|
||||
std::unique_ptr<Sessions::Stub> m_stub;
|
||||
std::unique_ptr< grpc::ClientReaderWriterInterface<StreamingDetectIntentRequest, StreamingDetectIntentResponse> > m_streamer;
|
||||
std::shared_ptr<StreamingDetectIntentRequest> m_request;
|
||||
std::string m_lang;
|
||||
std::string m_projectId;
|
||||
std::string m_environment;
|
||||
std::string m_regionId;
|
||||
double m_speakingRate;
|
||||
double m_pitch;
|
||||
double m_volume;
|
||||
std::string m_effects;
|
||||
std::string m_voiceName;
|
||||
std::string m_voiceGender;
|
||||
bool m_sentimentAnalysis;
|
||||
bool m_finished;
|
||||
uint32_t m_packets;
|
||||
};
|
||||
|
||||
static void killcb(struct cap_cb* cb) {
|
||||
if (cb) {
|
||||
if (cb->streamer) {
|
||||
GStreamer* p = (GStreamer *) cb->streamer;
|
||||
delete p;
|
||||
cb->streamer = NULL;
|
||||
}
|
||||
if (cb->resampler) {
|
||||
speex_resampler_destroy(cb->resampler);
|
||||
cb->resampler = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void *SWITCH_THREAD_FUNC grpc_read_thread(switch_thread_t *thread, void *obj) {
|
||||
struct cap_cb *cb = (struct cap_cb *) obj;
|
||||
GStreamer* streamer = (GStreamer *) cb->streamer;
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: starting cb %p\n", (void *) cb);
|
||||
|
||||
// Our contract: while we are reading, cb and cb->streamer will not be deleted
|
||||
|
||||
// Read responses until there are no more
|
||||
StreamingDetectIntentResponse response;
|
||||
while (streamer->read(&response)) {
|
||||
switch_core_session_t* psession = switch_core_session_locate(cb->sessionId);
|
||||
if (psession) {
|
||||
switch_channel_t* channel = switch_core_session_get_channel(psession);
|
||||
GRPCParser parser(psession);
|
||||
|
||||
if (response.has_query_result() || response.has_recognition_result()) {
|
||||
cJSON* jResponse = parser.parse(response) ;
|
||||
char* json = cJSON_PrintUnformatted(jResponse);
|
||||
const char* type = DIALOGFLOW_EVENT_TRANSCRIPTION;
|
||||
|
||||
if (response.has_query_result()) type = DIALOGFLOW_EVENT_INTENT;
|
||||
else {
|
||||
const StreamingRecognitionResult_MessageType& o = response.recognition_result().message_type();
|
||||
if (0 == StreamingRecognitionResult_MessageType_Name(o).compare("END_OF_SINGLE_UTTERANCE")) {
|
||||
type = DIALOGFLOW_EVENT_END_OF_UTTERANCE;
|
||||
}
|
||||
}
|
||||
|
||||
cb->responseHandler(psession, type, json);
|
||||
|
||||
free(json);
|
||||
cJSON_Delete(jResponse);
|
||||
}
|
||||
|
||||
const std::string& audio = parser.parseAudio(response);
|
||||
bool playAudio = !audio.empty() ;
|
||||
|
||||
// save audio
|
||||
if (playAudio) {
|
||||
std::ostringstream s;
|
||||
s << SWITCH_GLOBAL_dirs.temp_dir << SWITCH_PATH_SEPARATOR <<
|
||||
cb->sessionId << "_" << ++playCount;
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "grpc_read_thread: received audio to play\n");
|
||||
|
||||
if (response.has_output_audio_config()) {
|
||||
const OutputAudioConfig& cfg = response.output_audio_config();
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "grpc_read_thread: encoding is %d\n", cfg.audio_encoding());
|
||||
if (cfg.audio_encoding() == OutputAudioEncoding::OUTPUT_AUDIO_ENCODING_MP3) {
|
||||
s << ".mp3";
|
||||
}
|
||||
else if (cfg.audio_encoding() == OutputAudioEncoding::OUTPUT_AUDIO_ENCODING_OGG_OPUS) {
|
||||
s << ".opus";
|
||||
}
|
||||
else {
|
||||
s << ".wav";
|
||||
}
|
||||
}
|
||||
std::ofstream f(s.str(), std::ofstream::binary);
|
||||
f << audio;
|
||||
f.close();
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "grpc_read_thread: wrote audio to %s\n", s.str().c_str());
|
||||
|
||||
// add the file to the list of files played for this session,
|
||||
// we'll delete when session closes
|
||||
audioFiles.insert(std::pair<std::string, std::string>(cb->sessionId, s.str()));
|
||||
|
||||
cJSON * jResponse = cJSON_CreateObject();
|
||||
cJSON_AddItemToObject(jResponse, "path", cJSON_CreateString(s.str().c_str()));
|
||||
char* json = cJSON_PrintUnformatted(jResponse);
|
||||
|
||||
cb->responseHandler(psession, DIALOGFLOW_EVENT_AUDIO_PROVIDED, json);
|
||||
free(json);
|
||||
cJSON_Delete(jResponse);
|
||||
}
|
||||
switch_core_session_rwunlock(psession);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "dialogflow read loop is done\n");
|
||||
|
||||
// finish the detect intent session: here is where we may get an error if credentials are invalid
|
||||
switch_core_session_t* psession = switch_core_session_locate(cb->sessionId);
|
||||
if (psession) {
|
||||
grpc::Status status = streamer->finish();
|
||||
if (!status.ok()) {
|
||||
std::ostringstream s;
|
||||
s << "{\"msg\": \"" << status.error_message() << "\", \"code\": " << status.error_code();
|
||||
if (status.error_details().length() > 0) {
|
||||
s << ", \"details\": \"" << status.error_details() << "\"";
|
||||
}
|
||||
s << "}";
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "StreamingDetectIntentRequest finished with err %s (%d): %s\n",
|
||||
status.error_message().c_str(), status.error_code(), status.error_details().c_str());
|
||||
cb->errorHandler(psession, s.str().c_str());
|
||||
}
|
||||
|
||||
switch_core_session_rwunlock(psession);
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "dialogflow read thread exiting \n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
switch_status_t google_dialogflow_init() {
|
||||
const char* gcsServiceKeyFile = std::getenv("GOOGLE_APPLICATION_CREDENTIALS");
|
||||
if (NULL == gcsServiceKeyFile) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE,
|
||||
"\"GOOGLE_APPLICATION_CREDENTIALS\" environment variable is not set; authentication will use \"GOOGLE_APPLICATION_CREDENTIALS\" channel variable\n");
|
||||
}
|
||||
else {
|
||||
hasDefaultCredentials = true;
|
||||
}
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
switch_status_t google_dialogflow_cleanup() {
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// start dialogflow on a channel
|
||||
switch_status_t google_dialogflow_session_init(
|
||||
switch_core_session_t *session,
|
||||
responseHandler_t responseHandler,
|
||||
errorHandler_t errorHandler,
|
||||
uint32_t samples_per_second,
|
||||
char* lang,
|
||||
char* projectId,
|
||||
char* event,
|
||||
char* text,
|
||||
struct cap_cb **ppUserData
|
||||
) {
|
||||
switch_status_t status = SWITCH_STATUS_SUCCESS;
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
int err;
|
||||
switch_threadattr_t *thd_attr = NULL;
|
||||
switch_memory_pool_t *pool = switch_core_session_get_pool(session);
|
||||
struct cap_cb* cb = (struct cap_cb *) switch_core_session_alloc(session, sizeof(*cb));
|
||||
|
||||
if (!hasDefaultCredentials && !switch_channel_get_variable(channel, "GOOGLE_APPLICATION_CREDENTIALS")) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
|
||||
"missing credentials: GOOGLE_APPLICATION_CREDENTIALS must be suuplied either as an env variable (path to file) or a channel variable (json string)\n");
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
|
||||
strncpy(cb->sessionId, switch_core_session_get_uuid(session), 256);
|
||||
cb->responseHandler = responseHandler;
|
||||
cb->errorHandler = errorHandler;
|
||||
|
||||
if (switch_mutex_init(&cb->mutex, SWITCH_MUTEX_NESTED, pool) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing mutex\n");
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
|
||||
strncpy(cb->lang, lang, MAX_LANG);
|
||||
strncpy(cb->projectId, lang, MAX_PROJECT_ID);
|
||||
cb->streamer = new GStreamer(session, lang, projectId, event, text);
|
||||
cb->resampler = speex_resampler_init(1, 8000, 16000, SWITCH_RESAMPLE_QUALITY, &err);
|
||||
if (0 != err) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing resampler: %s.\n",
|
||||
switch_channel_get_name(channel), speex_resampler_strerror(err));
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
|
||||
// hangup hook to clear temp audio files
|
||||
switch_core_event_hook_add_state_change(session, hanguphook);
|
||||
|
||||
// create the read thread
|
||||
switch_threadattr_create(&thd_attr, pool);
|
||||
//switch_threadattr_detach_set(thd_attr, 1);
|
||||
switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE);
|
||||
switch_thread_create(&cb->thread, thd_attr, grpc_read_thread, cb, pool);
|
||||
|
||||
*ppUserData = cb;
|
||||
|
||||
done:
|
||||
if (status != SWITCH_STATUS_SUCCESS) {
|
||||
killcb(cb);
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
switch_status_t google_dialogflow_session_stop(switch_core_session_t *session, int channelIsClosing) {
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, MY_BUG_NAME);
|
||||
|
||||
if (bug) {
|
||||
struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);
|
||||
switch_status_t st;
|
||||
|
||||
// close connection and get final responses
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_dialogflow_session_cleanup: acquiring lock\n");
|
||||
switch_mutex_lock(cb->mutex);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_dialogflow_session_cleanup: acquired lock\n");
|
||||
GStreamer* streamer = (GStreamer *) cb->streamer;
|
||||
if (streamer) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_dialogflow_session_cleanup: sending writesDone..\n");
|
||||
streamer->writesDone();
|
||||
streamer->finish();
|
||||
}
|
||||
if (cb->thread) {
|
||||
switch_status_t retval;
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "google_dialogflow_session_cleanup: waiting for read thread to complete\n");
|
||||
switch_thread_join(&retval, cb->thread);
|
||||
cb->thread = NULL;
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "google_dialogflow_session_cleanup: read thread completed\n");
|
||||
}
|
||||
killcb(cb);
|
||||
|
||||
switch_channel_set_private(channel, MY_BUG_NAME, NULL);
|
||||
if (!channelIsClosing) switch_core_media_bug_remove(session, &bug);
|
||||
|
||||
switch_mutex_unlock(cb->mutex);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "google_dialogflow_session_cleanup: Closed google session\n");
|
||||
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
|
||||
return SWITCH_STATUS_FALSE;
|
||||
}
|
||||
|
||||
/**
 * Media-bug audio callback: drain the bug's read buffer, resample each
 * frame to 16 kHz, and forward it to the gRPC stream.
 *
 * trylock (never a blocking lock) is used so the media thread is never
 * stalled by teardown in google_dialogflow_session_stop; a missed frame
 * is preferable to blocking the media path. Fixes vs. prior revision:
 * removed unused locals (written, session), a redundant cast, and dead
 * commented-out log statements.
 */
switch_bool_t google_dialogflow_frame(switch_media_bug_t *bug, void* user_data) {
  uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
  switch_frame_t frame = {};
  struct cap_cb *cb = (struct cap_cb *) user_data;

  frame.data = data;
  frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;

  if (switch_mutex_trylock(cb->mutex) == SWITCH_STATUS_SUCCESS) {
    GStreamer* streamer = (GStreamer *) cb->streamer;
    if (streamer && !streamer->isFinished()) {
      while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS && !switch_test_flag((&frame), SFF_CNG)) {
        if (frame.datalen) {
          spx_int16_t out[SWITCH_RECOMMENDED_BUFFER_SIZE];
          spx_uint32_t out_len = SWITCH_RECOMMENDED_BUFFER_SIZE;
          spx_uint32_t in_len = frame.samples;

          speex_resampler_process_interleaved_int(cb->resampler, (const spx_int16_t *) frame.data, &in_len, &out[0], &out_len);
          streamer->write(&out[0], sizeof(spx_int16_t) * out_len);
        }
      }
    }
    // else: the google side has been closed; silently drop this audio
    switch_mutex_unlock(cb->mutex);
  }
  // else: teardown holds the mutex; skip this frame rather than block

  return SWITCH_TRUE;
}
void destroyChannelUserData(struct cap_cb* cb) {
|
||||
killcb(cb);
|
||||
}
|
||||
|
||||
}
|
||||
12
mod_dialogflow/google_glue.h
Normal file
12
mod_dialogflow/google_glue.h
Normal file
@@ -0,0 +1,12 @@
|
||||
#ifndef __GOOGLE_GLUE_H__
|
||||
#define __GOOGLE_GLUE_H__
|
||||
|
||||
switch_status_t google_dialogflow_init();
|
||||
switch_status_t google_dialogflow_cleanup();
|
||||
switch_status_t google_dialogflow_session_init(switch_core_session_t *session, responseHandler_t responseHandler, errorHandler_t errorHandler,
|
||||
uint32_t samples_per_second, char* lang, char* projectId, char* welcomeEvent, char *text, struct cap_cb **cb);
|
||||
switch_status_t google_dialogflow_session_stop(switch_core_session_t *session, int channelIsClosing);
|
||||
switch_bool_t google_dialogflow_frame(switch_media_bug_t *bug, void* user_data);
|
||||
|
||||
void destroyChannelUserData(struct cap_cb* cb);
|
||||
#endif
|
||||
293
mod_dialogflow/mod_dialogflow.c
Normal file
293
mod_dialogflow/mod_dialogflow.c
Normal file
@@ -0,0 +1,293 @@
|
||||
/*
|
||||
*
|
||||
* mod_dialogflow.c -- Freeswitch module for running a google dialogflow
|
||||
*
|
||||
*/
|
||||
#include "mod_dialogflow.h"
|
||||
#include "google_glue.h"
|
||||
|
||||
#define DEFAULT_INTENT_TIMEOUT_SECS (30)
|
||||
#define DIALOGFLOW_INTENT "dialogflow_intent"
|
||||
#define DIALOGFLOW_INTENT_AUDIO_FILE "dialogflow_intent_audio_file"
|
||||
|
||||
/* Prototypes */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_dialogflow_shutdown);
|
||||
SWITCH_MODULE_RUNTIME_FUNCTION(mod_dialogflow_runtime);
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_dialogflow_load);
|
||||
|
||||
SWITCH_MODULE_DEFINITION(mod_dialogflow, mod_dialogflow_load, mod_dialogflow_shutdown, NULL);
|
||||
|
||||
static switch_status_t do_stop(switch_core_session_t *session);
|
||||
|
||||
static void responseHandler(switch_core_session_t* session, const char * type, char * json) {
|
||||
switch_event_t *event;
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "json payload for type %s: %s.\n", type, json);
|
||||
|
||||
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, type);
|
||||
switch_channel_event_set_data(channel, event);
|
||||
switch_event_add_body(event, "%s", json);
|
||||
switch_event_fire(&event);
|
||||
}
|
||||
static void errorHandler(switch_core_session_t* session, const char * json) {
|
||||
switch_event_t *event;
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
|
||||
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, DIALOGFLOW_EVENT_ERROR);
|
||||
switch_channel_event_set_data(channel, event);
|
||||
switch_event_add_body(event, "%s", json);
|
||||
|
||||
switch_event_fire(&event);
|
||||
|
||||
do_stop(session);
|
||||
}
|
||||
|
||||
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
|
||||
{
|
||||
switch_core_session_t *session = switch_core_media_bug_get_session(bug);
|
||||
|
||||
switch (type) {
|
||||
case SWITCH_ABC_TYPE_INIT:
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Got SWITCH_ABC_TYPE_INIT.\n");
|
||||
break;
|
||||
|
||||
case SWITCH_ABC_TYPE_CLOSE:
|
||||
{
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Got SWITCH_ABC_TYPE_CLOSE.\n");
|
||||
|
||||
google_dialogflow_session_stop(session, 1);
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Finished SWITCH_ABC_TYPE_CLOSE.\n");
|
||||
}
|
||||
break;
|
||||
|
||||
case SWITCH_ABC_TYPE_READ:
|
||||
|
||||
return google_dialogflow_frame(bug, user_data);
|
||||
break;
|
||||
|
||||
case SWITCH_ABC_TYPE_WRITE:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return SWITCH_TRUE;
|
||||
}
|
||||
|
||||
static switch_status_t start_capture(switch_core_session_t *session, switch_media_bug_flag_t flags, char* lang, char*projectId, char* event, char* text)
|
||||
{
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
switch_media_bug_t *bug;
|
||||
switch_codec_implementation_t read_impl = { 0 };
|
||||
struct cap_cb *cb = NULL;
|
||||
switch_status_t status = SWITCH_STATUS_SUCCESS;
|
||||
|
||||
if (switch_channel_get_private(channel, MY_BUG_NAME)) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "a dialogflow is already running on this channel, we will stop it.\n");
|
||||
do_stop(session);
|
||||
}
|
||||
|
||||
if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "channel must have at least early media to run dialogflow.\n");
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "starting dialogflow with project %s, language %s, event %s, text %s.\n",
|
||||
projectId, lang, event, text);
|
||||
|
||||
switch_core_session_get_read_impl(session, &read_impl);
|
||||
if (SWITCH_STATUS_FALSE == google_dialogflow_session_init(session, responseHandler, errorHandler,
|
||||
read_impl.samples_per_second, lang, projectId, event, text, &cb)) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing google dialogflow session.\n");
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if ((status = switch_core_media_bug_add(session, "dialogflow", NULL, capture_callback, (void *) cb, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error adding bug.\n");
|
||||
status = SWITCH_STATUS_FALSE;
|
||||
goto done;
|
||||
}
|
||||
switch_channel_set_private(channel, MY_BUG_NAME, bug);
|
||||
|
||||
done:
|
||||
if (status == SWITCH_STATUS_FALSE) {
|
||||
if (cb) destroyChannelUserData(cb);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
/* Stop the dialogflow session on this channel, if one is running.
 *
 * Looks up the media bug stored as channel private data; when present,
 * asks the dialogflow glue layer to shut the grpc stream down.
 * Returns the status of google_dialogflow_session_stop, or
 * SWITCH_STATUS_SUCCESS when nothing was running. */
static switch_status_t do_stop(switch_core_session_t *session)
{
	switch_status_t status = SWITCH_STATUS_SUCCESS;
	switch_channel_t *channel = switch_core_session_get_channel(session);
	switch_media_bug_t *bug = switch_channel_get_private(channel, MY_BUG_NAME);

	if (bug) {
		/* bug fix: log message previously read "user command command" */
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Received user command to stop dialogflow.\n");
		status = google_dialogflow_session_stop(session, 0);
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "stopped dialogflow.\n");
	}

	return status;
}
|
||||
|
||||
#define DIALOGFLOW_API_START_SYNTAX "<uuid> project-id lang-code [event]"
|
||||
SWITCH_STANDARD_API(dialogflow_api_start_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[10] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
switch_media_bug_flag_t flags = SMBF_READ_STREAM | SMBF_READ_STREAM | SMBF_READ_PING;
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "command %s\n", cmd);
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) || argc < 3) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", DIALOGFLOW_API_START_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
char *event = NULL;
|
||||
char *text = NULL;
|
||||
char *projectId = argv[1];
|
||||
char *lang = argv[2];
|
||||
if (argc > 3) {
|
||||
event = argv[3];
|
||||
}
|
||||
if (argc > 4) {
|
||||
if (0 == strcmp("none", event)) {
|
||||
event = NULL;
|
||||
}
|
||||
text = argv[4];
|
||||
}
|
||||
status = start_capture(lsession, flags, lang, projectId, event, text);
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
#define DIALOGFLOW_API_STOP_SYNTAX "<uuid>"
|
||||
SWITCH_STANDARD_API(dialogflow_api_stop_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[10] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "command %s\n", cmd);
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) || argc != 1) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", DIALOGFLOW_API_STOP_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
status = do_stop(lsession);
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* Macro expands to: switch_status_t mod_dialogflow_load(switch_loadable_module_interface_t **module_interface, switch_memory_pool_t *pool) */
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_dialogflow_load)
|
||||
{
|
||||
switch_api_interface_t *api_interface;
|
||||
|
||||
/* create/register custom event message types */
|
||||
if (switch_event_reserve_subclass(DIALOGFLOW_EVENT_INTENT) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", DIALOGFLOW_EVENT_INTENT);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(DIALOGFLOW_EVENT_TRANSCRIPTION) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", DIALOGFLOW_EVENT_TRANSCRIPTION);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(DIALOGFLOW_EVENT_END_OF_UTTERANCE) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", DIALOGFLOW_EVENT_END_OF_UTTERANCE);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(DIALOGFLOW_EVENT_AUDIO_PROVIDED) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", DIALOGFLOW_EVENT_AUDIO_PROVIDED);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
|
||||
if (switch_event_reserve_subclass(DIALOGFLOW_EVENT_ERROR) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", DIALOGFLOW_EVENT_ERROR);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
|
||||
|
||||
/* connect my internal structure to the blank pointer passed to me */
|
||||
*module_interface = switch_loadable_module_create_module_interface(pool, modname);
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Google Dialogflow API loading..\n");
|
||||
|
||||
if (SWITCH_STATUS_FALSE == google_dialogflow_init()) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Failed initializing google dialogflow interface\n");
|
||||
}
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Google Dialogflow API successfully loaded\n");
|
||||
|
||||
SWITCH_ADD_API(api_interface, "dialogflow_start", "Start a google dialogflow", dialogflow_api_start_function, DIALOGFLOW_API_START_SYNTAX);
|
||||
SWITCH_ADD_API(api_interface, "dialogflow_stop", "Terminate a google dialogflow", dialogflow_api_stop_function, DIALOGFLOW_API_STOP_SYNTAX);
|
||||
|
||||
switch_console_set_complete("add dialogflow_stop");
|
||||
switch_console_set_complete("add dialogflow_start project lang");
|
||||
switch_console_set_complete("add dialogflow_start project lang timeout-secs");
|
||||
switch_console_set_complete("add dialogflow_start project lang timeout-secs event");
|
||||
|
||||
/* indicate that the module should continue to be loaded */
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Called when the system shuts down
|
||||
Macro expands to: switch_status_t mod_dialogflow_shutdown() */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_dialogflow_shutdown)
|
||||
{
|
||||
google_dialogflow_cleanup();
|
||||
|
||||
switch_event_free_subclass(DIALOGFLOW_EVENT_INTENT);
|
||||
switch_event_free_subclass(DIALOGFLOW_EVENT_TRANSCRIPTION);
|
||||
switch_event_free_subclass(DIALOGFLOW_EVENT_END_OF_UTTERANCE);
|
||||
switch_event_free_subclass(DIALOGFLOW_EVENT_AUDIO_PROVIDED);
|
||||
switch_event_free_subclass(DIALOGFLOW_EVENT_ERROR);
|
||||
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
37
mod_dialogflow/mod_dialogflow.h
Normal file
37
mod_dialogflow/mod_dialogflow.h
Normal file
@@ -0,0 +1,37 @@
|
||||
#ifndef __MOD_DIALOGFLOW_H__
#define __MOD_DIALOGFLOW_H__

#include <switch.h>
#include <speex/speex_resampler.h>

#include <unistd.h>

/* key under which the media bug is stored as channel private data */
#define MY_BUG_NAME "__dialogflow_bug__"
/* custom event subclasses fired by this module */
#define DIALOGFLOW_EVENT_INTENT "dialogflow::intent"
#define DIALOGFLOW_EVENT_TRANSCRIPTION "dialogflow::transcription"
#define DIALOGFLOW_EVENT_AUDIO_PROVIDED "dialogflow::audio_provided"
#define DIALOGFLOW_EVENT_END_OF_UTTERANCE "dialogflow::end_of_utterance"
#define DIALOGFLOW_EVENT_ERROR "dialogflow::error"

/* buffer size limits for per-channel data fields */
#define MAX_LANG (12)
#define MAX_PROJECT_ID (128)
#define MAX_PATHLEN (256)

/* per-channel data */
/* responseHandler_t: invoked with a dialogflow event type and its JSON payload */
typedef void (*responseHandler_t)(switch_core_session_t* session, const char * type, char* json);
/* errorHandler_t: invoked when the grpc stream fails */
typedef void (*errorHandler_t)(switch_core_session_t* session, const char * reason);

struct cap_cb {
	switch_mutex_t *mutex;              /* guards access to this struct */
	char sessionId[256];                /* uuid of the owning session */
	SpeexResamplerState *resampler;     /* resamples channel audio for dialogflow */
	void* streamer;                     /* opaque grpc streamer handle — presumably owned by the glue layer; confirm there */
	responseHandler_t responseHandler;
	errorHandler_t errorHandler;
	switch_thread_t* thread;            /* thread servicing the grpc stream */
	char lang[MAX_LANG];
	char projectId[MAX_PROJECT_ID];

};

#endif
|
||||
567
mod_dialogflow/parser.cpp
Normal file
567
mod_dialogflow/parser.cpp
Normal file
@@ -0,0 +1,567 @@
|
||||
#include "parser.h"
|
||||
#include <switch.h>
|
||||
|
||||
/* Convert a protobuf repeated field into a cJSON array, parsing each element. */
template <typename T> cJSON* GRPCParser::parseCollection(const RepeatedPtrField<T> coll) {
	cJSON* arr = cJSON_CreateArray();
	for (const T& item : coll) {
		cJSON_AddItemToArray(arr, parse(item));
	}
	return arr;
}
|
||||
|
||||
/* Return the raw synthesized audio payload carried by the response
 * (reference into the protobuf; valid only while the response lives). */
const std::string& GRPCParser::parseAudio(const StreamingDetectIntentResponse& response) {
	return response.output_audio();
}
|
||||
|
||||
/* Top-level conversion: StreamingDetectIntentResponse -> JSON object.
 * Optional sub-messages are emitted only when present on the protobuf.
 * Caller owns the returned cJSON tree. */
cJSON* GRPCParser::parse(const StreamingDetectIntentResponse& response) {
	cJSON * json = cJSON_CreateObject();

	/* bug fix: log messages said "GStrGRPCParser" (copy-paste from another module) */
	switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_INFO, "GRPCParser - parsing StreamingDetectIntentResponse\n");

	// response_id
	cJSON_AddItemToObject(json, "response_id",cJSON_CreateString(response.response_id().c_str()));

	// recognition_result
	if (response.has_recognition_result()) {
		switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_INFO, "GRPCParser - adding recognition result\n");
		cJSON_AddItemToObject(json, "recognition_result", parse(response.recognition_result()));
	}

	// query_result
	if (response.has_query_result()) {
		switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_INFO, "GRPCParser - adding query result\n");
		cJSON_AddItemToObject(json, "query_result", parse(response.query_result()));
	}

	// alternative_query_results
	cJSON_AddItemToObject(json, "alternative_query_results", parseCollection(response.alternative_query_results()));

	// webhook_status
	cJSON_AddItemToObject(json, "webhook_status", parse(response.webhook_status()));

	// output_audio_config
	if (response.has_output_audio_config()) {
		switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_INFO, "GRPCParser - adding audio config\n");
		cJSON_AddItemToObject(json, "output_audio_config", parse(response.output_audio_config()));
	}

	// XXXX: not doing anything with output_audio for the moment

	return json;
}
|
||||
|
||||
/* OutputAudioEncoding enum -> its protobuf name as a JSON string */
cJSON* GRPCParser::parse(const OutputAudioEncoding& o) {
	return cJSON_CreateString(OutputAudioEncoding_Name(o).c_str());
}

/* OutputAudioConfig -> JSON (encoding, sample rate, synthesis config) */
cJSON* GRPCParser::parse(const OutputAudioConfig& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "audio_encoding", parse(o.audio_encoding()));
	cJSON_AddItemToObject(json, "sample_rate_hertz", cJSON_CreateNumber(o.sample_rate_hertz()));
	cJSON_AddItemToObject(json, "synthesize_speech_config", parse(o.synthesize_speech_config()));

	return json;
}

/* SynthesizeSpeechConfig -> JSON (rate, pitch, gain, effects, voice) */
cJSON* GRPCParser::parse(const SynthesizeSpeechConfig& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "speaking_rate", cJSON_CreateNumber(o.speaking_rate()));
	cJSON_AddItemToObject(json, "pitch", cJSON_CreateNumber(o.pitch()));
	cJSON_AddItemToObject(json, "volume_gain_db", cJSON_CreateNumber(o.volume_gain_db()));
	cJSON_AddItemToObject(json, "effects_profile_id", parseCollection(o.effects_profile_id()));
	cJSON_AddItemToObject(json, "voice", parse(o.voice()));

	return json;
}

/* SsmlVoiceGender enum -> name string */
cJSON* GRPCParser::parse(const SsmlVoiceGender& o) {
	return cJSON_CreateString(SsmlVoiceGender_Name(o).c_str());
}

/* VoiceSelectionParams -> JSON (voice name + gender) */
cJSON* GRPCParser::parse(const VoiceSelectionParams& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "name", cJSON_CreateString(o.name().c_str()));
	cJSON_AddItemToObject(json, "ssml_gender", parse(o.ssml_gender()));

	return json;
}

/* google.rpc.Status -> JSON (numeric code + message text) */
cJSON* GRPCParser::parse(const google::rpc::Status& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "code", cJSON_CreateNumber(o.code()));
	cJSON_AddItemToObject(json, "message", cJSON_CreateString(o.message().c_str()));

	return json;
}
|
||||
|
||||
/* protobuf Value -> the corresponding cJSON node.
 * Structs and lists recurse; scalar kinds map directly.
 * Always returns a non-NULL node (callers insert the result into
 * cJSON containers, which do not tolerate NULL items). */
cJSON* GRPCParser::parse(const Value& value) {
	cJSON* json = NULL;

	switch (value.kind_case()) {
		case Value::KindCase::kNullValue:
			json = cJSON_CreateNull();
			break;

		case Value::KindCase::kNumberValue:
			json = cJSON_CreateNumber(value.number_value());
			break;

		case Value::KindCase::kStringValue:
			json = cJSON_CreateString(value.string_value().c_str());
			break;

		case Value::KindCase::kBoolValue:
			json = cJSON_CreateBool(value.bool_value());
			break;

		case Value::KindCase::kStructValue:
			json = parse(value.struct_value());
			break;

		case Value::KindCase::kListValue:
		{
			const ListValue& list = value.list_value();
			json = cJSON_CreateArray();
			for (int i = 0; i < list.values_size(); i++) {
				const Value& val = list.values(i);
				cJSON_AddItemToArray(json, parse(val));
			}
		}
			break;

		default:
			/* bug fix: KIND_NOT_SET previously fell through and returned NULL,
			 * which callers then handed to cJSON_AddItemToObject/Array
			 * (a NULL dereference in the cJSON used by switch_json);
			 * emit an explicit JSON null instead */
			json = cJSON_CreateNull();
			break;
	}

	return json;
}
|
||||
|
||||
/* protobuf Struct -> JSON object; each field value is parsed recursively. */
cJSON* GRPCParser::parse(const Struct& rpcStruct) {
	cJSON* obj = cJSON_CreateObject();
	for (const auto& field : rpcStruct.fields()) {
		cJSON_AddItemToObject(obj, field.first.c_str(), parse(field.second));
	}
	return obj;
}
|
||||
|
||||
/* Intent.Message.SimpleResponse -> JSON (ssml / text_to_speech / display_text) */
cJSON* GRPCParser::parse(const Intent_Message_SimpleResponse& o) {
	cJSON * json = cJSON_CreateObject();
	cJSON_AddItemToObject(json, "ssml", cJSON_CreateString(o.ssml().c_str()));
	cJSON_AddItemToObject(json, "text_to_speech", cJSON_CreateString(o.text_to_speech().c_str()));
	cJSON_AddItemToObject(json, "display_text", cJSON_CreateString(o.display_text().c_str()));
	return json;
}

/* Intent.Message.SimpleResponses -> JSON wrapper around the response list */
cJSON* GRPCParser::parse(const Intent_Message_SimpleResponses& o) {
	cJSON * json = cJSON_CreateObject();
	cJSON_AddItemToObject(json, "simple_responses", parseCollection(o.simple_responses()));
	return json;
}

/* Intent.Message.Image -> JSON (accessibility text + uri) */
cJSON* GRPCParser::parse(const Intent_Message_Image& o) {
	cJSON * json = cJSON_CreateObject();
	cJSON_AddItemToObject(json, "accessibility_text", cJSON_CreateString(o.accessibility_text().c_str()));
	cJSON_AddItemToObject(json, "image_uri", cJSON_CreateString(o.image_uri().c_str()));
	return json;
}

/* BasicCard.Button.OpenUriAction -> JSON (uri) */
cJSON* GRPCParser::parse(const Intent_Message_BasicCard_Button_OpenUriAction& o) {
	cJSON * json = cJSON_CreateObject();
	cJSON_AddItemToObject(json, "uri", cJSON_CreateString(o.uri().c_str()));
	return json;
}

/* BasicCard.Button -> JSON (title + open-uri action) */
cJSON* GRPCParser::parse(const Intent_Message_BasicCard_Button& o) {
	cJSON * json = cJSON_CreateObject();
	cJSON_AddItemToObject(json, "title", cJSON_CreateString(o.title().c_str()));
	cJSON_AddItemToObject(json, "open_uri_action", parse(o.open_uri_action()));
	return json;
}

/* Card.Button -> JSON (text + postback payload) */
cJSON* GRPCParser::parse(const Intent_Message_Card_Button& o) {
	cJSON * json = cJSON_CreateObject();
	cJSON_AddItemToObject(json, "text", cJSON_CreateString(o.text().c_str()));
	cJSON_AddItemToObject(json, "postback", parse(o.postback()));
	return json;
}

/* BasicCard -> JSON (title, subtitle, formatted text, image, buttons) */
cJSON* GRPCParser::parse(const Intent_Message_BasicCard& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "title", cJSON_CreateString(o.title().c_str()));
	cJSON_AddItemToObject(json, "subtitle", cJSON_CreateString(o.subtitle().c_str()));
	cJSON_AddItemToObject(json, "formatted_text", cJSON_CreateString(o.formatted_text().c_str()));
	cJSON_AddItemToObject(json, "image", parse(o.image()));
	cJSON_AddItemToObject(json, "buttons", parseCollection(o.buttons()));

	return json;
}
|
||||
|
||||
/* Card -> JSON (title, subtitle, image uri, buttons) */
cJSON* GRPCParser::parse(const Intent_Message_Card& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "title", cJSON_CreateString(o.title().c_str()));
	cJSON_AddItemToObject(json, "subtitle", cJSON_CreateString(o.subtitle().c_str()));
	cJSON_AddItemToObject(json, "image_uri", cJSON_CreateString(o.image_uri().c_str()));
	cJSON_AddItemToObject(json, "buttons", parseCollection(o.buttons()));

	return json;
}

/* Suggestion -> JSON (title only) */
cJSON* GRPCParser::parse(const Intent_Message_Suggestion& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "title", cJSON_CreateString(o.title().c_str()));

	return json;
}

/* Suggestions -> JSON wrapper around the suggestion list */
cJSON* GRPCParser::parse(const Intent_Message_Suggestions& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "suggestions", parseCollection(o.suggestions()));

	return json;
}

/* std::string -> JSON string; lets parseCollection handle repeated strings */
cJSON* GRPCParser::parse(const std::string& val) {
	return cJSON_CreateString(val.c_str());
}
/* LinkOutSuggestion -> JSON (destination name + uri) */
cJSON* GRPCParser::parse(const Intent_Message_LinkOutSuggestion& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "destination_name", cJSON_CreateString(o.destination_name().c_str()));
	cJSON_AddItemToObject(json, "uri", cJSON_CreateString(o.uri().c_str()));

	return json;
}
|
||||
|
||||
/* SelectItemInfo -> JSON (key + synonym list) */
cJSON* GRPCParser::parse(const Intent_Message_SelectItemInfo& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "key", cJSON_CreateString(o.key().c_str()));
	cJSON_AddItemToObject(json, "synonyms", parseCollection(o.synonyms()));

	return json;
}

/* ListSelect.Item -> JSON (info, title, description, image) */
cJSON* GRPCParser::parse(const Intent_Message_ListSelect_Item& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "info", parse(o.info()));
	cJSON_AddItemToObject(json, "title", cJSON_CreateString(o.title().c_str()));
	cJSON_AddItemToObject(json, "description", cJSON_CreateString(o.description().c_str()));
	cJSON_AddItemToObject(json, "image", parse(o.image()));

	return json;
}

/* CarouselSelect -> JSON wrapper around its item list */
cJSON* GRPCParser::parse(const Intent_Message_CarouselSelect& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "items", parseCollection(o.items()));

	return json;
}

/* CarouselSelect.Item -> JSON (info, title, description, image) */
cJSON* GRPCParser::parse(const Intent_Message_CarouselSelect_Item& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "info", parse(o.info()));
	cJSON_AddItemToObject(json, "title", cJSON_CreateString(o.title().c_str()));
	cJSON_AddItemToObject(json, "description", cJSON_CreateString(o.description().c_str()));
	cJSON_AddItemToObject(json, "image", parse(o.image()));

	return json;
}

/* ListSelect -> JSON (title + item list) */
cJSON* GRPCParser::parse(const Intent_Message_ListSelect& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "title", cJSON_CreateString(o.title().c_str()));
	cJSON_AddItemToObject(json, "items", parseCollection(o.items()));

	return json;
}
|
||||
|
||||
/* TelephonyPlayAudio -> JSON (audio uri) */
cJSON* GRPCParser::parse(const Intent_Message_TelephonyPlayAudio& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "audio_uri", cJSON_CreateString(o.audio_uri().c_str()));

	return json;
}
/* TelephonySynthesizeSpeech -> JSON (text + ssml; protobuf sets one or the other) */
cJSON* GRPCParser::parse(const Intent_Message_TelephonySynthesizeSpeech& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "text", cJSON_CreateString(o.text().c_str()));
	cJSON_AddItemToObject(json, "ssml", cJSON_CreateString(o.ssml().c_str()));

	return json;
}

/* TelephonyTransferCall -> JSON (phone number to transfer to) */
cJSON* GRPCParser::parse(const Intent_Message_TelephonyTransferCall& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "phone_number", cJSON_CreateString(o.phone_number().c_str()));

	return json;
}

/* QuickReplies -> JSON (title + reply list) */
cJSON* GRPCParser::parse(const Intent_Message_QuickReplies& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "title", cJSON_CreateString(o.title().c_str()));
	cJSON_AddItemToObject(json, "quick_replies", parseCollection(o.quick_replies()));

	return json;
}

/* Intent.Message.Text -> JSON array of text alternatives */
cJSON* GRPCParser::parse(const Intent_Message_Text& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "text", parseCollection(o.text()));

	return json;
}
|
||||
|
||||
/* TrainingPhrase.Part -> JSON (text, entity type, alias, user-defined flag) */
cJSON* GRPCParser::parse(const Intent_TrainingPhrase_Part& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "text", cJSON_CreateString(o.text().c_str()));
	cJSON_AddItemToObject(json, "entity_type", cJSON_CreateString(o.entity_type().c_str()));
	cJSON_AddItemToObject(json, "alias", cJSON_CreateString(o.alias().c_str()));
	cJSON_AddItemToObject(json, "user", cJSON_CreateBool(o.user_defined()));

	return json;
}

/* Intent.WebhookState enum -> name string */
cJSON* GRPCParser::parse(const Intent_WebhookState& o) {
	return cJSON_CreateString(Intent_WebhookState_Name(o).c_str());
}

/* TrainingPhrase.Type enum -> name string */
cJSON* GRPCParser::parse(const Intent_TrainingPhrase_Type& o) {
	return cJSON_CreateString(Intent_TrainingPhrase_Type_Name(o).c_str());
}
|
||||
|
||||
|
||||
/* Intent.TrainingPhrase -> JSON (name, type, parts, usage count) */
cJSON* GRPCParser::parse(const Intent_TrainingPhrase& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "name", cJSON_CreateString(o.name().c_str()));
	cJSON_AddItemToObject(json, "type", parse(o.type()));
	cJSON_AddItemToObject(json, "parts", parseCollection(o.parts()));
	cJSON_AddItemToObject(json, "times_added_count", cJSON_CreateNumber(o.times_added_count()));

	return json;
}

/* Intent.Parameter -> JSON (names, values, entity type, prompts, flags) */
cJSON* GRPCParser::parse(const Intent_Parameter& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "name", cJSON_CreateString(o.name().c_str()));
	cJSON_AddItemToObject(json, "display_name", cJSON_CreateString(o.display_name().c_str()));
	cJSON_AddItemToObject(json, "value", cJSON_CreateString(o.value().c_str()));
	cJSON_AddItemToObject(json, "default_value", cJSON_CreateString(o.default_value().c_str()));
	cJSON_AddItemToObject(json, "entity_type_display_name", cJSON_CreateString(o.entity_type_display_name().c_str()));
	cJSON_AddItemToObject(json, "mandatory", cJSON_CreateBool(o.mandatory()));
	cJSON_AddItemToObject(json, "prompts", parseCollection(o.prompts()));
	cJSON_AddItemToObject(json, "is_list", cJSON_CreateBool(o.is_list()));

	return json;
}

/* Intent.FollowupIntentInfo -> JSON (followup + parent followup intent names) */
cJSON* GRPCParser::parse(const Intent_FollowupIntentInfo& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "followup_intent_name", cJSON_CreateString(o.followup_intent_name().c_str()));
	cJSON_AddItemToObject(json, "parent_followup_intent_name", cJSON_CreateString(o.parent_followup_intent_name().c_str()));

	return json;
}
|
||||
|
||||
/* Sentiment -> JSON (score + magnitude) */
cJSON* GRPCParser::parse(const Sentiment& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "score", cJSON_CreateNumber(o.score()));
	cJSON_AddItemToObject(json, "magnitude", cJSON_CreateNumber(o.magnitude()));

	return json;
}

/* SentimentAnalysisResult -> JSON wrapper around the query-text sentiment */
cJSON* GRPCParser::parse(const SentimentAnalysisResult& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "query_text_sentiment", parse(o.query_text_sentiment()));

	return json;
}

/* KnowledgeAnswers.Answer.MatchConfidenceLevel enum -> name string */
cJSON* GRPCParser::parse(const KnowledgeAnswers_Answer_MatchConfidenceLevel& o) {
	return cJSON_CreateString(KnowledgeAnswers_Answer_MatchConfidenceLevel_Name(o).c_str());
}

/* KnowledgeAnswers.Answer -> JSON (source, faq question, answer, confidence) */
cJSON* GRPCParser::parse(const KnowledgeAnswers_Answer& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "source", cJSON_CreateString(o.source().c_str()));
	cJSON_AddItemToObject(json, "faq_question", cJSON_CreateString(o.faq_question().c_str()));
	cJSON_AddItemToObject(json, "answer", cJSON_CreateString(o.answer().c_str()));
	cJSON_AddItemToObject(json, "match_confidence_level", parse(o.match_confidence_level()));
	cJSON_AddItemToObject(json, "match_confidence", cJSON_CreateNumber(o.match_confidence()));

	return json;
}

/* KnowledgeAnswers -> JSON wrapper around the answer list */
cJSON* GRPCParser::parse(const KnowledgeAnswers& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "answers", parseCollection(o.answers()));

	return json;
}
|
||||
|
||||
/* Intent -> JSON: full intent definition (identity, flags, contexts,
 * training phrases, parameters, messages, followup info). */
cJSON* GRPCParser::parse(const Intent& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "name", cJSON_CreateString(o.name().c_str()));
	cJSON_AddItemToObject(json, "display_name", cJSON_CreateString(o.display_name().c_str()));
	cJSON_AddItemToObject(json, "webhook_state", parse(o.webhook_state()));
	cJSON_AddItemToObject(json, "priority", cJSON_CreateNumber(o.priority()));
	cJSON_AddItemToObject(json, "is_fallback", cJSON_CreateBool(o.is_fallback()));
	cJSON_AddItemToObject(json, "ml_disabled", cJSON_CreateBool(o.ml_disabled()));
	cJSON_AddItemToObject(json, "end_interaction", cJSON_CreateBool(o.end_interaction()));
	cJSON_AddItemToObject(json, "input_context_names", parseCollection(o.input_context_names()));
	cJSON_AddItemToObject(json, "events", parseCollection(o.events()));
	cJSON_AddItemToObject(json, "training_phrases", parseCollection(o.training_phrases()));
	cJSON_AddItemToObject(json, "action", cJSON_CreateString(o.action().c_str()));
	cJSON_AddItemToObject(json, "output_contexts", parseCollection(o.output_contexts()));
	cJSON_AddItemToObject(json, "reset_contexts", cJSON_CreateBool(o.reset_contexts()));
	cJSON_AddItemToObject(json, "parameters", parseCollection(o.parameters()));
	cJSON_AddItemToObject(json, "messages", parseCollection(o.messages()));

	/* repeated enum field: serialized by hand since parseCollection
	 * expects message types */
	cJSON* j = cJSON_CreateArray();
	for (int i = 0; i < o.default_response_platforms_size(); i++) {
		cJSON_AddItemToArray(j, cJSON_CreateString(Intent_Message_Platform_Name(o.default_response_platforms(i)).c_str()));
	}
	cJSON_AddItemToObject(json, "default_response_platforms", j);

	cJSON_AddItemToObject(json, "root_followup_intent_name", cJSON_CreateString(o.root_followup_intent_name().c_str()));
	cJSON_AddItemToObject(json, "followup_intent_info", parseCollection(o.followup_intent_info()));

	return json;
}

/* Context -> JSON (name, lifespan, optional parameters struct) */
cJSON* GRPCParser::parse(const google::cloud::dialogflow::v2beta1::Context& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "name", cJSON_CreateString(o.name().c_str()));
	cJSON_AddItemToObject(json, "lifespan_count", cJSON_CreateNumber(o.lifespan_count()));
	if (o.has_parameters()) cJSON_AddItemToObject(json, "parameters", parse(o.parameters()));

	return json;
}
|
||||
|
||||
/* Intent.Message (one fulfillment message) -> JSON.
 * Emits the platform name, then only the message variants actually set
 * on the protobuf. */
cJSON* GRPCParser::parse(const Intent_Message& msg) {
	cJSON * json = cJSON_CreateObject();

	auto platform = msg.platform();
	cJSON_AddItemToObject(json, "platform", cJSON_CreateString(Intent_Message_Platform_Name(platform).c_str()));

	if (msg.has_text()) {
		cJSON_AddItemToObject(json, "text", parse(msg.text()));
	}

	if (msg.has_image()) {
		cJSON_AddItemToObject(json, "image", parse(msg.image()));
	}

	if (msg.has_quick_replies()) {
		cJSON_AddItemToObject(json, "quick_replies", parse(msg.quick_replies()));
	}

	if (msg.has_card()) {
		cJSON_AddItemToObject(json, "card", parse(msg.card()));
	}

	if (msg.has_payload()) {
		cJSON_AddItemToObject(json, "payload", parse(msg.payload()));
	}

	if (msg.has_simple_responses()) {
		cJSON_AddItemToObject(json, "simple_responses", parse(msg.simple_responses()));
	}

	if (msg.has_basic_card()) {
		/* bug fix: previously serialized msg.card() (the Card member) here,
		 * so "basic_card" carried the wrong message */
		cJSON_AddItemToObject(json, "basic_card", parse(msg.basic_card()));
	}

	if (msg.has_suggestions()) {
		cJSON_AddItemToObject(json, "suggestions", parse(msg.suggestions()));
	}

	if (msg.has_link_out_suggestion()) {
		cJSON_AddItemToObject(json, "link_out_suggestion", parse(msg.link_out_suggestion()));
	}

	if (msg.has_list_select()) {
		cJSON_AddItemToObject(json, "list_select", parse(msg.list_select()));
	}

	if (msg.has_telephony_play_audio()) {
		cJSON_AddItemToObject(json, "telephony_play_audio", parse(msg.telephony_play_audio()));
	}

	if (msg.has_telephony_synthesize_speech()) {
		cJSON_AddItemToObject(json, "telephony_synthesize_speech", parse(msg.telephony_synthesize_speech()));
	}

	if (msg.has_telephony_transfer_call()) {
		cJSON_AddItemToObject(json, "telephony_transfer_call", parse(msg.telephony_transfer_call()));
	}

	return json;
}
|
||||
|
||||
|
||||
/* QueryResult -> JSON: the core dialogflow result (query text, matched
 * intent, parameters, fulfillment, contexts, sentiment, knowledge answers).
 * Optional sub-messages (webhook payload, diagnostic info) are emitted
 * only when present. */
cJSON* GRPCParser::parse(const QueryResult& qr) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "query_text", cJSON_CreateString(qr.query_text().c_str()));
	cJSON_AddItemToObject(json, "language_code", cJSON_CreateString(qr.language_code().c_str()));
	cJSON_AddItemToObject(json, "speech_recognition_confidence", cJSON_CreateNumber(qr.speech_recognition_confidence()));
	cJSON_AddItemToObject(json, "action", cJSON_CreateString(qr.action().c_str()));
	cJSON_AddItemToObject(json, "parameters", parse(qr.parameters()));
	cJSON_AddItemToObject(json, "all_required_params_present", cJSON_CreateBool(qr.all_required_params_present()));
	cJSON_AddItemToObject(json, "fulfillment_text", cJSON_CreateString(qr.fulfillment_text().c_str()));
	cJSON_AddItemToObject(json, "fulfillment_messages", parseCollection(qr.fulfillment_messages()));
	cJSON_AddItemToObject(json, "webhook_source", cJSON_CreateString(qr.webhook_source().c_str()));
	if (qr.has_webhook_payload()) cJSON_AddItemToObject(json, "webhook_payload", parse(qr.webhook_payload()));
	cJSON_AddItemToObject(json, "output_contexts", parseCollection(qr.output_contexts()));
	cJSON_AddItemToObject(json, "intent", parse(qr.intent()));
	cJSON_AddItemToObject(json, "intent_detection_confidence", cJSON_CreateNumber(qr.intent_detection_confidence()));
	if (qr.has_diagnostic_info()) cJSON_AddItemToObject(json, "diagnostic_info", parse(qr.diagnostic_info()));
	cJSON_AddItemToObject(json, "sentiment_analysis_result", parse(qr.sentiment_analysis_result()));
	cJSON_AddItemToObject(json, "knowledge_answers", parse(qr.knowledge_answers()));

	return json;
}
|
||||
/* StreamingRecognitionResult.MessageType enum -> name string */
cJSON* GRPCParser::parse(const StreamingRecognitionResult_MessageType& o) {
	return cJSON_CreateString(StreamingRecognitionResult_MessageType_Name(o).c_str());
}

/* StreamingRecognitionResult -> JSON (interim/final transcript + confidence) */
cJSON* GRPCParser::parse(const StreamingRecognitionResult& o) {
	cJSON * json = cJSON_CreateObject();

	cJSON_AddItemToObject(json, "message_type", parse(o.message_type()));
	cJSON_AddItemToObject(json, "transcript", cJSON_CreateString(o.transcript().c_str()));
	cJSON_AddItemToObject(json, "is_final", cJSON_CreateBool(o.is_final()));
	cJSON_AddItemToObject(json, "confidence", cJSON_CreateNumber(o.confidence()));

	return json;
}
|
||||
137
mod_dialogflow/parser.h
Normal file
137
mod_dialogflow/parser.h
Normal file
@@ -0,0 +1,137 @@
|
||||
#ifndef __PARSER_H__
|
||||
#define __PARSER_H__
|
||||
|
||||
#include <switch_json.h>
|
||||
#include <grpc++/grpc++.h>
|
||||
#include "google/cloud/dialogflow/v2beta1/session.grpc.pb.h"
|
||||
|
||||
using google::cloud::dialogflow::v2beta1::Sessions;
|
||||
using google::cloud::dialogflow::v2beta1::StreamingDetectIntentRequest;
|
||||
using google::cloud::dialogflow::v2beta1::StreamingDetectIntentResponse;
|
||||
using google::cloud::dialogflow::v2beta1::AudioEncoding;
|
||||
using google::cloud::dialogflow::v2beta1::InputAudioConfig;
|
||||
using google::cloud::dialogflow::v2beta1::OutputAudioConfig;
|
||||
using google::cloud::dialogflow::v2beta1::SynthesizeSpeechConfig;
|
||||
using google::cloud::dialogflow::v2beta1::VoiceSelectionParams;
|
||||
using google::cloud::dialogflow::v2beta1::SsmlVoiceGender;
|
||||
using google::cloud::dialogflow::v2beta1::SsmlVoiceGender_Name;
|
||||
using google::cloud::dialogflow::v2beta1::QueryInput;
|
||||
using google::cloud::dialogflow::v2beta1::QueryResult;
|
||||
using google::cloud::dialogflow::v2beta1::StreamingRecognitionResult;
|
||||
using google::cloud::dialogflow::v2beta1::StreamingRecognitionResult_MessageType;
|
||||
using google::cloud::dialogflow::v2beta1::StreamingRecognitionResult_MessageType_Name;
|
||||
using google::cloud::dialogflow::v2beta1::EventInput;
|
||||
using google::cloud::dialogflow::v2beta1::OutputAudioEncoding;
|
||||
using google::cloud::dialogflow::v2beta1::OutputAudioEncoding_Name;
|
||||
using google::cloud::dialogflow::v2beta1::Context;
|
||||
using google::cloud::dialogflow::v2beta1::Sentiment;
|
||||
using google::cloud::dialogflow::v2beta1::SentimentAnalysisResult;
|
||||
using google::cloud::dialogflow::v2beta1::KnowledgeAnswers;
|
||||
using google::cloud::dialogflow::v2beta1::KnowledgeAnswers_Answer;
|
||||
using google::cloud::dialogflow::v2beta1::KnowledgeAnswers_Answer_MatchConfidenceLevel;
|
||||
using google::cloud::dialogflow::v2beta1::KnowledgeAnswers_Answer_MatchConfidenceLevel_Name;
|
||||
using google::cloud::dialogflow::v2beta1::Intent;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_FollowupIntentInfo;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_WebhookState;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_WebhookState_Name;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Parameter;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_TrainingPhrase;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_TrainingPhrase_Type;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_TrainingPhrase_Part;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_TrainingPhrase_Type_Name;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_QuickReplies;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_Platform_Name;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_SimpleResponses;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_SimpleResponse;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_BasicCard;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_Card;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_Image;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_Text;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_Card_Button;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_BasicCard_Button;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_BasicCard_Button_OpenUriAction;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_Suggestion;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_Suggestions;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_LinkOutSuggestion;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_ListSelect;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_CarouselSelect;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_CarouselSelect_Item;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_ListSelect_Item;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_SelectItemInfo;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_TelephonyPlayAudio;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_TelephonySynthesizeSpeech;
|
||||
using google::cloud::dialogflow::v2beta1::Intent_Message_TelephonyTransferCall;
|
||||
using google::protobuf::RepeatedPtrField;
|
||||
using google::rpc::Status;
|
||||
using google::protobuf::Struct;
|
||||
using google::protobuf::Value;
|
||||
using google::protobuf::ListValue;
|
||||
|
||||
typedef google::protobuf::Map< std::string, Value >::const_iterator StructIterator_t;
|
||||
|
||||
/**
 * GRPCParser converts Dialogflow (v2beta1) gRPC/protobuf response objects
 * into cJSON trees suitable for attaching to FreeSWITCH events.
 *
 * Each parse() overload maps one protobuf message (or enum) type to its
 * JSON representation; overloads call one another to walk nested messages.
 * parseCollection() serializes a repeated protobuf field into a JSON array.
 * Callers own the returned cJSON objects and must free them.
 *
 * The FreeSWITCH session pointer is stored but not owned by the parser.
 */
class GRPCParser {
public:
	GRPCParser(switch_core_session_t *session) : m_session(session) {}
	~GRPCParser() {}

	// Serialize a repeated protobuf field into a cJSON array.
	// NOTE(review): the collection is taken by value — a const reference
	// was presumably intended; confirm with callers before changing.
	template <typename T> cJSON* parseCollection(const RepeatedPtrField<T> coll) ;

	// Top-level entry: serialize a full streaming detect-intent response.
	cJSON* parse(const StreamingDetectIntentResponse& response) ;
	// Return the synthesized output-audio bytes carried in the response.
	const std::string& parseAudio(const StreamingDetectIntentResponse& response);

	// One overload per protobuf message/enum type that can appear inside
	// a StreamingDetectIntentResponse. Enum overloads produce JSON strings
	// with the protobuf symbolic name; message overloads produce objects.
	cJSON* parse(const OutputAudioEncoding& o) ;
	cJSON* parse(const OutputAudioConfig& o) ;
	cJSON* parse(const SynthesizeSpeechConfig& o) ;
	cJSON* parse(const SsmlVoiceGender& o) ;
	cJSON* parse(const VoiceSelectionParams& o) ;
	cJSON* parse(const google::rpc::Status& o) ;
	// google.protobuf well-known types (Struct/Value) map to JSON natively.
	cJSON* parse(const Value& value) ;
	cJSON* parse(const Struct& rpcStruct) ;
	cJSON* parse(const Intent_Message_SimpleResponses& o) ;
	cJSON* parse(const Intent_Message_SimpleResponse& o) ;
	cJSON* parse(const Intent_Message_Image& o) ;
	cJSON* parse(const Intent_Message_BasicCard_Button_OpenUriAction& o) ;
	cJSON* parse(const Intent_Message_BasicCard_Button& o) ;
	cJSON* parse(const Intent_Message_Card_Button& o) ;
	cJSON* parse(const Intent_Message_BasicCard& o) ;
	cJSON* parse(const Intent_Message_Card& o) ;
	cJSON* parse(const Intent_Message_Suggestion& o) ;
	cJSON* parse(const Intent_Message_Suggestions& o) ;
	cJSON* parse(const std::string& val) ;
	cJSON* parse(const Intent_Message_LinkOutSuggestion& o) ;
	cJSON* parse(const Intent_Message_SelectItemInfo& o) ;
	cJSON* parse(const Intent_Message_ListSelect_Item& o) ;
	cJSON* parse(const Intent_Message_CarouselSelect& o) ;
	cJSON* parse(const Intent_Message_CarouselSelect_Item& o) ;
	cJSON* parse(const Intent_Message_ListSelect& o) ;
	cJSON* parse(const Intent_Message_TelephonyPlayAudio& o) ;
	cJSON* parse(const Intent_Message_TelephonySynthesizeSpeech& o) ;
	cJSON* parse(const Intent_Message_TelephonyTransferCall& o) ;
	cJSON* parse(const Intent_Message_QuickReplies& o) ;
	cJSON* parse(const Intent_Message_Text& o) ;
	cJSON* parse(const Intent_TrainingPhrase_Part& o) ;
	cJSON* parse(const Intent_WebhookState& o) ;
	cJSON* parse(const Intent_TrainingPhrase_Type& o) ;
	cJSON* parse(const Intent_TrainingPhrase& o) ;
	cJSON* parse(const Intent_Parameter& o) ;
	cJSON* parse(const Intent_FollowupIntentInfo& o) ;
	cJSON* parse(const Sentiment& o) ;
	cJSON* parse(const SentimentAnalysisResult& o) ;
	cJSON* parse(const KnowledgeAnswers_Answer_MatchConfidenceLevel& o) ;
	cJSON* parse(const KnowledgeAnswers_Answer& o) ;
	cJSON* parse(const KnowledgeAnswers& o) ;
	cJSON* parse(const Intent& o) ;
	cJSON* parse(const google::cloud::dialogflow::v2beta1::Context& o) ;
	cJSON* parse(const Intent_Message& msg) ;
	cJSON* parse(const QueryResult& qr) ;
	cJSON* parse(const StreamingRecognitionResult_MessageType& o) ;
	cJSON* parse(const StreamingRecognitionResult& o) ;

private:
	// Owning FreeSWITCH session; not owned by the parser.
	switch_core_session_t *m_session;
} ;
|
||||
|
||||
|
||||
#endif
|
||||
BIN
mod_google_transcribe/.DS_Store
vendored
Normal file
BIN
mod_google_transcribe/.DS_Store
vendored
Normal file
Binary file not shown.
8
mod_google_transcribe/LICENSE
Normal file
8
mod_google_transcribe/LICENSE
Normal file
@@ -0,0 +1,8 @@
|
||||
Copyright 2023, Drachtio Communications Services, LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
10
mod_google_transcribe/Makefile.am
Normal file
10
mod_google_transcribe/Makefile.am
Normal file
@@ -0,0 +1,10 @@
|
||||
include $(top_srcdir)/build/modmake.rulesam
|
||||
MODNAME=mod_google_transcribe
|
||||
|
||||
mod_LTLIBRARIES = mod_google_transcribe.la
|
||||
mod_google_transcribe_la_SOURCES = mod_google_transcribe.c google_glue.cpp
|
||||
mod_google_transcribe_la_CFLAGS = $(AM_CFLAGS)
|
||||
mod_google_transcribe_la_CXXFLAGS = -I $(top_srcdir)/libs/googleapis/gens $(AM_CXXFLAGS) -std=c++17
|
||||
|
||||
mod_google_transcribe_la_LIBADD = $(switch_builddir)/libfreeswitch.la
|
||||
mod_google_transcribe_la_LDFLAGS = -avoid-version -module -no-undefined -shared `pkg-config --libs grpc++ grpc`
|
||||
101
mod_google_transcribe/README.md
Normal file
101
mod_google_transcribe/README.md
Normal file
@@ -0,0 +1,101 @@
|
||||
# mod_google_transcribe
|
||||
|
||||
A Freeswitch module that generates real-time transcriptions on a Freeswitch channel by using Google's Speech-to-Text API.
|
||||
|
||||
Optionally, the connection to the google cloud recognizer can be delayed until voice activity has been detected. This can be useful in cases where it is desired to minimize the costs of streaming audio for transcription. This setting is governed by the channel variables starting with `RECOGNIZER_VAD`, as described below.
|
||||
|
||||
## API
|
||||
|
||||
### Commands
|
||||
The freeswitch module exposes two versions of an API command to transcribe speech:
|
||||
#### version 1
|
||||
```bash
|
||||
uuid_google_transcribe <uuid> start <lang-code> [interim]
|
||||
```
|
||||
When using this command, additional speech processing options can be provided through Freeswitch channel variables, described [below](#command-variables).
|
||||
|
||||
#### version 2
|
||||
```bash
|
||||
uuid_google_transcribe2 <uuid> start <lang-code> [interim] (bool) \
|
||||
[single-utterance](bool) [separate-recognition](bool) [max-alternatives](int) \
|
||||
[profanity-filter](bool) [word-time](bool) [punctuation](bool) \
|
||||
[model](string) [enhanced](bool) [hints](words separated by , and no spaces) \
|
||||
[play-file] (play file path)
|
||||
```
|
||||
This command allows speech processing options to be provided on the command line, and has the ability to optionally play an audio file as a prompt.
|
||||
|
||||
Example:
|
||||
```bash
|
||||
bgapi uuid_google_transcribe2 312033b6-4b2a-48d8-be0c-5f161aec2b3e start en-US \
|
||||
true true true 5 true true true command_and_search true \
|
||||
yes,no,hello https://www2.cs.uic.edu/~i101/SoundFiles/CantinaBand60.wav
|
||||
```
|
||||
Attaches media bug to channel and performs streaming recognize request.
|
||||
- `uuid` - unique identifier of Freeswitch channel
|
||||
- `lang-code` - a valid Google [language code](https://cloud.google.com/speech-to-text/docs/languages) to use for speech recognition
|
||||
- `interim` - If the 'interim' keyword is present then both interim and final transcription results will be returned; otherwise only final transcriptions will be returned
|
||||
|
||||
```
|
||||
uuid_google_transcribe <uuid> stop
|
||||
```
|
||||
Stop transcription on the channel.
|
||||
|
||||
### Command Variables
|
||||
Additional google speech options can be set through freeswitch channel variables for `uuid_google_transcribe` (some can alternatively be set in the command line for `uuid_google_transcribe2`).
|
||||
|
||||
| variable | Description |
|
||||
| --- | ----------- |
|
||||
| GOOGLE_SPEECH_SINGLE_UTTERANCE | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.StreamingRecognitionConfig.FIELDS.bool.google.cloud.speech.v1.StreamingRecognitionConfig.single_utterance) |
|
||||
| GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.enable_separate_recognition_per_channel) |
|
||||
| GOOGLE_SPEECH_MAX_ALTERNATIVES | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.int32.google.cloud.speech.v1.RecognitionConfig.max_alternatives) |
|
||||
| GOOGLE_SPEECH_PROFANITY_FILTER | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.profanity_filter) |
|
||||
| GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.enable_word_time_offsets) |
|
||||
| GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.enable_automatic_punctuation) |
|
||||
| GOOGLE_SPEECH_MODEL | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.string.google.cloud.speech.v1.RecognitionConfig.model) |
|
||||
| GOOGLE_SPEECH_USE_ENHANCED | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.use_enhanced) |
|
||||
| GOOGLE_SPEECH_HINTS | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.PhraseSet) |
|
||||
| GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES | a comma-separated list of language codes, [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionConfig.FIELDS.repeated.string.google.cloud.speech.v1p1beta1.RecognitionConfig.alternative_language_codes) |
|
||||
| GOOGLE_SPEECH_SPEAKER_DIARIZATION | set to 1 to enable [speaker diarization](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.SpeakerDiarizationConfig) |
|
||||
| GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.SpeakerDiarizationConfig) |
|
||||
| GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.SpeakerDiarizationConfig) |
|
||||
| GOOGLE_SPEECH_METADATA_INTERACTION_TYPE | set to 'discussion', 'presentation', 'phone_call', 'voicemail', 'professionally_produced', 'voice_search', 'voice_command', or 'dictation' [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType) |
|
||||
| GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata) |
|
||||
| GOOGLE_SPEECH_METADATA_MICROPHONE_DISTANCE | set to 'nearfield', 'midfield', or 'farfield' [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance) |
|
||||
| GOOGLE_SPEECH_METADATA_ORIGINAL_MEDIA_TYPE | set to 'audio', or 'video' [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata.OriginalMediaType) |
|
||||
| GOOGLE_SPEECH_METADATA_RECORDING_DEVICE_TYPE | set to 'smartphone', 'pc', 'phone_line', 'vehicle', 'other_outdoor_device', or 'other_indoor_device' [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType)|
|
||||
| START_RECOGNIZING_ON_VAD | if set to 1 or true, do not begin streaming audio to google cloud until voice activity is detected.|
|
||||
| RECOGNIZER_VAD_MODE | An integer value 0-3 from less to more aggressive vad detection (default: 2).|
|
||||
| RECOGNIZER_VAD_VOICE_MS | The number of milliseconds of voice activity that is required to trigger the connection to google cloud, when START_RECOGNIZING_ON_VAD is set (default: 250).|
|
||||
| RECOGNIZER_VAD_DEBUG | if >0 vad debug logs will be generated (default: 0).|
|
||||
|
||||
|
||||
### Events
|
||||
**google_transcribe::transcription** - returns an interim or final transcription. The event contains a JSON body describing the transcription result:
|
||||
```js
|
||||
{
|
||||
"stability": 0,
|
||||
"is_final": true,
|
||||
"alternatives": [{
|
||||
"confidence": 0.96471,
|
||||
"transcript": "Donny was a good bowler, and a good man"
|
||||
}]
|
||||
}
|
||||
```
|
||||
|
||||
**google_transcribe::end_of_utterance** - returns an indication that an utterance has been detected. This may be returned prior to a final transcription. This event is only returned when GOOGLE_SPEECH_SINGLE_UTTERANCE is set to true.
|
||||
|
||||
**google_transcribe::end_of_transcript** - returned when a transcription operation has completed. If a final transcription has not been returned by now, it won't be. This event is only returned when GOOGLE_SPEECH_SINGLE_UTTERANCE is set to true.
|
||||
|
||||
**google_transcribe::no_audio_detected** - returned when google has returned an error indicating that no audio was received for a lengthy period of time.
|
||||
|
||||
**google_transcribe::max_duration_exceeded** - returned when google has returned an indication that a long-running transcription has been stopped due to a max duration limit (305 seconds) on their side. It is the application's responsibility to respond by starting a new transcription session, if desired.
|
||||
|
||||
**google_transcribe::no_audio_detected** - returned when google has not received any audio for some reason.
|
||||
|
||||
## Usage
|
||||
When using [drachtio-fsrmf](https://www.npmjs.com/package/drachtio-fsmrf), you can access this API command via the api method on the 'endpoint' object.
|
||||
```js
|
||||
ep.api('uuid_google_transcribe', `${ep.uuid} start en-US`);
|
||||
```
|
||||
## Examples
|
||||
[google_transcribe.js](../../examples/google_transcribe.js)
|
||||
727
mod_google_transcribe/google_glue.cpp
Normal file
727
mod_google_transcribe/google_glue.cpp
Normal file
@@ -0,0 +1,727 @@
|
||||
#include <cstdlib>
|
||||
#include <algorithm>
|
||||
#include <future>
|
||||
|
||||
#include <switch.h>
|
||||
#include <switch_json.h>
|
||||
#include <grpc++/grpc++.h>
|
||||
|
||||
#include "google/cloud/speech/v1p1beta1/cloud_speech.grpc.pb.h"
|
||||
|
||||
#include <switch_json.h>
|
||||
|
||||
#include "mod_google_transcribe.h"
|
||||
#include "simple_buffer.h"
|
||||
|
||||
using google::cloud::speech::v1p1beta1::RecognitionConfig;
|
||||
using google::cloud::speech::v1p1beta1::Speech;
|
||||
using google::cloud::speech::v1p1beta1::SpeechContext;
|
||||
using google::cloud::speech::v1p1beta1::StreamingRecognizeRequest;
|
||||
using google::cloud::speech::v1p1beta1::StreamingRecognizeResponse;
|
||||
using google::cloud::speech::v1p1beta1::SpeakerDiarizationConfig;
|
||||
using google::cloud::speech::v1p1beta1::SpeechAdaptation;
|
||||
using google::cloud::speech::v1p1beta1::PhraseSet;
|
||||
using google::cloud::speech::v1p1beta1::PhraseSet_Phrase;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_DISCUSSION;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_PRESENTATION;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_PHONE_CALL;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_VOICEMAIL;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_PROFESSIONALLY_PRODUCED;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_VOICE_SEARCH;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_VOICE_COMMAND;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_DICTATION;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_MicrophoneDistance_NEARFIELD;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_MicrophoneDistance_MIDFIELD;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_MicrophoneDistance_FARFIELD;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_OriginalMediaType_AUDIO;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_OriginalMediaType_VIDEO;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_SMARTPHONE;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_PC;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_PHONE_LINE;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_VEHICLE;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_OTHER_OUTDOOR_DEVICE;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_OTHER_INDOOR_DEVICE;
|
||||
using google::cloud::speech::v1p1beta1::StreamingRecognizeResponse_SpeechEventType_END_OF_SINGLE_UTTERANCE;
|
||||
using google::rpc::Status;
|
||||
|
||||
#define CHUNKSIZE (320)
|
||||
|
||||
namespace {
  /**
   * Case-insensitive ASCII comparison of two strings.
   *
   * Returns 1 when the strings are equal ignoring case, 0 otherwise.
   *
   * Fix: characters are cast to unsigned char before being passed to
   * ::tolower — passing a plain char that may hold a negative value is
   * undefined behavior per the C standard (CERT STR37-C). Also compares
   * in place instead of lower-casing two full copies first.
   */
  int case_insensitive_match(std::string s1, std::string s2) {
    if (s1.length() != s2.length())
      return 0; // different lengths can never match
    return std::equal(s1.begin(), s1.end(), s2.begin(),
        [](char a, char b) {
          return ::tolower(static_cast<unsigned char>(a)) ==
                 ::tolower(static_cast<unsigned char>(b));
        }) ? 1 : 0;
  }
}
|
||||
class GStreamer;
|
||||
|
||||
class GStreamer {
|
||||
public:
|
||||
GStreamer(
|
||||
switch_core_session_t *session,
|
||||
uint32_t channels,
|
||||
char* lang,
|
||||
int interim,
|
||||
uint32_t config_sample_rate,
|
||||
uint32_t samples_per_second,
|
||||
int single_utterance,
|
||||
int separate_recognition,
|
||||
int max_alternatives,
|
||||
int profanity_filter,
|
||||
int word_time_offset,
|
||||
int punctuation,
|
||||
const char* model,
|
||||
int enhanced,
|
||||
const char* hints) : m_session(session), m_writesDone(false), m_connected(false),
|
||||
m_audioBuffer(CHUNKSIZE, 15) {
|
||||
|
||||
const char* var;
|
||||
const char* google_uri;
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
|
||||
if (!(google_uri = switch_channel_get_variable(channel, "GOOGLE_SPEECH_TO_TEXT_URI"))) {
|
||||
google_uri = "speech.googleapis.com";
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_APPLICATION_CREDENTIALS")) {
|
||||
auto channelCreds = grpc::SslCredentials(grpc::SslCredentialsOptions());
|
||||
auto callCreds = grpc::ServiceAccountJWTAccessCredentials(var);
|
||||
auto creds = grpc::CompositeChannelCredentials(channelCreds, callCreds);
|
||||
m_channel = grpc::CreateChannel(google_uri, creds);
|
||||
}
|
||||
else {
|
||||
auto creds = grpc::GoogleDefaultCredentials();
|
||||
m_channel = grpc::CreateChannel(google_uri, creds);
|
||||
}
|
||||
|
||||
m_stub = Speech::NewStub(m_channel);
|
||||
|
||||
auto* streaming_config = m_request.mutable_streaming_config();
|
||||
RecognitionConfig* config = streaming_config->mutable_config();
|
||||
|
||||
streaming_config->set_interim_results(interim);
|
||||
if (single_utterance == 1) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_single_utterance\n");
|
||||
streaming_config->set_single_utterance(true);
|
||||
}
|
||||
else {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_single_utterance is FALSE\n");
|
||||
streaming_config->set_single_utterance(false);
|
||||
}
|
||||
|
||||
config->set_language_code(lang);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "transcribe language %s \n", lang);
|
||||
|
||||
config->set_sample_rate_hertz(config_sample_rate);
|
||||
|
||||
config->set_encoding(RecognitionConfig::LINEAR16);
|
||||
|
||||
// the rest of config comes from channel vars
|
||||
|
||||
// number of channels in the audio stream (default: 1)
|
||||
if (channels > 1) {
|
||||
config->set_audio_channel_count(channels);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "audio_channel_count %d\n", channels);
|
||||
|
||||
// transcribe each separately?
|
||||
if (separate_recognition == 1) {
|
||||
config->set_enable_separate_recognition_per_channel(true);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_separate_recognition_per_channel on\n");
|
||||
}
|
||||
}
|
||||
|
||||
// max alternatives
|
||||
if (max_alternatives > 1) {
|
||||
config->set_max_alternatives(max_alternatives);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "max_alternatives %d\n", max_alternatives);
|
||||
}
|
||||
|
||||
// profanity filter
|
||||
if (profanity_filter == 1) {
|
||||
config->set_profanity_filter(true);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "profanity_filter\n");
|
||||
}
|
||||
|
||||
// enable word offsets
|
||||
if (word_time_offset == 1) {
|
||||
config->set_enable_word_time_offsets(true);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_word_time_offsets\n");
|
||||
}
|
||||
|
||||
// enable automatic punctuation
|
||||
if (punctuation == 1) {
|
||||
config->set_enable_automatic_punctuation(true);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_automatic_punctuation\n");
|
||||
}
|
||||
else {
|
||||
config->set_enable_automatic_punctuation(false);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "disable_automatic_punctuation\n");
|
||||
}
|
||||
|
||||
// speech model
|
||||
if (model != NULL) {
|
||||
config->set_model(model);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "speech model %s\n", model);
|
||||
}
|
||||
|
||||
// use enhanced model
|
||||
if (enhanced == 1) {
|
||||
config->set_use_enhanced(true);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "use_enhanced\n");
|
||||
}
|
||||
|
||||
// hints
|
||||
if (hints != NULL) {
|
||||
auto* adaptation = config->mutable_adaptation();
|
||||
auto* phrase_set = adaptation->add_phrase_sets();
|
||||
auto *context = config->add_speech_contexts();
|
||||
float boost = -1;
|
||||
|
||||
// get boost setting for the phrase set in its entirety
|
||||
if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_HINTS_BOOST"))) {
|
||||
boost = (float) atof(switch_channel_get_variable(channel, "GOOGLE_SPEECH_HINTS_BOOST"));
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "boost value: %f\n", boost);
|
||||
phrase_set->set_boost(boost);
|
||||
}
|
||||
|
||||
// hints are either a simple comma-separated list of phrases, or a json array of objects
|
||||
// containing a phrase and a boost value
|
||||
auto *jHint = cJSON_Parse((char *) hints);
|
||||
if (jHint) {
|
||||
int i = 0;
|
||||
cJSON *jPhrase = NULL;
|
||||
cJSON_ArrayForEach(jPhrase, jHint) {
|
||||
auto* phrase = phrase_set->add_phrases();
|
||||
cJSON *jItem = cJSON_GetObjectItem(jPhrase, "phrase");
|
||||
if (jItem) {
|
||||
phrase->set_value(cJSON_GetStringValue(jItem));
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "phrase: %s\n", phrase->value().c_str());
|
||||
if (cJSON_GetObjectItem(jPhrase, "boost")) {
|
||||
phrase->set_boost((float) cJSON_GetObjectItem(jPhrase, "boost")->valuedouble);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "boost value: %f\n", phrase->boost());
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
cJSON_Delete(jHint);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "added %d hints\n", i);
|
||||
}
|
||||
else {
|
||||
char *phrases[500] = { 0 };
|
||||
int argc = switch_separate_string((char *) hints, ',', phrases, 500);
|
||||
for (int i = 0; i < argc; i++) {
|
||||
auto* phrase = phrase_set->add_phrases();
|
||||
phrase->set_value(phrases[i]);
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "added %d hints\n", argc);
|
||||
}
|
||||
}
|
||||
|
||||
// alternative language
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES")) {
|
||||
char *alt_langs[3] = { 0 };
|
||||
int argc = switch_separate_string((char *) var, ',', alt_langs, 3);
|
||||
for (int i = 0; i < argc; i++) {
|
||||
config->add_alternative_language_codes(alt_langs[i]);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "added alternative lang %s\n", alt_langs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// speaker diarization
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_SPEAKER_DIARIZATION")) {
|
||||
auto* diarization_config = config->mutable_diarization_config();
|
||||
diarization_config->set_enable_speaker_diarization(true);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enabling speaker diarization\n", var);
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT")) {
|
||||
int count = std::max(atoi(var), 1);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "setting min speaker count to %d\n", count);
|
||||
diarization_config->set_min_speaker_count(count);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT")) {
|
||||
int count = std::max(atoi(var), 2);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "setting max speaker count to %d\n", count);
|
||||
diarization_config->set_max_speaker_count(count);
|
||||
}
|
||||
}
|
||||
|
||||
// recognition metadata
|
||||
auto* metadata = config->mutable_metadata();
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_INTERACTION_TYPE")) {
|
||||
if (case_insensitive_match("discussion", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_DISCUSSION);
|
||||
if (case_insensitive_match("presentation", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_PRESENTATION);
|
||||
if (case_insensitive_match("phone_call", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_PHONE_CALL);
|
||||
if (case_insensitive_match("voicemail", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_VOICEMAIL);
|
||||
if (case_insensitive_match("professionally_produced", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_PROFESSIONALLY_PRODUCED);
|
||||
if (case_insensitive_match("voice_search", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_VOICE_SEARCH);
|
||||
if (case_insensitive_match("voice_command", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_VOICE_COMMAND);
|
||||
if (case_insensitive_match("dictation", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_DICTATION);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE")) {
|
||||
metadata->set_industry_naics_code_of_audio(atoi(var));
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_MICROPHONE_DISTANCE")) {
|
||||
if (case_insensitive_match("nearfield", var)) metadata->set_microphone_distance(RecognitionMetadata_MicrophoneDistance_NEARFIELD);
|
||||
if (case_insensitive_match("midfield", var)) metadata->set_microphone_distance(RecognitionMetadata_MicrophoneDistance_MIDFIELD);
|
||||
if (case_insensitive_match("farfield", var)) metadata->set_microphone_distance(RecognitionMetadata_MicrophoneDistance_FARFIELD);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_ORIGINAL_MEDIA_TYPE")) {
|
||||
if (case_insensitive_match("audio", var)) metadata->set_original_media_type(RecognitionMetadata_OriginalMediaType_AUDIO);
|
||||
if (case_insensitive_match("video", var)) metadata->set_original_media_type(RecognitionMetadata_OriginalMediaType_VIDEO);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_RECORDING_DEVICE_TYPE")) {
|
||||
if (case_insensitive_match("smartphone", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_SMARTPHONE);
|
||||
if (case_insensitive_match("pc", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_PC);
|
||||
if (case_insensitive_match("phone_line", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_PHONE_LINE);
|
||||
if (case_insensitive_match("vehicle", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_VEHICLE);
|
||||
if (case_insensitive_match("other_outdoor_device", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_OTHER_OUTDOOR_DEVICE);
|
||||
if (case_insensitive_match("other_indoor_device", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_OTHER_INDOOR_DEVICE);
|
||||
}
|
||||
}
|
||||
|
||||
  // Teardown: the ClientContext, channel, stub, streamer, and audio buffer
  // members release themselves via their own destructors; nothing manual here.
  ~GStreamer() {
    //switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_INFO, "GStreamer::~GStreamer - deleting channel and stub: %p\n", (void*)this);
  }
|
||||
|
||||
  // Open the bidirectional StreamingRecognize RPC, release the read thread
  // that is blocked in waitForConnect(), send the initial config-only request,
  // then flush any audio that was buffered while we were not yet connected.
  // Must be called at most once per GStreamer (asserted below).
  void connect() {
    assert(!m_connected);
    // Begin a stream.
    m_streamer = m_stub->StreamingRecognize(&m_context);
    m_connected = true;

    // read thread is waiting on this
    m_promise.set_value();

    // Write the first request, containing the config only.
    m_streamer->Write(m_request);

    // send any buffered audio
    int nFrames = m_audioBuffer.getNumItems();
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p got stream ready, %d buffered frames\n", this, nFrames);
    if (nFrames) {
      char *p;
      do {
        p = m_audioBuffer.getNextChunk();
        if (p) {
          // buffered chunks are exactly CHUNKSIZE bytes (see write() below)
          write(p, CHUNKSIZE);
        }
      } while (p);
    }
  }
|
||||
|
||||
bool write(void* data, uint32_t datalen) {
|
||||
if (!m_connected) {
|
||||
if (datalen % CHUNKSIZE == 0) {
|
||||
m_audioBuffer.add(data, datalen);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
m_request.set_audio_content(data, datalen);
|
||||
bool ok = m_streamer->Write(m_request);
|
||||
return ok;
|
||||
}
|
||||
|
||||
  // Size in bytes of the next message queued on the stream (0 if none).
  uint32_t nextMessageSize(void) {
    uint32_t size = 0;
    m_streamer->NextMessageSize(&size);
    return size;
  }

  // Blocking read of the next streaming response.  Returns false when the
  // stream is exhausted (server closed it, or it failed).
  bool read(StreamingRecognizeResponse* response) {
    return m_streamer->Read(response);
  }

  // Final RPC status; per the gRPC contract only call after Read() returns false.
  grpc::Status finish() {
    return m_streamer->Finish();
  }

  // Half-close the write side of the stream (idempotent, connect-safe).
  void writesDone() {
    // grpc crashes if we call this twice on a stream
    if (!m_connected) {
      // stream was never opened: just unblock the read thread
      cancelConnect();
    }
    else if (!m_writesDone) {
      m_streamer->WritesDone();
      m_writesDone = true;
    }
  }

  // Block until connect() or cancelConnect() fulfills m_promise; returns
  // whether the stream actually came up.  Called from the grpc read thread.
  bool waitForConnect() {
    std::shared_future<void> sf(m_promise.get_future());
    sf.wait();
    return m_connected;
  }

  // Release the read thread without ever connecting (e.g. teardown happened
  // before VAD triggered a connect).
  void cancelConnect() {
    assert(!m_connected);
    m_promise.set_value();
  }

  bool isConnected() {
    return m_connected;
  }
|
||||
|
||||
private:
  switch_core_session_t* m_session;           // owning freeswitch session (not locked/ref-counted here)
  grpc::ClientContext m_context;              // per-RPC context; must outlive m_streamer
  std::shared_ptr<grpc::Channel> m_channel;   // transport channel to the speech service
  std::unique_ptr<Speech::Stub> m_stub;       // generated client stub
  std::unique_ptr< grpc::ClientReaderWriterInterface<StreamingRecognizeRequest, StreamingRecognizeResponse> > m_streamer;
  StreamingRecognizeRequest m_request;        // reused: config on first Write, then audio chunks
  bool m_writesDone;                          // guards against a second WritesDone() (grpc aborts)
  bool m_connected;                           // set once StreamingRecognize() has been opened
  std::promise<void> m_promise;               // fulfilled by connect()/cancelConnect(); read thread waits on it
  SimpleBuffer m_audioBuffer;                 // audio accumulated before the stream is up
};
|
||||
|
||||
// Dedicated thread that drains streaming responses from Google Speech,
// converts each result to JSON, and delivers it to the module's
// responseHandler.  The session is re-located (and read-locked) for every
// response so the channel can go away safely between responses.
static void *SWITCH_THREAD_FUNC grpc_read_thread(switch_thread_t *thread, void *obj) {
  // debug counter of responses seen across all sessions; static and
  // unsynchronized, so only approximate under concurrency
  static int count;
  struct cap_cb *cb = (struct cap_cb *) obj;
  GStreamer* streamer = (GStreamer *) cb->streamer;

  // block until connect() (stream opened) or cancelConnect() (teardown)
  bool connected = streamer->waitForConnect();
  if (!connected) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "google transcribe grpc read thread exiting since we didnt connect\n") ;
    return nullptr;
  }

  // Read responses.
  StreamingRecognizeResponse response;
  while (streamer->read(&response)) {  // Returns false when no more to read.
    switch_core_session_t* session = switch_core_session_locate(cb->sessionId);
    if (!session) {
      // session hung up; exit without calling finish() since cb may be racing teardown
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "grpc_read_thread: session %s is gone!\n", cb->sessionId) ;
      return nullptr;
    }
    count++;
    auto speech_event_type = response.speech_event_type();

    // in-band error from the recognizer: forward as {"type":"error",...}
    if (response.has_error()) {
      Status status = response.error();
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "grpc_read_thread: error %s (%d)\n", status.message().c_str(), status.code()) ;
      cJSON* json = cJSON_CreateObject();
      cJSON_AddStringToObject(json, "type", "error");
      cJSON_AddStringToObject(json, "error", status.message().c_str());
      char* jsonString = cJSON_PrintUnformatted(json);
      cb->responseHandler(session, jsonString, cb->bugname);
      free(jsonString);
      cJSON_Delete(json);
    }

    // a prompt is playing; ask the module to interrupt it now that speech arrived
    if (cb->play_file == 1){
      cb->responseHandler(session, "play_interrupt", cb->bugname);
    }

    // convert each recognition result to a JSON payload for the event consumer
    for (int r = 0; r < response.results_size(); ++r) {
      auto result = response.results(r);
      cJSON * jResult = cJSON_CreateObject();
      cJSON * jAlternatives = cJSON_CreateArray();
      cJSON * jStability = cJSON_CreateNumber(result.stability());
      cJSON * jIsFinal = cJSON_CreateBool(result.is_final());
      cJSON * jLanguageCode = cJSON_CreateString(result.language_code().c_str());
      cJSON * jChannelTag = cJSON_CreateNumber(result.channel_tag());

      // protobuf Duration -> integer milliseconds
      auto duration = result.result_end_time();
      int32_t seconds = duration.seconds();
      int64_t nanos = duration.nanos();
      int span = (int) trunc(seconds * 1000. + ((float) nanos / 1000000.));
      cJSON * jResultEndTime = cJSON_CreateNumber(span);

      cJSON_AddItemToObject(jResult, "stability", jStability);
      cJSON_AddItemToObject(jResult, "is_final", jIsFinal);
      cJSON_AddItemToObject(jResult, "alternatives", jAlternatives);
      cJSON_AddItemToObject(jResult, "language_code", jLanguageCode);
      cJSON_AddItemToObject(jResult, "channel_tag", jChannelTag);
      cJSON_AddItemToObject(jResult, "result_end_time", jResultEndTime);

      for (int a = 0; a < result.alternatives_size(); ++a) {
        auto alternative = result.alternatives(a);
        cJSON* jAlt = cJSON_CreateObject();
        cJSON* jConfidence = cJSON_CreateNumber(alternative.confidence());
        cJSON* jTranscript = cJSON_CreateString(alternative.transcript().c_str());
        cJSON_AddItemToObject(jAlt, "confidence", jConfidence);
        cJSON_AddItemToObject(jAlt, "transcript", jTranscript);

        // word-level detail is present only when word time offsets were requested
        if (alternative.words_size() > 0) {
          cJSON * jWords = cJSON_CreateArray();
          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: %d words\n", alternative.words_size()) ;
          for (int b = 0; b < alternative.words_size(); b++) {
            auto words = alternative.words(b);
            cJSON* jWord = cJSON_CreateObject();
            cJSON_AddItemToObject(jWord, "word", cJSON_CreateString(words.word().c_str()));
            // NOTE: start/end times are reported in whole seconds only (nanos dropped)
            if (words.has_start_time()) {
              cJSON_AddItemToObject(jWord, "start_time", cJSON_CreateNumber(words.start_time().seconds()));
            }
            if (words.has_end_time()) {
              cJSON_AddItemToObject(jWord, "end_time", cJSON_CreateNumber(words.end_time().seconds()));
            }
            // speaker_tag is only populated when diarization was enabled
            int speaker_tag = words.speaker_tag();
            if (speaker_tag > 0) {
              cJSON_AddItemToObject(jWord, "speaker_tag", cJSON_CreateNumber(speaker_tag));
            }
            float confidence = words.confidence();
            if (confidence > 0.0) {
              cJSON_AddItemToObject(jWord, "confidence", cJSON_CreateNumber(confidence));
            }

            cJSON_AddItemToArray(jWords, jWord);
          }
          cJSON_AddItemToObject(jAlt, "words", jWords);
        }
        cJSON_AddItemToArray(jAlternatives, jAlt);
      }

      char* json = cJSON_PrintUnformatted(jResult);
      cb->responseHandler(session, (const char *) json, cb->bugname);
      free(json);

      cJSON_Delete(jResult);
    }

    if (speech_event_type == StreamingRecognizeResponse_SpeechEventType_END_OF_SINGLE_UTTERANCE) {
      // we only get this when we have requested it, and recognition stops after we get this
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: got end_of_utterance\n") ;
      cb->got_end_of_utterance = 1;
      cb->responseHandler(session, "end_of_utterance", cb->bugname);
      if (cb->wants_single_utterance) {
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: sending writesDone because we want only a single utterance\n") ;
        streamer->writesDone();
      }
    }
    switch_core_session_rwunlock(session);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: got %d responses\n", response.results_size());
  }

  // stream closed: collect the final RPC status and surface notable outcomes
  {
    switch_core_session_t* session = switch_core_session_locate(cb->sessionId);
    if (session) {
      grpc::Status status = streamer->finish();
      // 11 == grpc OUT_OF_RANGE: google uses it both for max stream duration
      // and for "no audio received"
      if (11 == status.error_code()) {
        if (std::string::npos != status.error_message().find("Exceeded maximum allowed stream duration")) {
          cb->responseHandler(session, "max_duration_exceeded", cb->bugname);
        }
        else {
          cb->responseHandler(session, "no_audio", cb->bugname);
        }
      }
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: finish() status %s (%d)\n", status.error_message().c_str(), status.error_code()) ;
      switch_core_session_rwunlock(session);
    }
  }
  return nullptr;
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
// Module-load hook: if GOOGLE_APPLICATION_CREDENTIALS is set, eagerly build
// default credentials so a bad key file fails module load instead of the
// first transcription attempt.
// NOTE(review): grpc::GoogleDefaultCredentials() typically reports failure by
// returning null / logging rather than throwing — confirm this catch actually
// fires for a malformed key file.
switch_status_t google_speech_init() {
  const char* gcsServiceKeyFile = std::getenv("GOOGLE_APPLICATION_CREDENTIALS");
  if (gcsServiceKeyFile) {
    try {
      auto creds = grpc::GoogleDefaultCredentials();
    } catch (const std::exception& e) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT,
        "Error initializing google api with provided credentials in %s: %s\n", gcsServiceKeyFile, e.what());
      return SWITCH_STATUS_FALSE;
    }
  }
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// Module-unload hook; no module-level state to release.
switch_status_t google_speech_cleanup() {
  return SWITCH_STATUS_SUCCESS;
}
|
||||
// Set up per-call transcription state: allocate the cap_cb from the session
// pool (pool memory is zeroed, so unset fields default to 0/NULL), create a
// resampler if the codec rate differs from the target rate, optionally arm a
// VAD so we defer connecting until speech is detected, construct the GStreamer
// (gRPC stream wrapper), and spawn the response-reader thread.
// Returns SWITCH_STATUS_FALSE on resampler or GStreamer failure; on success
// *ppUserData receives the cap_cb to attach to the media bug.
switch_status_t google_speech_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
  uint32_t to_rate, uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char *bugname,
  int single_utterance, int separate_recognition, int max_alternatives, int profanity_filter, int word_time_offset,
  int punctuation, const char* model, int enhanced, const char* hints, char* play_file, void **ppUserData) {

  switch_channel_t *channel = switch_core_session_get_channel(session);
  auto read_codec = switch_core_session_get_read_codec(session);
  uint32_t sampleRate = read_codec->implementation->actual_samples_per_second;
  struct cap_cb *cb;
  int err;

  cb = (struct cap_cb *) switch_core_session_alloc(session, sizeof(*cb));
  // strncpy does not guarantee termination at the bound; terminate explicitly
  strncpy(cb->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
  cb->sessionId[MAX_SESSION_ID - 1] = '\0';
  strncpy(cb->bugname, bugname, MAX_BUG_LEN);
  cb->bugname[MAX_BUG_LEN - 1] = '\0';
  cb->got_end_of_utterance = 0;
  cb->wants_single_utterance = single_utterance;
  if (play_file != NULL) {
    cb->play_file = 1;
  }

  switch_mutex_init(&cb->mutex, SWITCH_MUTEX_NESTED, switch_core_session_get_pool(session));
  if (sampleRate != to_rate) {
    cb->resampler = speex_resampler_init(channels, sampleRate, to_rate, SWITCH_RESAMPLE_QUALITY, &err);
    if (0 != err) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing resampler: %s.\n",
        switch_channel_get_name(channel), speex_resampler_strerror(err));
      return SWITCH_STATUS_FALSE;
    }
  } else {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: no resampling needed for this call\n", switch_channel_get_name(channel));
  }
  cb->responseHandler = responseHandler;

  // allocate vad if we are delaying connecting to the recognizer until we detect speech
  if (switch_channel_var_true(channel, "START_RECOGNIZING_ON_VAD")) {
    cb->vad = switch_vad_init(sampleRate, channels);
    if (cb->vad) {
      const char* var;
      int mode = 2;
      int silence_ms = 150;
      int voice_ms = 250;
      int debug = 0;  // no channel variable currently controls this; always 0

      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_MODE"))) {
        mode = atoi(var);
      }
      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_SILENCE_MS"))) {
        silence_ms = atoi(var);
      }
      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_VOICE_MS"))) {
        voice_ms = atoi(var);
      }
      switch_vad_set_mode(cb->vad, mode);
      switch_vad_set_param(cb->vad, "silence_ms", silence_ms);
      switch_vad_set_param(cb->vad, "voice_ms", voice_ms);
      switch_vad_set_param(cb->vad, "debug", debug);
    }
  }

  GStreamer *streamer = NULL;
  try {
    streamer = new GStreamer(session, channels, lang, interim, to_rate, sampleRate, single_utterance, separate_recognition, max_alternatives,
      profanity_filter, word_time_offset, punctuation, model, enhanced, hints);
    cb->streamer = streamer;
  } catch (std::exception& e) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing gstreamer: %s.\n",
      switch_channel_get_name(channel), e.what());
    // release heap-allocated resampler; everything else is pool memory
    if (cb->resampler) {
      speex_resampler_destroy(cb->resampler);
      cb->resampler = NULL;
    }
    return SWITCH_STATUS_FALSE;
  }

  // connect immediately unless a VAD is gating the connection
  if (!cb->vad) streamer->connect();

  // create the read thread
  switch_threadattr_t *thd_attr = NULL;
  switch_memory_pool_t *pool = switch_core_session_get_pool(session);

  switch_threadattr_create(&thd_attr, pool);
  switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE);
  switch_thread_create(&cb->thread, thd_attr, grpc_read_thread, cb, pool);

  *ppUserData = cb;
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// Tear down a transcription session: detach the bug's private pointer (under
// cb->mutex to serialize against the media callback), stop any prompt
// playback, half-close the gRPC stream, join the read thread so final
// responses are delivered, then free the streamer, resampler, and VAD.
// channelIsClosing distinguishes hangup (bug is being torn down by the core)
// from an explicit stop (we must remove the bug ourselves).
switch_status_t google_speech_session_cleanup(switch_core_session_t *session, int channelIsClosing, switch_media_bug_t *bug) {
  switch_channel_t *channel = switch_core_session_get_channel(session);

  if (bug) {
    struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);
    switch_mutex_lock(cb->mutex);

    if (!switch_channel_get_private(channel, cb->bugname)) {
      // race condition
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached (race).\n", switch_channel_get_name(channel));
      switch_mutex_unlock(cb->mutex);
      return SWITCH_STATUS_FALSE;
    }
    // clear the private first so a concurrent cleanup takes the race branch above
    switch_channel_set_private(channel, cb->bugname, NULL);

    // stop playback if available
    if (cb->play_file == 1){
      if (switch_channel_test_flag(channel, CF_BROADCAST)) {
        switch_channel_stop_broadcast(channel);
      } else {
        switch_channel_set_flag_value(channel, CF_BREAK, 1);
      }
    }

    // close connection and get final responses
    GStreamer* streamer = (GStreamer *) cb->streamer;

    if (streamer) {
      // half-close (or cancel a never-opened connect) so the read thread's
      // Read() loop drains and exits
      streamer->writesDone();

      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: GStreamer (%p) waiting for read thread to complete\n", (void*)streamer);
      switch_status_t st;
      switch_thread_join(&st, cb->thread);
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: GStreamer (%p) read thread completed\n", (void*)streamer);

      delete streamer;
      cb->streamer = NULL;
    }

    if (cb->resampler) {
      speex_resampler_destroy(cb->resampler);
      // NOTE(review): cb->resampler is not NULLed here; safe only because this
      // path runs once per bug
    }
    if (cb->vad) {
      switch_vad_destroy(&cb->vad);
      cb->vad = nullptr;
    }
    if (!channelIsClosing) {
      switch_core_media_bug_remove(session, &bug);
    }

    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: Closed stream\n");

    switch_mutex_unlock(cb->mutex);


    return SWITCH_STATUS_SUCCESS;
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
  return SWITCH_STATUS_FALSE;
}
|
||||
|
||||
// Media-bug read callback body: pull audio frames from the bug, optionally
// run VAD (connecting to google on first detected speech), resample if
// needed, and write the PCM to the gRPC stream.  Uses trylock so the media
// thread never blocks behind cleanup; always returns SWITCH_TRUE to keep the
// bug alive.
switch_bool_t google_speech_frame(switch_media_bug_t *bug, void* user_data) {
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);
  struct cap_cb *cb = (struct cap_cb *) user_data;
  // stop feeding audio once a single-utterance session has seen end-of-utterance
  if (cb->streamer && (!cb->wants_single_utterance || !cb->got_end_of_utterance)) {
    GStreamer* streamer = (GStreamer *) cb->streamer;
    uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
    switch_frame_t frame = {};
    frame.data = data;
    frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;

    if (switch_mutex_trylock(cb->mutex) == SWITCH_STATUS_SUCCESS) {
      // drain all available frames; skip comfort-noise frames
      while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS && !switch_test_flag((&frame), SFF_CNG)) {
        if (frame.datalen) {
          // deferred-connect mode: only open the stream once speech is heard
          if (cb->vad && !streamer->isConnected()) {
            switch_vad_state_t state = switch_vad_process(cb->vad, (int16_t*) frame.data, frame.samples);
            if (state == SWITCH_VAD_STATE_START_TALKING) {
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "detected speech, connect to google speech now\n");
              streamer->connect();
              cb->responseHandler(session, "vad_detected", cb->bugname);
            }
          }

          if (cb->resampler) {
            spx_int16_t out[SWITCH_RECOMMENDED_BUFFER_SIZE];
            spx_uint32_t out_len = SWITCH_RECOMMENDED_BUFFER_SIZE;
            spx_uint32_t in_len = frame.samples;
            size_t written;

            speex_resampler_process_interleaved_int(cb->resampler,
              (const spx_int16_t *) frame.data,
              (spx_uint32_t *) &in_len,
              &out[0],
              &out_len);
            // out_len now holds the number of resampled samples produced
            streamer->write( &out[0], sizeof(spx_int16_t) * out_len);
          }
          else {
            streamer->write( frame.data, sizeof(spx_int16_t) * frame.samples);
          }
        }
      }
      switch_mutex_unlock(cb->mutex);
    }
  }
  return SWITCH_TRUE;
}
|
||||
}
|
||||
13
mod_google_transcribe/google_glue.h
Normal file
13
mod_google_transcribe/google_glue.h
Normal file
@@ -0,0 +1,13 @@
|
||||
#ifndef __GOOGLE_GLUE_H__
|
||||
#define __GOOGLE_GLUE_H__
|
||||
|
||||
switch_status_t google_speech_init();
|
||||
switch_status_t google_speech_cleanup();
|
||||
switch_status_t google_speech_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
|
||||
uint32_t to_rate, uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char *bugname, int single_utterence,
|
||||
int separate_recognition, int max_alternatives, int profinity_filter, int word_time_offset, int punctuation, const char* model, int enhanced,
|
||||
const char* hints, char* play_file, void **ppUserData);
|
||||
switch_status_t google_speech_session_cleanup(switch_core_session_t *session, int channelIsClosing, switch_media_bug_t *bug);
|
||||
switch_bool_t google_speech_frame(switch_media_bug_t *bug, void* user_data);
|
||||
|
||||
#endif
|
||||
484
mod_google_transcribe/mod_google_transcribe.c
Normal file
484
mod_google_transcribe/mod_google_transcribe.c
Normal file
@@ -0,0 +1,484 @@
|
||||
/*
|
||||
*
|
||||
* mod_google_transcribe.c -- Freeswitch module for real-time transcription using google's gRPC interface
|
||||
*
|
||||
*/
|
||||
#include "mod_google_transcribe.h"
|
||||
#include "google_glue.h"
|
||||
#include <stdlib.h>
|
||||
#include <switch.h>
|
||||
|
||||
static const uint32_t DEFAULT_SAMPLE_RATE = 8000;
|
||||
|
||||
/* Prototypes */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_transcribe_shutdown);
|
||||
SWITCH_MODULE_RUNTIME_FUNCTION(mod_transcribe_runtime);
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_transcribe_load);
|
||||
|
||||
SWITCH_MODULE_DEFINITION(mod_google_transcribe, mod_transcribe_load, mod_transcribe_shutdown, NULL);
|
||||
|
||||
static switch_status_t do_stop(switch_core_session_t *session, char* bugname);
|
||||
|
||||
|
||||
static void responseHandler(switch_core_session_t* session, const char * json, const char* bugname) {
|
||||
switch_event_t *event;
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
|
||||
if (0 == strcmp("vad_detected", json)) {
|
||||
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_VAD_DETECTED);
|
||||
switch_channel_event_set_data(channel, event);
|
||||
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
|
||||
}
|
||||
else if (0 == strcmp("end_of_utterance", json)) {
|
||||
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_END_OF_UTTERANCE);
|
||||
switch_channel_event_set_data(channel, event);
|
||||
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
|
||||
}
|
||||
else if (0 == strcmp("end_of_transcript", json)) {
|
||||
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_END_OF_TRANSCRIPT);
|
||||
switch_channel_event_set_data(channel, event);
|
||||
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
|
||||
}
|
||||
else if (0 == strcmp("start_of_transcript", json)) {
|
||||
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_START_OF_TRANSCRIPT);
|
||||
switch_channel_event_set_data(channel, event);
|
||||
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
|
||||
}
|
||||
else if (0 == strcmp("max_duration_exceeded", json)) {
|
||||
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED);
|
||||
switch_channel_event_set_data(channel, event);
|
||||
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
|
||||
}
|
||||
else if (0 == strcmp("no_audio", json)) {
|
||||
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_NO_AUDIO_DETECTED);
|
||||
switch_channel_event_set_data(channel, event);
|
||||
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
|
||||
}
|
||||
else if (0 == strcmp("play_interrupt", json)){
|
||||
switch_event_t *qevent;
|
||||
switch_status_t status;
|
||||
if (switch_event_create(&qevent, SWITCH_EVENT_DETECTED_SPEECH) == SWITCH_STATUS_SUCCESS) {
|
||||
if ((status = switch_core_session_queue_event(session, &qevent)) != SWITCH_STATUS_SUCCESS){
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "unable to queue play inturrupt event %d \n", status);
|
||||
}
|
||||
}else{
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "unable to create play inturrupt event \n");
|
||||
}
|
||||
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_PLAY_INTERRUPT);
|
||||
switch_channel_event_set_data(channel, event);
|
||||
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
|
||||
}
|
||||
else {
|
||||
int error = 0;
|
||||
cJSON* jMessage = cJSON_Parse(json);
|
||||
if (jMessage) {
|
||||
const char* type = cJSON_GetStringValue(cJSON_GetObjectItem(jMessage, "type"));
|
||||
if (type && 0 == strcmp(type, "error")) {
|
||||
error = 1;
|
||||
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_ERROR);
|
||||
}
|
||||
cJSON_Delete(jMessage);
|
||||
}
|
||||
if (!error) {
|
||||
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_RESULTS);
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s json payload: %s.\n", bugname ? bugname : "google_transcribe", json);
|
||||
switch_channel_event_set_data(channel, event);
|
||||
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
|
||||
switch_event_add_body(event, "%s", json);
|
||||
}
|
||||
if (bugname) switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "media-bugname", bugname);
|
||||
switch_event_fire(&event);
|
||||
}
|
||||
|
||||
/* Media bug lifecycle callback: announces transcript start on INIT, delegates
 * frame processing to the glue layer on READ, and tears the session down on
 * CLOSE.  Returning SWITCH_TRUE keeps the bug attached. */
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
{
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);
  struct cap_cb* cb = (struct cap_cb*) switch_core_media_bug_get_user_data(bug);

  switch (type) {
  case SWITCH_ABC_TYPE_INIT:
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Got SWITCH_ABC_TYPE_INIT.\n");
    responseHandler(session, "start_of_transcript", cb->bugname);
    break;

  case SWITCH_ABC_TYPE_CLOSE:
    {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Got SWITCH_ABC_TYPE_CLOSE, calling google_speech_session_cleanup.\n");
      responseHandler(session, "end_of_transcript", cb->bugname);
      /* channelIsClosing = 1: the core is already removing the bug */
      google_speech_session_cleanup(session, 1, bug);
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Finished SWITCH_ABC_TYPE_CLOSE.\n");
    }
    break;

  case SWITCH_ABC_TYPE_READ:

    return google_speech_frame(bug, user_data);
    break;  /* unreachable; kept for symmetry */

  case SWITCH_ABC_TYPE_WRITE:
  default:
    break;
  }

  return SWITCH_TRUE;
}
|
||||
|
||||
/* Input callback used while a prompt plays: break playback (STATUS_BREAK)
 * as soon as a DETECTED_SPEECH event is queued on the session; pass every
 * other input through. */
static switch_status_t transcribe_input_callback(switch_core_session_t *session, void *input, switch_input_type_t input_type, void *data, unsigned int len) {
  if (input_type != SWITCH_INPUT_TYPE_EVENT) {
    return SWITCH_STATUS_SUCCESS;
  }
  switch_event_t *evt = (switch_event_t *) input;
  return (evt->event_id == SWITCH_EVENT_DETECTED_SPEECH) ? SWITCH_STATUS_BREAK : SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/* Stop an active transcription for the named bug, if one is attached.
 * Returns SWITCH_STATUS_SUCCESS when there was nothing to stop, otherwise
 * the result of google_speech_session_cleanup(). */
static switch_status_t do_stop(switch_core_session_t *session, char *bugname)
{
  switch_channel_t *chan = switch_core_session_get_channel(session);
  switch_media_bug_t *attached = switch_channel_get_private(chan, bugname);

  if (!attached) {
    return SWITCH_STATUS_SUCCESS;
  }

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Received user command command, calling google_speech_session_cleanup (possibly to stop prev transcribe)\n");
  switch_status_t rc = google_speech_session_cleanup(session, 0, attached);
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "stopped transcription.\n");
  return rc;
}
|
||||
|
||||
/* Start transcription with explicit recognizer options (variant used by the
 * API that passes options directly instead of via channel variables).
 * Tears down any prior transcribe on MY_BUG_NAME, initializes the google
 * session, attaches the media bug, and — if play_file is given — plays the
 * prompt while watching for a speech-interrupt event. */
static switch_status_t start_capture2(switch_core_session_t *session, switch_media_bug_flag_t flags,
  uint32_t sample_rate, char* lang, int interim, int single_utterance, int separate_recognition, int max_alternatives,
  int profinity_filter, int word_time_offset, int punctuation, const char* model, int enhanced, const char* hints, char* play_file)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug;
  switch_status_t status;
  switch_codec_implementation_t read_impl = { 0 };
  void *pUserData;
  uint32_t samples_per_second;
  switch_input_args_t args = { 0 };

  if (switch_channel_get_private(channel, MY_BUG_NAME)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing bug from previous transcribe\n");
    do_stop(session, MY_BUG_NAME);
  }

  switch_core_session_get_read_impl(session, &read_impl);

  if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
    return SWITCH_STATUS_FALSE;
  }

  /* g722 reports 8k samples_per_second but actually carries 16k audio */
  samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;

  if (SWITCH_STATUS_FALSE == google_speech_session_init(session, responseHandler, sample_rate, samples_per_second, flags & SMBF_STEREO ? 2 : 1, lang, interim, MY_BUG_NAME, single_utterance,
    separate_recognition, max_alternatives, profinity_filter, word_time_offset, punctuation, model, enhanced, hints, play_file, &pUserData)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing google speech session.\n");
    return SWITCH_STATUS_FALSE;
  }
  if ((status = switch_core_media_bug_add(session, "google_transcribe", NULL, capture_callback, pUserData, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
    return status;
  }

  switch_channel_set_private(channel, MY_BUG_NAME, bug);

  /* play the prompt, looking for detection result */
  if (play_file != NULL){
    /* blocks until the prompt ends or transcribe_input_callback breaks it */
    args.input_callback = transcribe_input_callback;
    switch_ivr_play_file(session, NULL, play_file, &args);
  }

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/**
 * start_capture - begin streaming transcription on a channel.
 *
 * Reads per-channel GOOGLE_SPEECH_* variables to build the recognition
 * options, initializes a google speech session, and attaches a media bug
 * (named `bugname`) that feeds channel audio to the recognizer.
 *
 * session - the freeswitch session to transcribe
 * flags   - media bug flags (SMBF_READ_STREAM, optionally SMBF_STEREO etc)
 * lang    - BCP-47 language code passed to the recognizer
 * interim - non-zero to request interim transcription results
 * bugname - key under which the bug is stored as channel private data
 *
 * Returns SWITCH_STATUS_SUCCESS on success, SWITCH_STATUS_FALSE (or the
 * media-bug-add status) on failure.
 */
static switch_status_t start_capture(switch_core_session_t *session, switch_media_bug_flag_t flags,
  char* lang, int interim, char* bugname)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug;
  switch_status_t status;
  switch_codec_implementation_t read_impl = { 0 };
  void *pUserData;
  uint32_t samples_per_second;
  int single_utterance = 0, separate_recognition = 0, max_alternatives = 0, profanity_filter = 0, word_time_offset = 0, punctuation = 0, enhanced = 0;
  const char* hints = NULL;
  const char* model = NULL;
  const char* var;

  /* only one transcription bug with this name per channel: tear down any leftover */
  if (switch_channel_get_private(channel, bugname)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing bug from previous transcribe\n");
    do_stop(session, bugname);
  }

  /* recognition options are taken from channel variables, all optional */
  if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_SINGLE_UTTERANCE"))) {
    single_utterance = 1;
  }

  // transcribe each separately?
  if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL"))) {
    separate_recognition = 1;
  }

  // max alternatives
  if ((var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_MAX_ALTERNATIVES"))) {
    max_alternatives = atoi(var);
  }

  // profanity filter
  if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_PROFANITY_FILTER"))) {
    profanity_filter = 1;
  }

  // enable word offsets
  if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS"))) {
    word_time_offset = 1;
  }

  // enable automatic punctuation
  if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION"))) {
    punctuation = 1;
  }

  // speech model
  if ((var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_MODEL"))) {
    model = var;
  }

  // use enhanced model
  if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_USE_ENHANCED"))) {
    enhanced = 1;
  }

  // hints
  if ((var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_HINTS"))) {
    hints = var;
  }

  switch_core_session_get_read_impl(session, &read_impl);

  /* media must be flowing before a bug can capture audio */
  if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
    return SWITCH_STATUS_FALSE;
  }

  /* g722 reports 8kHz per RTP convention but actually carries 16kHz audio */
  samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;

  if (SWITCH_STATUS_FALSE == google_speech_session_init(session, responseHandler, DEFAULT_SAMPLE_RATE, samples_per_second, flags & SMBF_STEREO ? 2 : 1, lang, interim, bugname, single_utterance,
    separate_recognition, max_alternatives, profanity_filter, word_time_offset, punctuation, model, enhanced, hints, NULL, &pUserData)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing google speech session.\n");
    return SWITCH_STATUS_FALSE;
  }

  /* NOTE(review): if the bug add fails here, pUserData from the session init
   * above is not explicitly released — confirm whether google_speech_session_init
   * resources are tied to session teardown */
  if ((status = switch_core_media_bug_add(session, bugname, NULL, capture_callback, pUserData, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
    return status;
  }

  /* stash the bug so do_stop() can find and remove it later */
  switch_channel_set_private(channel, bugname, bug);

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// #define TRANSCRIBE_API_SYNTAX "<uuid> [start|stop] [lang-code] [interim] [single-utterance](bool) [seperate-recognition](bool) [max-alternatives](int) [profinity-filter](bool) [word-time](bool) [punctuation](bool) [model](string) [enhanced](true) [hints](string without space) [play-file]"
|
||||
#define TRANSCRIBE2_API_SYNTAX "<uuid> [start|stop] [lang-code] [interim] [single-utterance] [seperate-recognition] [max-alternatives] [profinity-filter] [word-time] [punctuation] [sample-rate] [model] [enhanced] [hints] [play-file]"
/**
 * uuid_google_transcribe2 API handler.
 *
 * Positional-argument variant of the transcribe API: "start" requires at
 * least 10 tokens (uuid through punctuation); sample-rate, model, enhanced,
 * hints and play-file are optional trailing arguments.  "stop" always
 * operates on the default bug (MY_BUG_NAME).
 */
SWITCH_STANDARD_API(transcribe2_function)
{
  char *mycmd = NULL, *argv[20] = { 0 };
  int argc = 0, enhanced = 0;
  uint32_t sample_rate = DEFAULT_SAMPLE_RATE;
  const char* hints = NULL;
  const char* model = NULL;
  char* play_file = NULL;

  switch_status_t status = SWITCH_STATUS_FALSE;
  switch_media_bug_flag_t flags = SMBF_READ_STREAM /* | SMBF_WRITE_STREAM | SMBF_READ_PING */;

  /* tokenize a private copy; cmd itself must stay untouched */
  if (!zstr(cmd) && (mycmd = strdup(cmd))) {
    argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
  }

  /* argc < 2 is checked before argv[1] is dereferenced (short-circuit order matters) */
  if (zstr(cmd) ||
    (argc < 2) ||
    (!strcasecmp(argv[1], "start") && argc < 10) ||
    zstr(argv[0])) {
    /* NOTE(review): argv[0] may be NULL here; passing NULL to %s is undefined
     * behavior per the C standard (glibc prints "(null)") — confirm intent */
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s.\n", cmd, argv[0]);
    stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE2_API_SYNTAX);
    goto done;
  } else {
    switch_core_session_t *lsession = NULL;

    /* locate takes a read lock on the target session; released below */
    if ((lsession = switch_core_session_locate(argv[0]))) {
      if (!strcasecmp(argv[1], "stop")) {
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "stop transcribing\n");
        status = do_stop(lsession, MY_BUG_NAME);
      } else if (!strcasecmp(argv[1], "start")) {
        char* lang = argv[2];
        int interim = argc > 3 && !strcmp(argv[3], "true");
        int single_utterance = !strcmp(argv[4], "true"); // single-utterance
        int separate_recognition = !strcmp(argv[5], "true"); // sepreate-recognition
        int max_alternatives = atoi(argv[6]); // max-alternatives
        int profinity_filter = !strcmp(argv[7], "true"); // profinity-filter
        int word_time_offset = !strcmp(argv[8], "true"); // word-time
        int punctuation = !strcmp(argv[9], "true"); //punctuation
        if (argc > 10) {
          sample_rate = atol(argv[10]);
        }
        /* NOTE(review): model (argv[11]) is only honored when enhanced
         * (argv[12]) is also supplied; a command with exactly 12 tokens
         * silently drops the model argument — confirm whether `argc > 11`
         * was intended for the model assignment */
        if (argc > 12){
          model = argv[11]; // model
          enhanced = !strcmp(argv[12], "true"); // enhanced
        }
        if (argc > 13){
          hints = argv[13]; // hints
        }
        if (argc > 14){
          play_file = argv[14];
        }
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "start transcribing %s %s\n", lang, interim ? "interim": "complete");
        status = start_capture2(lsession, flags, sample_rate, lang, interim, single_utterance, separate_recognition,max_alternatives,
        profinity_filter, word_time_offset, punctuation, model, enhanced, hints, play_file);
      }
      switch_core_session_rwunlock(lsession);
    }
  }

  if (status == SWITCH_STATUS_SUCCESS) {
    stream->write_function(stream, "+OK Success\n");
  } else {
    stream->write_function(stream, "-ERR Operation Failed\n");
  }

done:
  switch_safe_free(mycmd);
  /* API handlers always return SUCCESS; the outcome is in the stream text */
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
#define TRANSCRIBE_API_SYNTAX "<uuid> [start|stop] [lang-code] [interim|full] [stereo|mono] [bug-name]"
|
||||
SWITCH_STANDARD_API(transcribe_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[6] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
switch_media_bug_flag_t flags = SMBF_READ_STREAM /* | SMBF_WRITE_STREAM | SMBF_READ_PING */;
|
||||
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) ||
|
||||
(!strcasecmp(argv[1], "stop") && argc < 2) ||
|
||||
(!strcasecmp(argv[1], "start") && argc < 3) ||
|
||||
zstr(argv[0])) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
if (!strcasecmp(argv[1], "stop")) {
|
||||
char *bugname = argc > 2 ? argv[2] : MY_BUG_NAME;
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "stop transcribing\n");
|
||||
status = do_stop(lsession, bugname);
|
||||
} else if (!strcasecmp(argv[1], "start")) {
|
||||
char* lang = argv[2];
|
||||
int interim = argc > 3 && !strcmp(argv[3], "interim");
|
||||
char *bugname = argc > 5 ? argv[5] : MY_BUG_NAME;
|
||||
if (argc > 4 && !strcmp(argv[4], "stereo")) {
|
||||
flags |= SMBF_WRITE_STREAM ;
|
||||
flags |= SMBF_STEREO;
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s start transcribing %s %s\n", bugname, lang, interim ? "interim": "complete");
|
||||
status = start_capture(lsession, flags, lang, interim, bugname);
|
||||
}
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Module load entry point: registers the custom event subclasses, builds
 * the module interface, initializes the google speech layer, and exposes
 * the uuid_google_transcribe / uuid_google_transcribe2 API commands.
 */
SWITCH_MODULE_LOAD_FUNCTION(mod_transcribe_load)
{
  switch_api_interface_t *api_interface;

  /* create/register custom event message type */
  if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_RESULTS) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_RESULTS);
    return SWITCH_STATUS_TERM;
  }
  if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_END_OF_UTTERANCE) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_END_OF_UTTERANCE);
    return SWITCH_STATUS_TERM;
  }
  if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_START_OF_TRANSCRIPT) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_START_OF_TRANSCRIPT);
    return SWITCH_STATUS_TERM;
  }
  if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_END_OF_TRANSCRIPT) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_END_OF_TRANSCRIPT);
    return SWITCH_STATUS_TERM;
  }
  if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_NO_AUDIO_DETECTED) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_NO_AUDIO_DETECTED);
    return SWITCH_STATUS_TERM;
  }
  if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED);
    return SWITCH_STATUS_TERM;
  }

  if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_PLAY_INTERRUPT) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_PLAY_INTERRUPT);
    return SWITCH_STATUS_TERM;
  }

  /* connect my internal structure to the blank pointer passed to me */
  *module_interface = switch_loadable_module_create_module_interface(pool, modname);

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Google Speech Transcription API loading..\n");

  /* NOTE(review): a google_speech_init() failure is logged at CRIT but the
   * module still reports a successful load — confirm this is intentional */
  if (SWITCH_STATUS_FALSE == google_speech_init()) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Failed initializing google speech interface\n");
  }

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Google Speech Transcription API successfully loaded\n");

  /* register the two API commands and console completion hints */
  SWITCH_ADD_API(api_interface, "uuid_google_transcribe", "Google Speech Transcription API", transcribe_function, TRANSCRIBE_API_SYNTAX);
  SWITCH_ADD_API(api_interface, "uuid_google_transcribe2", "Google Speech Transcription API", transcribe2_function, TRANSCRIBE2_API_SYNTAX);
  switch_console_set_complete("add uuid_google_transcribe start lang-code");
  switch_console_set_complete("add uuid_google_transcribe stop ");

  /* indicate that the module should continue to be loaded */
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/*
|
||||
Called when the system shuts down
|
||||
Macro expands to: switch_status_t mod_google_transcribe_shutdown() */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_transcribe_shutdown)
|
||||
{
|
||||
google_speech_cleanup();
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_UTTERANCE);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_START_OF_TRANSCRIPT);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_TRANSCRIPT);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_NO_AUDIO_DETECTED);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_UTTERANCE);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_PLAY_INTERRUPT);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
58
mod_google_transcribe/mod_google_transcribe.h
Normal file
58
mod_google_transcribe/mod_google_transcribe.h
Normal file
@@ -0,0 +1,58 @@
|
||||
#ifndef __MOD_GOOGLE_TRANSCRIBE_H__
#define __MOD_GOOGLE_TRANSCRIBE_H__

#include <switch.h>
#include <speex/speex_resampler.h>

#include <unistd.h>

/* sizing limits for the per-channel capture state below */
#define MAX_SESSION_ID (256)
#define MAX_BUG_LEN (64)
/* default media bug name when the API caller does not supply one */
#define MY_BUG_NAME "google_transcribe"
/* custom event subclasses fired by this module */
#define TRANSCRIBE_EVENT_RESULTS "google_transcribe::transcription"
#define TRANSCRIBE_EVENT_END_OF_UTTERANCE "google_transcribe::end_of_utterance"
#define TRANSCRIBE_EVENT_START_OF_TRANSCRIPT "google_transcribe::start_of_transcript"
#define TRANSCRIBE_EVENT_END_OF_TRANSCRIPT "google_transcribe::end_of_transcript"
#define TRANSCRIBE_EVENT_NO_AUDIO_DETECTED "google_transcribe::no_audio_detected"
#define TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED "google_transcribe::max_duration_exceeded"
#define TRANSCRIBE_EVENT_PLAY_INTERRUPT "google_transcribe::play_interrupt"
#define TRANSCRIBE_EVENT_VAD_DETECTED "google_transcribe::vad_detected"
/* NOTE(review): this one uses the "jambonz_transcribe::" prefix while all
 * others use "google_transcribe::" — confirm whether that is deliberate
 * (a shared error subclass) before normalizing it */
#define TRANSCRIBE_EVENT_ERROR "jambonz_transcribe::error"


// simply write a wave file
//#define DEBUG_TRANSCRIBE 0


#ifdef DEBUG_TRANSCRIBE

/* per-channel data (debug build: captured audio is written to a file) */
struct cap_cb {
	switch_buffer_t *buffer;
	switch_mutex_t *mutex;
	char *base;
	SpeexResamplerState *resampler;
	FILE* fp;
};
#else
/* per-channel data */
/* callback invoked with each transcription result (json payload) */
typedef void (*responseHandler_t)(switch_core_session_t* session, const char* json, const char* bugname);

struct cap_cb {
	switch_mutex_t *mutex;               /* guards this struct across bug/service threads */
	char bugname[MAX_BUG_LEN+1];         /* media bug name this capture is registered under */
	char sessionId[MAX_SESSION_ID+1];    /* owning freeswitch session uuid */
	char *base;
	SpeexResamplerState *resampler;      /* resamples channel audio to the recognizer rate */
	void* streamer;                      /* opaque handle to the speech streaming session */
	responseHandler_t responseHandler;   /* where transcription results are delivered */
	switch_thread_t* thread;
	int wants_single_utterance;
	int got_end_of_utterance;
	int play_file;
	switch_vad_t * vad;                  /* optional voice activity detector */
	uint32_t samples_per_second;         /* sample rate of the captured audio */
};
#endif

#endif
|
||||
51
mod_google_transcribe/simple_buffer.h
Normal file
51
mod_google_transcribe/simple_buffer.h
Normal file
@@ -0,0 +1,51 @@
|
||||
/**
 * (very) simple and limited circular buffer,
 * supporting only the use case of doing all of the adds
 * and then subsequently retrieves.
 *
 * Not thread-safe; caller provides any required locking.
 */
class SimpleBuffer {
public:
  SimpleBuffer(uint32_t chunkSize, uint32_t numChunks) : m_pData(new char[chunkSize * numChunks]),
    numItems(0), m_chunkSize(chunkSize), m_numChunks(numChunks) {
    m_pNextWrite = m_pData;
  }
  ~SimpleBuffer() {
    delete [] m_pData;
  }

  /* m_pData is an owning raw pointer; copying would double-free it */
  SimpleBuffer(const SimpleBuffer&) = delete;
  SimpleBuffer& operator=(const SimpleBuffer&) = delete;

  /**
   * Append data, which must be a whole number of chunks; anything else is
   * silently ignored.  Once full, the oldest chunks are overwritten.
   */
  void add(void *data, uint32_t datalen) {
    if (datalen % m_chunkSize != 0) return;
    int numChunks = datalen / m_chunkSize;
    for (int i = 0; i < numChunks; i++) {
      memcpy(m_pNextWrite, data, m_chunkSize);
      data = static_cast<char*>(data) + m_chunkSize;
      if (numItems < m_numChunks) numItems++;

      uint32_t offset = (m_pNextWrite - m_pData) / m_chunkSize;
      if (offset >= m_numChunks - 1) m_pNextWrite = m_pData;
      else m_pNextWrite += m_chunkSize;
    }
  }

  /**
   * Retrieve the next chunk, or nullptr when the buffer is empty.
   */
  char* getNextChunk() {
    /* BUGFIX: the original `if (numItems--)` post-decremented even when
     * numItems was 0, wrapping the unsigned counter to UINT32_MAX and making
     * every subsequent call hand out garbage chunks. */
    if (numItems == 0) return nullptr;
    numItems--;
    char *p = m_pNextWrite;
    uint32_t offset = (m_pNextWrite - m_pData) / m_chunkSize;
    if (offset >= m_numChunks - 1) m_pNextWrite = m_pData;
    else m_pNextWrite += m_chunkSize;
    return p;
  }

  uint32_t getNumItems() { return numItems;}

private:
  /* members are declared in initialization order to avoid -Wreorder */
  char *m_pData;         // owning backing store: chunkSize * numChunks bytes
  uint32_t numItems;     // chunks currently held (saturates at m_numChunks)
  uint32_t m_chunkSize;
  uint32_t m_numChunks;
  char* m_pNextWrite;    // next slot to write (and start of retrieval)
};
|
||||
8
mod_ibm_transcribe/LICENSE
Normal file
8
mod_ibm_transcribe/LICENSE
Normal file
@@ -0,0 +1,8 @@
|
||||
Copyright 2023, Drachtio Communications Services, LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
9
mod_ibm_transcribe/Makefile.am
Normal file
9
mod_ibm_transcribe/Makefile.am
Normal file
@@ -0,0 +1,9 @@
|
||||
include $(top_srcdir)/build/modmake.rulesam
|
||||
MODNAME=mod_ibm_transcribe
|
||||
|
||||
mod_LTLIBRARIES = mod_ibm_transcribe.la
|
||||
mod_ibm_transcribe_la_SOURCES = mod_ibm_transcribe.c ibm_transcribe_glue.cpp audio_pipe.cpp parser.cpp
|
||||
mod_ibm_transcribe_la_CFLAGS = $(AM_CFLAGS)
|
||||
mod_ibm_transcribe_la_CXXFLAGS = $(AM_CXXFLAGS) -std=c++11
|
||||
mod_ibm_transcribe_la_LIBADD = $(switch_builddir)/libfreeswitch.la
|
||||
mod_ibm_transcribe_la_LDFLAGS = -avoid-version -module -no-undefined -shared `pkg-config --libs libwebsockets`
|
||||
57
mod_ibm_transcribe/README.md
Normal file
57
mod_ibm_transcribe/README.md
Normal file
@@ -0,0 +1,57 @@
|
||||
# mod_ibm_transcribe
|
||||
|
||||
A Freeswitch module that generates real-time transcriptions on a Freeswitch channel by using IBM Watson
|
||||
|
||||
## API
|
||||
|
||||
### Commands
|
||||
The freeswitch module exposes the following API commands:
|
||||
|
||||
```
|
||||
uuid_ibm_transcribe <uuid> start <lang-code> [interim]
|
||||
```
|
||||
Attaches media bug to channel and performs streaming recognize request.
|
||||
- `uuid` - unique identifier of Freeswitch channel
|
||||
- `lang-code` - a valid IBM [language code](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng#models-ng-supported) that is supported for streaming transcription
|
||||
- `interim` - If the 'interim' keyword is present then both interim and final transcription results will be returned; otherwise only final transcriptions will be returned
|
||||
|
||||
```
|
||||
uuid_ibm_transcribe <uuid> stop
|
||||
```
|
||||
Stop transcription on the channel.
|
||||
|
||||
### Channel Variables
|
||||
|
||||
| variable | Description |
|
||||
| --- | ----------- |
|
||||
| IBM_ACCESS_TOKEN | IBM access token used to authenticate |
|
||||
| IBM_SPEECH_INSTANCE_ID |IBM instance id |
|
||||
| IBM_SPEECH_MODEL | IBM speech model (https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-websockets) |
|
||||
| IBM_SPEECH_LANGUAGE_CUSTOMIZATION_ID |IBM speech language customization id |
|
||||
| IBM_SPEECH_ACOUSTIC_CUSTOMIZATION_ID | IBM acoustic customization id|
|
||||
| IBM_SPEECH_BASE_MODEL_VERSION | IBM base model version |
|
||||
| IBM_SPEECH_WATSON_METADATA | customer metadata to pass to IBM watson |
|
||||
| IBM_SPEECH_WATSON_LEARNING_OPT_OUT | 1 means opt out |
|
||||
|
||||
|
||||
### Events
|
||||
`ibm_transcribe::transcription` - returns an interim or final transcription. The event contains a JSON body describing the transcription result:
|
||||
```json
|
||||
{
|
||||
"result_index": 0,
|
||||
"results": [{
|
||||
"final": true,
|
||||
"alternatives": [{
|
||||
"transcript": "what kind of dog is that",
|
||||
"confidence": 0.83
|
||||
}]
|
||||
}]
|
||||
}
|
||||
```
|
||||
|
||||
## Usage
|
||||
When using [drachtio-fsmrf](https://www.npmjs.com/package/drachtio-fsmrf), you can access this API command via the api method on the 'endpoint' object.
|
||||
```js
|
||||
ep.api('uuid_ibm_transcribe', `${ep.uuid} start en-US interim`);
|
||||
```
|
||||
|
||||
513
mod_ibm_transcribe/audio_pipe.cpp
Normal file
513
mod_ibm_transcribe/audio_pipe.cpp
Normal file
@@ -0,0 +1,513 @@
|
||||
#include "audio_pipe.hpp"
|
||||
|
||||
#include <cassert>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
|
||||
/* discard incoming text messages over the socket that are longer than this */
|
||||
#define MAX_RECV_BUF_SIZE (65 * 1024 * 10)
|
||||
#define RECV_BUF_REALLOC_SIZE (8 * 1024)
|
||||
|
||||
using namespace ibm;
|
||||
|
||||
namespace {
|
||||
static const char *requestedTcpKeepaliveSecs = std::getenv("MOD_AUDIO_FORK_TCP_KEEPALIVE_SECS");
|
||||
static int nTcpKeepaliveSecs = requestedTcpKeepaliveSecs ? ::atoi(requestedTcpKeepaliveSecs) : 55;
|
||||
}
|
||||
|
||||
int AudioPipe::lws_callback(struct lws *wsi,
|
||||
enum lws_callback_reasons reason,
|
||||
void *user, void *in, size_t len) {
|
||||
|
||||
struct AudioPipe::lws_per_vhost_data *vhd =
|
||||
(struct AudioPipe::lws_per_vhost_data *) lws_protocol_vh_priv_get(lws_get_vhost(wsi), lws_get_protocol(wsi));
|
||||
|
||||
struct lws_vhost* vhost = lws_get_vhost(wsi);
|
||||
AudioPipe ** ppAp = (AudioPipe **) user;
|
||||
|
||||
switch (reason) {
|
||||
case LWS_CALLBACK_PROTOCOL_INIT:
|
||||
vhd = (struct AudioPipe::lws_per_vhost_data *) lws_protocol_vh_priv_zalloc(lws_get_vhost(wsi), lws_get_protocol(wsi), sizeof(struct AudioPipe::lws_per_vhost_data));
|
||||
vhd->context = lws_get_context(wsi);
|
||||
vhd->protocol = lws_get_protocol(wsi);
|
||||
vhd->vhost = lws_get_vhost(wsi);
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_APPEND_HANDSHAKE_HEADER:
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_EVENT_WAIT_CANCELLED:
|
||||
processPendingConnects(vhd);
|
||||
processPendingDisconnects(vhd);
|
||||
processPendingWrites();
|
||||
break;
|
||||
case LWS_CALLBACK_CLIENT_CONNECTION_ERROR:
|
||||
{
|
||||
AudioPipe* ap = findAndRemovePendingConnect(wsi);
|
||||
int rc = lws_http_client_http_response(wsi);
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CONNECTION_ERROR: %s, response status %d\n", in ? (char *)in : "(null)", rc);
|
||||
if (ap) {
|
||||
ap->m_state = LWS_CLIENT_FAILED;
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECT_FAIL, (char *) in, ap->isFinished(), ap->isInterimTranscriptsEnabled(), ap->getBugname().c_str());
|
||||
}
|
||||
else {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CONNECTION_ERROR unable to find wsi %p..\n", wsi);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_ESTABLISHED:
|
||||
{
|
||||
AudioPipe* ap = findAndRemovePendingConnect(wsi);
|
||||
if (ap) {
|
||||
std::ostringstream oss;
|
||||
*ppAp = ap;
|
||||
ap->m_vhd = vhd;
|
||||
ap->m_state = LWS_CLIENT_CONNECTED;
|
||||
|
||||
if (ap->isFinished()) {
|
||||
//std::cerr << "Got quick hangup from client while connecting, immediate close" << std::endl;
|
||||
ap->close();
|
||||
}
|
||||
else {
|
||||
oss << "{\"action\": \"start\",";
|
||||
oss << "\"content-type\": \"audio/l16;rate=16000\"";
|
||||
oss << ",\"interim_results\": true";
|
||||
oss << ",\"low_latency\": false";
|
||||
oss << "}";
|
||||
|
||||
ap->bufferForSending(oss.str().c_str());
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECT_SUCCESS, NULL, ap->isFinished(), ap->isInterimTranscriptsEnabled(), ap->getBugname().c_str());
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_ESTABLISHED %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case LWS_CALLBACK_CLIENT_CLOSED:
|
||||
{
|
||||
AudioPipe* ap = *ppAp;
|
||||
if (!ap) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CLOSED %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
return 0;
|
||||
}
|
||||
if (ap->m_state == LWS_CLIENT_DISCONNECTING) {
|
||||
// closed by us
|
||||
|
||||
lwsl_debug("%s socket closed by us\n", ap->m_uuid.c_str());
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECTION_CLOSED_GRACEFULLY, NULL, ap->isFinished(), ap->isInterimTranscriptsEnabled(), ap->getBugname().c_str());
|
||||
}
|
||||
else if (ap->m_state == LWS_CLIENT_CONNECTED) {
|
||||
// closed by far end
|
||||
lwsl_info("%s socket closed by far end\n", ap->m_uuid.c_str());
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECTION_DROPPED, NULL, ap->isFinished(), ap->isInterimTranscriptsEnabled(), ap->getBugname().c_str());
|
||||
}
|
||||
ap->m_state = LWS_CLIENT_DISCONNECTED;
|
||||
ap->setClosed();
|
||||
|
||||
//NB: after receiving any of the events above, any holder of a
|
||||
//pointer or reference to this object must treat is as no longer valid
|
||||
|
||||
*ppAp = NULL;
|
||||
delete ap;
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_RECEIVE:
|
||||
{
|
||||
AudioPipe* ap = *ppAp;
|
||||
if (!ap) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (lws_frame_is_binary(wsi)) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE received binary frame, discarding.\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (lws_is_first_fragment(wsi)) {
|
||||
// allocate a buffer for the entire chunk of memory needed
|
||||
assert(nullptr == ap->m_recv_buf);
|
||||
ap->m_recv_buf_len = len + lws_remaining_packet_payload(wsi);
|
||||
ap->m_recv_buf = (uint8_t*) malloc(ap->m_recv_buf_len);
|
||||
ap->m_recv_buf_ptr = ap->m_recv_buf;
|
||||
}
|
||||
|
||||
size_t write_offset = ap->m_recv_buf_ptr - ap->m_recv_buf;
|
||||
size_t remaining_space = ap->m_recv_buf_len - write_offset;
|
||||
if (remaining_space < len) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE buffer realloc needed.\n");
|
||||
size_t newlen = ap->m_recv_buf_len + RECV_BUF_REALLOC_SIZE;
|
||||
if (newlen > MAX_RECV_BUF_SIZE) {
|
||||
free(ap->m_recv_buf);
|
||||
ap->m_recv_buf = ap->m_recv_buf_ptr = nullptr;
|
||||
ap->m_recv_buf_len = 0;
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE max buffer exceeded, truncating message.\n");
|
||||
}
|
||||
else {
|
||||
ap->m_recv_buf = (uint8_t*) realloc(ap->m_recv_buf, newlen);
|
||||
if (nullptr != ap->m_recv_buf) {
|
||||
ap->m_recv_buf_len = newlen;
|
||||
ap->m_recv_buf_ptr = ap->m_recv_buf + write_offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nullptr != ap->m_recv_buf) {
|
||||
if (len > 0) {
|
||||
memcpy(ap->m_recv_buf_ptr, in, len);
|
||||
ap->m_recv_buf_ptr += len;
|
||||
}
|
||||
if (lws_is_final_fragment(wsi)) {
|
||||
if (nullptr != ap->m_recv_buf) {
|
||||
std::string msg((char *)ap->m_recv_buf, ap->m_recv_buf_ptr - ap->m_recv_buf);
|
||||
//std::cerr << "Recv: " << msg << std::endl;
|
||||
|
||||
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::MESSAGE, msg.c_str(), ap->isFinished(), ap->isInterimTranscriptsEnabled(), ap->getBugname().c_str());
|
||||
if (nullptr != ap->m_recv_buf) free(ap->m_recv_buf);
|
||||
}
|
||||
ap->m_recv_buf = ap->m_recv_buf_ptr = nullptr;
|
||||
ap->m_recv_buf_len = 0;
|
||||
if (ap->isFinished()) {
|
||||
// got final transcript, close the connection
|
||||
ap->close();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case LWS_CALLBACK_CLIENT_WRITEABLE:
|
||||
{
|
||||
AudioPipe* ap = *ppAp;
|
||||
if (!ap) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_WRITEABLE %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// check for text frames to send
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(ap->m_text_mutex);
|
||||
if (ap->m_metadata.length() > 0) {
|
||||
//std::cerr << "Sending: " << ap->m_metadata << std::endl;
|
||||
uint8_t buf[ap->m_metadata.length() + LWS_PRE];
|
||||
memcpy(buf + LWS_PRE, ap->m_metadata.c_str(), ap->m_metadata.length());
|
||||
int n = ap->m_metadata.length();
|
||||
int m = lws_write(wsi, buf + LWS_PRE, n, LWS_WRITE_TEXT);
|
||||
ap->m_metadata.clear();
|
||||
if (m < n) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// there may be audio data, but only one write per writeable event
|
||||
// get it next time
|
||||
lws_callback_on_writable(wsi);
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (ap->m_state == LWS_CLIENT_DISCONNECTING) {
|
||||
lws_close_reason(wsi, LWS_CLOSE_STATUS_NORMAL, NULL, 0);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// check for audio packets
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(ap->m_audio_mutex);
|
||||
if (ap->m_audio_buffer_write_offset > LWS_PRE) {
|
||||
size_t datalen = ap->m_audio_buffer_write_offset - LWS_PRE;
|
||||
int sent = lws_write(wsi, (unsigned char *) ap->m_audio_buffer + LWS_PRE, datalen, LWS_WRITE_BINARY);
|
||||
if (sent < datalen) {
|
||||
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_WRITEABLE %s attemped to send %lu only sent %d wsi %p..\n",
|
||||
ap->m_uuid.c_str(), datalen, sent, wsi);
|
||||
}
|
||||
ap->m_audio_buffer_write_offset = LWS_PRE;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return lws_callback_http_dummy(wsi, reason, user, in, len);
|
||||
}
|
||||
|
||||
|
||||
// static members

/* no automatic websocket retry/reconnect: lws retries disabled, but keepalive
 * ping/hangup timeouts effectively off (UINT16_MAX) */
static const lws_retry_bo_t retry = {
  nullptr, // retry_ms_table
  0, // retry_ms_table_count
  0, // conceal_count
  UINT16_MAX, // secs_since_valid_ping
  UINT16_MAX, // secs_since_valid_hangup
  0 // jitter_percent
};

/* single lws context shared by all AudioPipe instances */
struct lws_context *AudioPipe::context = nullptr;
/* each pending-* list below is guarded by its corresponding mutex */
std::mutex AudioPipe::mutex_connects;
std::mutex AudioPipe::mutex_disconnects;
std::mutex AudioPipe::mutex_writes;
std::list<AudioPipe*> AudioPipe::pendingConnects;
std::list<AudioPipe*> AudioPipe::pendingDisconnects;
std::list<AudioPipe*> AudioPipe::pendingWrites;
AudioPipe::log_emit_function AudioPipe::logger;
std::mutex AudioPipe::mapMutex;
/* zero-initialized (false) at static init; set to stop the service thread */
bool AudioPipe::stopFlag;
|
||||
|
||||
// Promote idle pipes to CONNECTING under the lock, then dial each one
// outside the lock so connect_client() cannot contend with producers
// calling addPendingConnect().
void AudioPipe::processPendingConnects(lws_per_vhost_data *vhd) {
  std::list<AudioPipe*> ready;
  {
    std::lock_guard<std::mutex> guard(mutex_connects);
    for (auto &pipe : pendingConnects) {
      if (pipe->m_state == LWS_CLIENT_IDLE) {
        ready.push_back(pipe);
        pipe->m_state = LWS_CLIENT_CONNECTING;
      }
    }
  }
  for (auto &pipe : ready) {
    pipe->connect_client(vhd);
  }
}
|
||||
|
||||
// Snapshot and clear the disconnect queue under the lock, then request a
// writeable callback for each pipe; the actual close happens in the
// LWS_CALLBACK_CLIENT_WRITEABLE handler once lws schedules it.
void AudioPipe::processPendingDisconnects(lws_per_vhost_data *vhd) {
  std::list<AudioPipe*> closing;
  {
    std::lock_guard<std::mutex> guard(mutex_disconnects);
    for (auto &pipe : pendingDisconnects) {
      if (pipe->m_state == LWS_CLIENT_DISCONNECTING) closing.push_back(pipe);
    }
    pendingDisconnects.clear();
  }
  for (auto &pipe : closing) {
    lws_callback_on_writable(pipe->m_wsi);
  }
}
|
||||
|
||||
void AudioPipe::processPendingWrites() {
|
||||
std::list<AudioPipe*> writes;
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(mutex_writes);
|
||||
for (auto it = pendingWrites.begin(); it != pendingWrites.end(); ++it) {
|
||||
if ((*it)->m_state == LWS_CLIENT_CONNECTED) writes.push_back(*it);
|
||||
}
|
||||
pendingWrites.clear();
|
||||
}
|
||||
for (auto it = writes.begin(); it != writes.end(); ++it) {
|
||||
AudioPipe* ap = *it;
|
||||
lws_callback_on_writable(ap->m_wsi);
|
||||
}
|
||||
}
|
||||
|
||||
// Find the CONNECTING pipe bound to this wsi and take it off the pending
// list; stale entries (wsi never assigned) seen along the way are pruned too.
// Returns nullptr if no pipe matches.
AudioPipe* AudioPipe::findAndRemovePendingConnect(struct lws *wsi) {
  std::lock_guard<std::mutex> guard(mutex_connects);

  AudioPipe* found = nullptr;
  std::list<AudioPipe*> stale;
  for (auto &pipe : pendingConnects) {
    if (pipe->m_wsi == nullptr) stale.push_back(pipe);
    if (pipe->m_state == LWS_CLIENT_CONNECTING && pipe->m_wsi == wsi) {
      found = pipe;
      break; // stop scanning (and pruning) once matched, as before
    }
  }

  for (auto &pipe : stale) pendingConnects.remove(pipe);
  if (found) pendingConnects.remove(found);

  return found;
}
|
||||
|
||||
// Return the CONNECTING pipe bound to this wsi without removing it,
// or nullptr if none matches.
AudioPipe* AudioPipe::findPendingConnect(struct lws *wsi) {
  std::lock_guard<std::mutex> guard(mutex_connects);

  for (auto &pipe : pendingConnects) {
    if (pipe->m_state == LWS_CLIENT_CONNECTING && pipe->m_wsi == wsi) {
      return pipe;
    }
  }
  return nullptr;
}
|
||||
|
||||
// Queue a pipe for connection, then poke the lws event loop so the service
// thread picks it up without waiting for its poll timeout.
void AudioPipe::addPendingConnect(AudioPipe* ap) {
  {
    std::lock_guard<std::mutex> guard(mutex_connects);
    pendingConnects.push_back(ap);
    size_t depth = pendingConnects.size();
    lwsl_debug("%s after adding connect there are %lu pending connects\n",
      ap->m_uuid.c_str(), depth);
  }
  lws_cancel_service(context);
}
|
||||
// Mark the pipe as disconnecting, queue it for teardown, and wake the
// service loop so the close is processed promptly.
void AudioPipe::addPendingDisconnect(AudioPipe* ap) {
  ap->m_state = LWS_CLIENT_DISCONNECTING;
  {
    std::lock_guard<std::mutex> guard(mutex_disconnects);
    pendingDisconnects.push_back(ap);
    size_t depth = pendingDisconnects.size();
    lwsl_debug("%s after adding disconnect there are %lu pending disconnects\n",
      ap->m_uuid.c_str(), depth);
  }
  lws_cancel_service(ap->m_vhd->context);
}
|
||||
// Queue the pipe for a send and wake the service loop so the writeable
// callback is requested without waiting for the next poll cycle.
void AudioPipe::addPendingWrite(AudioPipe* ap) {
  mutex_writes.lock();
  pendingWrites.push_back(ap);
  mutex_writes.unlock();
  lws_cancel_service(ap->m_vhd->context);
}
|
||||
|
||||
// Body of the single libwebsockets service thread (launched detached by
// initialize()).  Creates the shared client-only lws context, then runs the
// event loop until lws_service fails or stopFlag is set by deinitialize().
// Returns false only if context creation fails.
bool AudioPipe::lws_service_thread() {
  struct lws_context_creation_info info;

  // One unnamed protocol whose callback handles all client events; the
  // second, all-null entry terminates the array (lws requirement).
  const struct lws_protocols protocols[] = {
    {
      "",
      AudioPipe::lws_callback,
      sizeof(void *),
      1024,
    },
    { NULL, NULL, 0, 0 }
  };

  memset(&info, 0, sizeof info);
  info.port = CONTEXT_PORT_NO_LISTEN;                   // client-only context, no listener
  info.options = LWS_SERVER_OPTION_DO_SSL_GLOBAL_INIT;  // needed for wss:// client connections
  info.protocols = protocols;
  info.ka_time = nTcpKeepaliveSecs; // tcp keep-alive timer
  info.ka_probes = 4; // number of times to try ka before closing connection
  info.ka_interval = 5; // time between ka's
  info.timeout_secs = 10; // doc says timeout for "various processes involving network roundtrips"
  info.keepalive_timeout = 5; // seconds to allow remote client to hold on to an idle HTTP/1.1 connection
  info.timeout_secs_ah_idle = 10; // secs to allow a client to hold an ah without using it
  info.retry_and_idle_policy = &retry;

  lwsl_notice("AudioPipe::lws_service_thread creating context\n");

  context = lws_create_context(&info);
  if (!context) {
    lwsl_err("AudioPipe::lws_service_thread failed creating context\n");
    return false;
  }

  // Main event loop: lws_service blocks until there is work or
  // lws_cancel_service() is called; a negative return means fatal error.
  int n;
  do {
    n = lws_service(context, 0);
  } while (n >= 0 && !stopFlag);

  lwsl_notice("AudioPipe::lws_service_thread ending\n");
  lws_context_destroy(context);

  return true;
}
|
||||
|
||||
void AudioPipe::initialize(int loglevel, log_emit_function logger) {
|
||||
|
||||
lws_set_log_level(loglevel, logger);
|
||||
|
||||
lwsl_notice("AudioPipe::initialize starting\n");
|
||||
std::lock_guard<std::mutex> lock(mapMutex);
|
||||
std::thread t(&AudioPipe::lws_service_thread);
|
||||
stopFlag = false;
|
||||
t.detach();
|
||||
}
|
||||
|
||||
bool AudioPipe::deinitialize() {
|
||||
lwsl_notice("AudioPipe::deinitialize\n");
|
||||
std::lock_guard<std::mutex> lock(mapMutex);
|
||||
stopFlag = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
// instance members
|
||||
// Constructs an idle pipe and pre-allocates the outbound audio buffer.
// The first LWS_PRE bytes are reserved for libwebsockets framing, so the
// write offset starts at LWS_PRE, not 0.
// Fix: m_recv_buf_len and m_sslFlags were previously left uninitialized
// (reading an indeterminate value is undefined behavior).
AudioPipe::AudioPipe(const char* uuid, const char* host, unsigned int port, const char* path,
  size_t bufLen, size_t minFreespace, notifyHandler_t callback) :
  m_uuid(uuid), m_host(host), m_port(port), m_path(path), m_finished(false),
  m_audio_buffer_min_freespace(minFreespace), m_audio_buffer_max_len(bufLen), m_gracefulShutdown(false),
  m_audio_buffer_write_offset(LWS_PRE), m_recv_buf(nullptr), m_recv_buf_ptr(nullptr), m_recv_buf_len(0),
  m_sslFlags(0), m_interim(false),
  m_state(LWS_CLIENT_IDLE), m_wsi(nullptr), m_vhd(nullptr), m_callback(callback) {

  m_audio_buffer = new uint8_t[m_audio_buffer_max_len];
}
|
||||
// Releases the audio and receive buffers; delete[] on nullptr is a no-op,
// so no guards are required.
AudioPipe::~AudioPipe() {
  delete [] m_audio_buffer;
  delete [] m_recv_buf;
}
|
||||
|
||||
// Connection is asynchronous: queue this pipe and let the lws service
// thread dial out from its own context.
void AudioPipe::connect(void) {
  addPendingConnect(this);
}
|
||||
|
||||
// Initiates the actual wss client connection.  Must run on the lws service
// thread (called from processPendingConnects).  Returns false if lws could
// not start the connection attempt.
bool AudioPipe::connect_client(struct lws_per_vhost_data *vhd) {
  assert(m_audio_buffer != nullptr);
  assert(m_vhd == nullptr);  // a pipe is connected at most once
  struct lws_client_connect_info i;

  memset(&i, 0, sizeof(i));
  i.context = vhd->context;
  i.port = m_port;
  i.address = m_host.c_str();
  i.path = m_path.c_str();
  i.host = i.address;    // Host: header
  i.origin = i.address;  // Origin: header
  i.ssl_connection = LCCSCF_USE_SSL;
  // lws fills *pwsi early so the wsi is visible (e.g. to findPendingConnect)
  // even before lws_client_connect_via_info returns.
  i.pwsi = &(m_wsi);

  m_state = LWS_CLIENT_CONNECTING;
  m_vhd = vhd;

  m_wsi = lws_client_connect_via_info(&i);
  lwsl_debug("%s attempting connection, wsi is %p\n", m_uuid.c_str(), m_wsi);

  return nullptr != m_wsi;
}
|
||||
|
||||
void AudioPipe::bufferForSending(const char* text) {
|
||||
if (m_state != LWS_CLIENT_CONNECTED) return;
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(m_text_mutex);
|
||||
m_metadata.append(text);
|
||||
}
|
||||
addPendingWrite(this);
|
||||
}
|
||||
|
||||
void AudioPipe::unlockAudioBuffer() {
|
||||
if (m_audio_buffer_write_offset > LWS_PRE) addPendingWrite(this);
|
||||
m_audio_mutex.unlock();
|
||||
}
|
||||
|
||||
void AudioPipe::close() {
|
||||
if (m_state != LWS_CLIENT_CONNECTED) return;
|
||||
addPendingDisconnect(this);
|
||||
}
|
||||
|
||||
void AudioPipe::finish() {
|
||||
if (m_finished || m_state != LWS_CLIENT_CONNECTED) {
|
||||
m_finished = true;
|
||||
return;
|
||||
}
|
||||
m_finished = true;
|
||||
bufferForSending("{\"action\": \"stop\"}");
|
||||
}
|
||||
|
||||
void AudioPipe::waitForClose() {
|
||||
std::shared_future<void> sf(m_promise.get_future());
|
||||
sf.wait();
|
||||
return;
|
||||
}
|
||||
161
mod_ibm_transcribe/audio_pipe.hpp
Normal file
161
mod_ibm_transcribe/audio_pipe.hpp
Normal file
@@ -0,0 +1,161 @@
|
||||
#ifndef __IBM_AUDIO_PIPE_HPP__
|
||||
#define __IBM_AUDIO_PIPE_HPP__
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
#include <mutex>
|
||||
#include <future>
|
||||
#include <queue>
|
||||
#include <unordered_map>
|
||||
#include <thread>
|
||||
|
||||
#include <libwebsockets.h>
|
||||
|
||||
namespace ibm {
|
||||
|
||||
// Websocket audio pipe to the IBM speech-to-text service.  One instance per
// media bug; all network I/O is performed by a single shared lws service
// thread.  Producers deposit audio via lockAudioBuffer()/binaryWritePtr*()/
// unlockAudioBuffer(); text is sent via bufferForSending().
class AudioPipe {
public:
  // Connection lifecycle, driven by the lws service thread.
  enum LwsState_t {
    LWS_CLIENT_IDLE,
    LWS_CLIENT_CONNECTING,
    LWS_CLIENT_CONNECTED,
    LWS_CLIENT_FAILED,
    LWS_CLIENT_DISCONNECTING,
    LWS_CLIENT_DISCONNECTED
  };
  // Events delivered to the notifyHandler_t callback.
  enum NotifyEvent_t {
    CONNECT_SUCCESS,
    CONNECT_FAIL,
    CONNECTION_DROPPED,
    CONNECTION_CLOSED_GRACEFULLY,
    MESSAGE
  };
  typedef void (*log_emit_function)(int level, const char *line);
  typedef void (*notifyHandler_t)(const char *sessionId, NotifyEvent_t event, const char* message, bool finished, bool wantsInterim, const char* bugname);

  // Per-vhost data shared with the lws callback.
  struct lws_per_vhost_data {
    struct lws_context *context;
    struct lws_vhost *vhost;
    const struct lws_protocols *protocol;
  };

  // Module-wide setup/teardown of the single lws service thread.
  static void initialize(int loglevel, log_emit_function logger);
  static bool deinitialize();
  static bool lws_service_thread();

  // bufLen: total outbound audio buffer size (includes LWS_PRE reserve);
  // minFreespace: threshold below which callers should treat the buffer as full.
  AudioPipe(const char* uuid, const char* host, unsigned int port, const char* path,
    size_t bufLen, size_t minFreespace, notifyHandler_t callback);
  ~AudioPipe();

  LwsState_t getLwsState(void) { return m_state; }
  // Queues an asynchronous connection attempt.
  void connect(void);
  // Queues text (JSON) for sending; no-op unless connected.
  void bufferForSending(const char* text);
  // Audio-buffer accessors; call only between lockAudioBuffer() and
  // unlockAudioBuffer().
  size_t binarySpaceAvailable(void) {
    return m_audio_buffer_max_len - m_audio_buffer_write_offset;
  }
  size_t binaryMinSpace(void) {
    return m_audio_buffer_min_freespace;
  }
  char * binaryWritePtr(void) {
    return (char *) m_audio_buffer + m_audio_buffer_write_offset;
  }
  void binaryWritePtrAdd(size_t len) {
    m_audio_buffer_write_offset += len;
  }
  // NOTE(review): resets the offset to 0 rather than LWS_PRE, while writes
  // elsewhere treat the first LWS_PRE bytes as reserved framing space —
  // confirm this is intended for the overrun path.
  void binaryWritePtrResetToZero(void) {
    m_audio_buffer_write_offset = 0;
  }
  void lockAudioBuffer(void) {
    m_audio_mutex.lock();
  }
  void unlockAudioBuffer(void) ;

  void enableInterimTranscripts(bool interim) {
    m_interim = interim;
  }
  bool isInterimTranscriptsEnabled(void) {
    return m_interim;
  }

  void setAccessToken(const char* accessToken) {
    m_access_token = accessToken;
  }
  std::string& getAccessToken(void) {
    return m_access_token;
  }

  void setBugname(const char* bugname) {
    m_bugname = bugname;
  }
  std::string& getBugname(void) {
    return m_bugname;
  }

  // close(): request orderly shutdown; finish(): ask far end for final
  // transcript; waitForClose() blocks until setClosed() is called.
  void close() ;
  void finish();
  void waitForClose();
  void setClosed() { m_promise.set_value(); }
  bool isFinished() { return m_finished;}

  // no default constructor or copying
  AudioPipe() = delete;
  AudioPipe(const AudioPipe&) = delete;
  void operator=(const AudioPipe&) = delete;

private:

  static int lws_callback(struct lws *wsi, enum lws_callback_reasons reason, void *user, void *in, size_t len);
  static struct lws_context *context;
  // Pending-work queues, each guarded by its own mutex and drained by the
  // lws service thread.
  static std::mutex mutex_connects;
  static std::mutex mutex_disconnects;
  static std::mutex mutex_writes;
  static std::list<AudioPipe*> pendingConnects;
  static std::list<AudioPipe*> pendingDisconnects;
  static std::list<AudioPipe*> pendingWrites;
  static log_emit_function logger;
  static std::mutex mapMutex;
  static bool stopFlag;

  static AudioPipe* findAndRemovePendingConnect(struct lws *wsi);
  static AudioPipe* findPendingConnect(struct lws *wsi);
  static void addPendingConnect(AudioPipe* ap);
  static void addPendingDisconnect(AudioPipe* ap);
  static void addPendingWrite(AudioPipe* ap);
  static void processPendingConnects(lws_per_vhost_data *vhd);
  static void processPendingDisconnects(lws_per_vhost_data *vhd);
  static void processPendingWrites(void);


  bool connect_client(struct lws_per_vhost_data *vhd);

  LwsState_t m_state;
  std::string m_uuid;
  std::string m_host;
  unsigned int m_port;
  std::string m_path;
  // Outbound text, guarded by m_text_mutex.
  std::string m_metadata;
  std::mutex m_text_mutex;
  std::mutex m_audio_mutex;
  int m_sslFlags;
  struct lws *m_wsi;
  // Outbound audio: first LWS_PRE bytes reserved for lws framing.
  uint8_t *m_audio_buffer;
  size_t m_audio_buffer_max_len;
  size_t m_audio_buffer_write_offset;
  size_t m_audio_buffer_min_freespace;
  // Inbound message reassembly buffer.
  uint8_t* m_recv_buf;
  uint8_t* m_recv_buf_ptr;
  size_t m_recv_buf_len;
  struct lws_per_vhost_data* m_vhd;
  notifyHandler_t m_callback;
  log_emit_function m_logger;
  bool m_gracefulShutdown;
  bool m_finished;
  bool m_interim;
  std::string m_access_token;
  std::string m_bugname;
  // Fulfilled by setClosed(); awaited by waitForClose().
  std::promise<void> m_promise;
};
|
||||
|
||||
} // namespace ibm
|
||||
#endif
|
||||
488
mod_ibm_transcribe/ibm_transcribe_glue.cpp
Normal file
488
mod_ibm_transcribe/ibm_transcribe_glue.cpp
Normal file
@@ -0,0 +1,488 @@
|
||||
#include <switch.h>
|
||||
#include <switch_json.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <list>
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <regex>
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
|
||||
#include "mod_ibm_transcribe.h"
|
||||
#include "simple_buffer.h"
|
||||
#include "parser.hpp"
|
||||
#include "audio_pipe.hpp"
|
||||
|
||||
#define RTP_PACKETIZATION_PERIOD 20
|
||||
#define FRAME_SIZE_8000 320 /*which means each 20ms frame as 320 bytes at 8 khz (1 channel only)*/
|
||||
|
||||
namespace {
|
||||
static bool hasDefaultCredentials = false;
static const char* defaultApiKey = nullptr;
// Seconds of audio to buffer, from env var, clamped to [1,7]; default 2.
static const char *requestedBufferSecs = std::getenv("MOD_AUDIO_FORK_BUFFER_SECS");
static int nAudioBufferSecs = std::max(1, std::min(requestedBufferSecs ? ::atoi(requestedBufferSecs) : 2, 7));
// NOTE(review): with the move to a single lws service thread this setting
// appears unused in this file — confirm before removal.
static const char *requestedNumServiceThreads = std::getenv("MOD_AUDIO_FORK_SERVICE_THREADS");
static unsigned int nServiceThreads = std::max(1, std::min(requestedNumServiceThreads ? ::atoi(requestedNumServiceThreads) : 1, 5));
static unsigned int idxCallCount = 0;
static uint32_t playCount = 0;
// Printable names for AudioPipe notification events; see EventStr().
static const std::map<ibm::AudioPipe::NotifyEvent_t, std::string> Event2Str = {
  {ibm::AudioPipe::CONNECT_SUCCESS, "CONNECT_SUCCESS"},
  {ibm::AudioPipe::CONNECT_FAIL, "CONNECT_FAIL"},
  {ibm::AudioPipe::CONNECTION_DROPPED, "CONNECTION_DROPPED"},
  {ibm::AudioPipe::CONNECTION_CLOSED_GRACEFULLY, "CONNECTION_CLOSED_GRACEFULLY"},
  {ibm::AudioPipe::MESSAGE, "MESSAGE"}
};
|
||||
// Maps a notify event to its printable name; "UNKNOWN" for anything else.
static std::string EventStr(ibm::AudioPipe::NotifyEvent_t event) {
  auto entry = Event2Str.find(event);
  return entry == Event2Str.end() ? "UNKNOWN" : entry->second;
}
|
||||
|
||||
/*
|
||||
static void reaper(private_t *tech_pvt) {
|
||||
std::shared_ptr<ibm::AudioPipe> pAp;
|
||||
pAp.reset((ibm::AudioPipe *)tech_pvt->pAudioPipe);
|
||||
tech_pvt->pAudioPipe = nullptr;
|
||||
|
||||
std::thread t([pAp]{
|
||||
pAp->finish();
|
||||
pAp->waitForClose();
|
||||
});
|
||||
t.detach();
|
||||
}
|
||||
*/
|
||||
// Releases per-call resources (AudioPipe, resampler) owned by tech_pvt.
// tech_pvt itself is allocated from the session's memory pool and is not
// freed here.
static void destroy_tech_pvt(private_t *tech_pvt) {
  // Fix: the original logged tech_pvt->sessionId BEFORE the null check,
  // dereferencing a possibly-null pointer.
  if (!tech_pvt) return;
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s (%u) destroy_tech_pvt\n", tech_pvt->sessionId, tech_pvt->id);
  if (tech_pvt->pAudioPipe) {
    ibm::AudioPipe* p = (ibm::AudioPipe *) tech_pvt->pAudioPipe;
    delete p;
    tech_pvt->pAudioPipe = nullptr;
  }
  if (tech_pvt->resampler) {
    speex_resampler_destroy(tech_pvt->resampler);
    tech_pvt->resampler = NULL;
  }
}
|
||||
|
||||
// Fires a FreeSWITCH custom event carrying a transcription result (or
// lifecycle notification) back to the application layer.
static void responseHandler(switch_core_session_t* session,
  const char* eventName, const char * json, const char* bugname, int finished) {
  switch_event_t *event;
  switch_channel_t *channel = switch_core_session_get_channel(session);

  // Fix: bail out if event creation fails; the original used the event
  // pointer unconditionally, which is undefined when creation fails.
  if (switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, eventName) != SWITCH_STATUS_SUCCESS) return;
  switch_channel_event_set_data(channel, event);
  switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "ibm");
  switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-session-finished", finished ? "true" : "false");
  if (finished) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "responseHandler returning event %s, from finished recognition session\n", eventName);
  }
  if (json) switch_event_add_body(event, "%s", json);
  if (bugname) switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "media-bugname", bugname);
  switch_event_fire(&event);
}
|
||||
|
||||
// Percent-encodes a string for use in a URL query component.  Alphanumerics
// and !'()*-._~ pass through (plus ':', which this module deliberately
// leaves bare); every other byte becomes %XX with two uppercase hex digits.
std::string encodeURIComponent(std::string decoded)
{
  // Fix 1: the original regex class contained "*-.", an accidental character
  // RANGE that wrongly passed '+' and ',' through unencoded.
  // Fix 2: the original emitted a single hex digit for bytes < 0x10
  // ("%1" instead of "%01").
  static const char *unreserved = "!'()*-._~:";
  static const char *hexDigits = "0123456789ABCDEF";
  std::ostringstream oss;

  for (char &c : decoded)
  {
    unsigned char uc = (unsigned char) c;
    bool passThrough =
      (uc >= '0' && uc <= '9') ||
      (uc >= 'A' && uc <= 'Z') ||
      (uc >= 'a' && uc <= 'z') ||
      (c != '\0' && NULL != strchr(unreserved, c));
    if (passThrough)
    {
      oss << c;
    }
    else
    {
      oss << '%' << hexDigits[uc >> 4] << hexDigits[uc & 0x0F];
    }
  }
  return oss.str();
}
|
||||
|
||||
// Builds the IBM speech-to-text websocket request path (including query
// string) from channel variables.  sampleRate/channels/interim are accepted
// for signature compatibility but not currently encoded into the path.
// Returns the same `path` reference it fills.
std::string& constructPath(switch_core_session_t* session, std::string& path,
  int sampleRate, int channels, const char* language, int interim) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  const char *var ;
  std::ostringstream oss;
  // Fix: the original hard-coded '?' on access_token and '&' on everything
  // else, producing a malformed path (leading "&model=") whenever
  // IBM_ACCESS_TOKEN was unset.  Track the separator instead.
  char sep = '?';

  const char* instanceId = switch_channel_get_variable(channel, "IBM_SPEECH_INSTANCE_ID");

  oss << "/instances/" << instanceId << "/v1/recognize";

  // access token
  if ((var = switch_channel_get_variable(channel, "IBM_ACCESS_TOKEN"))) {
    oss << sep << "access_token=" << var;
    sep = '&';
  }

  // model = voice
  if ((var = switch_channel_get_variable(channel, "IBM_SPEECH_MODEL"))) {
    oss << sep << "model=" << var;
  }
  else {
    oss << sep << "model=" << language;
  }
  sep = '&';

  if ((var = switch_channel_get_variable(channel, "IBM_SPEECH_LANGUAGE_CUSTOMIZATION_ID"))) {
    oss << sep << "language_customization_id=" << var;
  }
  if ((var = switch_channel_get_variable(channel, "IBM_SPEECH_ACOUSTIC_CUSTOMIZATION_ID"))) {
    oss << sep << "acoustic_customization_id=" << var;
  }
  if ((var = switch_channel_get_variable(channel, "IBM_SPEECH_BASE_MODEL_VERSION"))) {
    oss << sep << "base_model_version=" << var;
  }
  if ((var = switch_channel_get_variable(channel, "IBM_SPEECH_WATSON_METADATA"))) {
    oss << sep << "x-watson-metadata=" << var;
  }
  if (switch_true(switch_channel_get_variable(channel, "IBM_SPEECH_WATSON_LEARNING_OPT_OUT"))) {
    oss << sep << "x-watson-learning-opt-out=true";
  }

  path = oss.str();
  return path;
}
|
||||
|
||||
// Notification callback invoked by AudioPipe (on the lws service thread).
// Locates the session by uuid, translates the event into a FreeSWITCH custom
// event, and on terminal events clears tech_pvt->pAudioPipe so the media-bug
// path stops touching the (soon to be destroyed) pipe.
static void eventCallback(const char* sessionId, ibm::AudioPipe::NotifyEvent_t event, const char* message, bool finished, bool wantsInterim, const char* bugname) {
  switch_core_session_t* session = switch_core_session_locate(sessionId);
  if (session) {
    bool releaseAudioPipe = false;
    switch_channel_t *channel = switch_core_session_get_channel(session);
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "received %s: %s\n", EventStr(event).c_str(), message);
    switch (event) {
      case ibm::AudioPipe::CONNECT_SUCCESS:
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "connection successful\n");
        responseHandler(session, TRANSCRIBE_EVENT_CONNECT_SUCCESS, NULL, bugname, finished);
        break;
      case ibm::AudioPipe::CONNECT_FAIL:
      {
        // first thing: we can no longer access the AudioPipe
        std::stringstream json;
        json << "{\"reason\":\"" << message << "\"}";
        releaseAudioPipe = true;
        responseHandler(session, TRANSCRIBE_EVENT_CONNECT_FAIL, (char *) json.str().c_str(), bugname, finished);
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "connection failed: %s\n", message);
      }
      break;
      case ibm::AudioPipe::CONNECTION_DROPPED:
        // first thing: we can no longer access the AudioPipe
        releaseAudioPipe = true;
        responseHandler(session, TRANSCRIBE_EVENT_DISCONNECT, NULL, bugname, finished);
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection dropped from far end\n");
        break;
      case ibm::AudioPipe::CONNECTION_CLOSED_GRACEFULLY:
        // first thing: we can no longer access the AudioPipe
        releaseAudioPipe = true;
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection closed gracefully\n");
        break;
      case ibm::AudioPipe::MESSAGE:
        // Classify the payload by substring: listening-state ping, interim
        // result, error, or final result.
        if (!wantsInterim && NULL != strstr(message, "\"state\": \"listening\"")) {
          switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "ibm service is listening\n");
        }
        else if (NULL != strstr(message, "\"final\": false")) {
          switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "got interim transcript: %s\n", message);
        }
        else if (NULL != strstr(message, "\"error\":")) {
          responseHandler(session, TRANSCRIBE_EVENT_ERROR, message, bugname, finished);
        }
        else responseHandler(session, TRANSCRIBE_EVENT_RESULTS, message, bugname, finished);
        break;

      default:
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "got unexpected msg from ibm %d:%s\n", event, message);
        break;
    }
    if (releaseAudioPipe) {
      // Detach the pipe from the bug's user data; AudioPipe's own lifecycle
      // handles destruction after this point.
      switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);
      if (bug) {
        private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
        if (tech_pvt) tech_pvt->pAudioPipe = nullptr;
      }
    }
    switch_core_session_rwunlock(session);
  }
}
|
||||
|
||||
// Initializes per-call state: validates required channel variables, builds
// the IBM endpoint host/path, allocates the AudioPipe, and sets up a
// resampler when the codec rate differs from the desired rate.
// Returns SWITCH_STATUS_FALSE on any validation or setup failure.
switch_status_t fork_data_init(private_t *tech_pvt, switch_core_session_t *session,
  int sampling, int desiredSampling, int channels, char *lang, int interim, char* bugname) {

  int err;
  switch_codec_implementation_t read_impl;
  switch_channel_t *channel = switch_core_session_get_channel(session);

  const char* region = switch_channel_get_variable(channel, "IBM_SPEECH_REGION");
  const char* instanceId = switch_channel_get_variable(channel, "IBM_SPEECH_INSTANCE_ID");
  if (!region || !instanceId || !switch_channel_get_variable(channel, "IBM_ACCESS_TOKEN")) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
      "missing IBM_SPEECH_REGION or IBM_SPEECH_INSTANCE_ID or IBM_ACCESS_TOKEN\n");
    return SWITCH_STATUS_FALSE;
  }

  switch_core_session_get_read_impl(session, &read_impl);

  memset(tech_pvt, 0, sizeof(private_t));

  std::ostringstream oss;
  oss << "api." << region << ".speech-to-text.watson.cloud.ibm.com";
  std::string host = oss.str();
  std::string path;
  constructPath(session, path, desiredSampling, channels, lang, interim);
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "host: %s, path: %s\n", host.c_str(), path.c_str());

  // NOTE(review): strncpy with the full buffer size may leave the
  // destination unterminated if the source is exactly that long — confirm
  // the private_t field sizes include room for the terminator.
  strncpy(tech_pvt->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
  strncpy(tech_pvt->host,host.c_str(), MAX_WS_URL_LEN);
  tech_pvt->port = 443;
  strncpy(tech_pvt->path, path.c_str(), MAX_PATH_LEN);
  tech_pvt->sampling = desiredSampling;
  tech_pvt->channels = channels;
  tech_pvt->id = ++idxCallCount;
  tech_pvt->buffer_overrun_notified = 0;

  // LWS_PRE framing reserve + nAudioBufferSecs seconds of audio at the
  // desired rate/channel count.
  size_t buflen = LWS_PRE + (FRAME_SIZE_8000 * desiredSampling / 8000 * channels * 1000 / RTP_PACKETIZATION_PERIOD * nAudioBufferSecs);

  ibm::AudioPipe* ap = new ibm::AudioPipe(tech_pvt->sessionId, tech_pvt->host, tech_pvt->port, tech_pvt->path,
    buflen, read_impl.decoded_bytes_per_packet, eventCallback);
  if (!ap) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error allocating AudioPipe\n");
    return SWITCH_STATUS_FALSE;
  }

  const char* access_token = switch_channel_get_variable(channel, "IBM_ACCESS_TOKEN");
  ap->setAccessToken(access_token);
  ap->setBugname(bugname);
  if (interim) ap->enableInterimTranscripts(true);

  tech_pvt->pAudioPipe = static_cast<void *>(ap);

  switch_mutex_init(&tech_pvt->mutex, SWITCH_MUTEX_NESTED, switch_core_session_get_pool(session));

  // Resample only when the channel's codec rate differs from the rate the
  // service expects.
  if (desiredSampling != sampling) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) resampling from %u to %u\n", tech_pvt->id, sampling, desiredSampling);
    tech_pvt->resampler = speex_resampler_init(channels, sampling, desiredSampling, SWITCH_RESAMPLE_QUALITY, &err);
    if (0 != err) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing resampler: %s.\n", speex_resampler_strerror(err));
      return SWITCH_STATUS_FALSE;
    }
  }
  else {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) no resampling needed for this call\n", tech_pvt->id);
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) fork_data_init\n", tech_pvt->id);

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
void lws_logger(int level, const char *line) {
|
||||
switch_log_level_t llevel = SWITCH_LOG_DEBUG;
|
||||
|
||||
switch (level) {
|
||||
case LLL_ERR: llevel = SWITCH_LOG_ERROR; break;
|
||||
case LLL_WARN: llevel = SWITCH_LOG_WARNING; break;
|
||||
case LLL_NOTICE: llevel = SWITCH_LOG_NOTICE; break;
|
||||
case LLL_INFO: llevel = SWITCH_LOG_INFO; break;
|
||||
break;
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "%s\n", line);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern "C" {
|
||||
// Module-load hook: enables lws logging and starts the AudioPipe service
// thread.  Always returns SWITCH_STATUS_SUCCESS.
switch_status_t ibm_transcribe_init() {
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_ibm_transcribe: audio buffer (in secs): %d secs\n", nAudioBufferSecs);

  // Fix: the original mask used logical OR ("LLL_NOTICE || LLL_INFO"),
  // collapsing part of the expression to 1; flags must be combined bitwise.
  int logs = LLL_ERR | LLL_WARN | LLL_NOTICE | LLL_INFO | LLL_PARSER | LLL_HEADER | LLL_EXT | LLL_CLIENT | LLL_LATENCY | LLL_DEBUG;

  ibm::AudioPipe::initialize(logs, lws_logger);
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "AudioPipe::initialize completed\n");

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// Module-unload hook: tears down the shared lws machinery and maps the
// boolean result onto a FreeSWITCH status code.
switch_status_t ibm_transcribe_cleanup() {
  return ibm::AudioPipe::deinitialize() ? SWITCH_STATUS_SUCCESS : SWITCH_STATUS_FALSE;
}
|
||||
|
||||
// Starts a transcription session: allocates per-call state from the session
// pool, initializes it (endpoint, AudioPipe, resampler) and kicks off the
// asynchronous websocket connect.  On success *ppUserData receives the
// private_t for use by the media-bug callbacks.
switch_status_t ibm_transcribe_session_init(switch_core_session_t *session,
  uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char* bugname, void **ppUserData)
{
  // allocate per-session data structure (pool memory; freed with the session)
  private_t* tech_pvt = (private_t *) switch_core_session_alloc(session, sizeof(private_t));
  if (!tech_pvt) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "error allocating memory!\n");
    return SWITCH_STATUS_FALSE;
  }

  // IBM's endpoint is fed 16 kHz audio; resampling is configured inside
  // fork_data_init when the codec rate differs.
  if (SWITCH_STATUS_SUCCESS != fork_data_init(tech_pvt, session, samples_per_second, 16000, channels, lang, interim, bugname)) {
    destroy_tech_pvt(tech_pvt);
    return SWITCH_STATUS_FALSE;
  }

  *ppUserData = tech_pvt;

  ibm::AudioPipe *pAudioPipe = static_cast<ibm::AudioPipe *>(tech_pvt->pAudioPipe);
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connecting now\n");
  pAudioPipe->connect();
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection in progress\n");
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// Stops a transcription session: removes the media bug (unless the channel
// is already closing), asks the AudioPipe for the final transcript, and
// releases per-call resources.
switch_status_t ibm_transcribe_session_stop(switch_core_session_t *session,int channelIsClosing, char* bugname) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  // NOTE(review): lookup uses MY_BUG_NAME while the removal below uses the
  // bugname argument — confirm these are always the same value.
  switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, MY_BUG_NAME);
  if (!bug) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "ibm_transcribe_session_stop: no bug - websocket conection already closed\n");
    return SWITCH_STATUS_FALSE;
  }
  private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
  // Fix: the original read tech_pvt->id (and logged it) BEFORE the null
  // check on tech_pvt.
  if (!tech_pvt) return SWITCH_STATUS_FALSE;
  uint32_t id = tech_pvt->id;

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%u) ibm_transcribe_session_stop\n", id);

  // close connection and get final responses
  switch_mutex_lock(tech_pvt->mutex);
  switch_channel_set_private(channel, bugname, NULL);
  if (!channelIsClosing) switch_core_media_bug_remove(session, &bug);

  ibm::AudioPipe *pAudioPipe = static_cast<ibm::AudioPipe *>(tech_pvt->pAudioPipe);
  if (pAudioPipe) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%u) ibm_transcribe_session_stop, send stop request to get final transcript\n", id);
    // Detach before finish(): AudioPipe's own lifecycle handles destruction
    // once the far end closes.
    pAudioPipe->finish();
    tech_pvt->pAudioPipe = nullptr;
  }
  else {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%u) ibm_transcribe_session_stop, null audiopipe\n", id);
  }
  destroy_tech_pvt(tech_pvt);
  switch_mutex_unlock(tech_pvt->mutex);
  switch_mutex_destroy(tech_pvt->mutex);
  tech_pvt->mutex = nullptr;
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%u) ibm_transcribe_session_stop exiting\n", id);
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/**
 * Media-bug READ callback worker: drains audio frames from the media bug
 * and appends them to the websocket audio pipe's binary buffer.
 *
 * When no resampler is configured, frames are read directly into the pipe's
 * write area; otherwise each frame is passed through the speex resampler
 * first.  If the pipe buffer is nearly full, packets are dropped and a
 * TRANSCRIBE_EVENT_BUFFER_OVERRUN event is raised (once per overrun episode,
 * gated by tech_pvt->buffer_overrun_notified).
 *
 * Always returns SWITCH_TRUE so the media bug stays attached.
 *
 * Changes vs. original: removed unused locals `inuse`, `p`, and `dirty`
 * (`dirty` was assigned but never read).
 */
switch_bool_t ibm_transcribe_frame(switch_core_session_t *session, switch_media_bug_t *bug) {
  private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);

  if (!tech_pvt) return SWITCH_TRUE;

  /* Non-blocking lock: if teardown holds the mutex we skip this batch of
     frames rather than stall the media thread. */
  if (switch_mutex_trylock(tech_pvt->mutex) == SWITCH_STATUS_SUCCESS) {
    if (!tech_pvt->pAudioPipe) {
      switch_mutex_unlock(tech_pvt->mutex);
      return SWITCH_TRUE;
    }
    ibm::AudioPipe *pAudioPipe = static_cast<ibm::AudioPipe *>(tech_pvt->pAudioPipe);
    if (pAudioPipe->getLwsState() != ibm::AudioPipe::LWS_CLIENT_CONNECTED) {
      /* websocket not (yet) connected; discard nothing, just try later */
      switch_mutex_unlock(tech_pvt->mutex);
      return SWITCH_TRUE;
    }

    pAudioPipe->lockAudioBuffer();
    size_t available = pAudioPipe->binarySpaceAvailable();
    if (NULL == tech_pvt->resampler) {
      /* No resampling needed: read frames straight into the pipe buffer. */
      switch_frame_t frame = { 0 };
      frame.data = pAudioPipe->binaryWritePtr();
      frame.buflen = available;
      while (true) {

        // check if buffer would be overwritten; dump packets if so
        if (available < pAudioPipe->binaryMinSpace()) {
          if (!tech_pvt->buffer_overrun_notified) {
            tech_pvt->buffer_overrun_notified = 1;
            responseHandler(session, TRANSCRIBE_EVENT_BUFFER_OVERRUN, NULL, tech_pvt->bugname, 0);
          }
          switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%u) dropping packets!\n",
            tech_pvt->id);
          pAudioPipe->binaryWritePtrResetToZero();

          frame.data = pAudioPipe->binaryWritePtr();
          frame.buflen = available = pAudioPipe->binarySpaceAvailable();
        }

        switch_status_t rv = switch_core_media_bug_read(bug, &frame, SWITCH_TRUE);
        if (rv != SWITCH_STATUS_SUCCESS) break;
        if (frame.datalen) {
          pAudioPipe->binaryWritePtrAdd(frame.datalen);
          frame.buflen = available = pAudioPipe->binarySpaceAvailable();
          frame.data = pAudioPipe->binaryWritePtr();
        }
      }
    }
    else {
      /* Resample each frame into the pipe buffer via speex. */
      uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
      switch_frame_t frame = { 0 };
      frame.data = data;
      frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;
      while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS) {
        if (frame.datalen) {
          spx_uint32_t out_len = available >> 1; // space for samples which are 2 bytes
          spx_uint32_t in_len = frame.samples;

          speex_resampler_process_interleaved_int(tech_pvt->resampler,
            (const spx_int16_t *) frame.data,
            (spx_uint32_t *) &in_len,
            (spx_int16_t *) ((char *) pAudioPipe->binaryWritePtr()),
            &out_len);

          if (out_len > 0) {
            // bytes written = (num samples) * (2 bytes per sample) * (num channels)
            size_t bytes_written = out_len * 2 * tech_pvt->channels;
            pAudioPipe->binaryWritePtrAdd(bytes_written);
            available = pAudioPipe->binarySpaceAvailable();
          }
          if (available < pAudioPipe->binaryMinSpace()) {
            if (!tech_pvt->buffer_overrun_notified) {
              tech_pvt->buffer_overrun_notified = 1;
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%u) dropping packets!\n",
                tech_pvt->id);
              responseHandler(session, TRANSCRIBE_EVENT_BUFFER_OVERRUN, NULL, tech_pvt->bugname, 0);
            }
            break;
          }
        }
      }
    }

    pAudioPipe->unlockAudioBuffer();
    switch_mutex_unlock(tech_pvt->mutex);
  }
  return SWITCH_TRUE;
}
|
||||
}
|
||||
11
mod_ibm_transcribe/ibm_transcribe_glue.h
Normal file
11
mod_ibm_transcribe/ibm_transcribe_glue.h
Normal file
@@ -0,0 +1,11 @@
|
||||
#ifndef __IBM_GLUE_H__
|
||||
#define __IBM_GLUE_H__
|
||||
|
||||
switch_status_t ibm_transcribe_init();
|
||||
switch_status_t ibm_transcribe_cleanup();
|
||||
switch_status_t ibm_transcribe_session_init(switch_core_session_t *session,
|
||||
uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char* bugname, void **ppUserData);
|
||||
switch_status_t ibm_transcribe_session_stop(switch_core_session_t *session, int channelIsClosing, char* bugname);
|
||||
switch_bool_t ibm_transcribe_frame(switch_core_session_t *session, switch_media_bug_t *bug);
|
||||
|
||||
#endif
|
||||
223
mod_ibm_transcribe/mod_ibm_transcribe.c
Normal file
223
mod_ibm_transcribe/mod_ibm_transcribe.c
Normal file
@@ -0,0 +1,223 @@
|
||||
/*
|
||||
*
|
||||
* mod_ibm_transcribe.c -- Freeswitch module for using ibm streaming transcribe api
|
||||
*
|
||||
*/
|
||||
#include "mod_ibm_transcribe.h"
|
||||
#include "ibm_transcribe_glue.h"
|
||||
|
||||
/* Prototypes */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_ibm_transcribe_shutdown);
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_ibm_transcribe_load);
|
||||
|
||||
SWITCH_MODULE_DEFINITION(mod_ibm_transcribe, mod_ibm_transcribe_load, mod_ibm_transcribe_shutdown, NULL);
|
||||
|
||||
static switch_status_t do_stop(switch_core_session_t *session, char* bugname);
|
||||
|
||||
/**
 * Media bug lifecycle callback for the transcribe bug.
 *
 * INIT:  logs only.
 * CLOSE: stops the transcription session (channelIsClosing = 1).
 * READ:  hands captured frames to the glue layer; its return value
 *        controls whether the bug stays attached.
 *
 * Fixes vs. original: removed an unreachable `break` after `return`, and
 * guard against a NULL user-data pointer before dereferencing tech_pvt.
 */
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
{
	switch_core_session_t *session = switch_core_media_bug_get_session(bug);

	switch (type) {
	case SWITCH_ABC_TYPE_INIT:
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_INIT.\n");
		break;

	case SWITCH_ABC_TYPE_CLOSE:
		{
			private_t *tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_CLOSE.\n");

			/* tech_pvt should always be set, but never deref NULL on teardown */
			if (tech_pvt) {
				ibm_transcribe_session_stop(session, 1, tech_pvt->bugname);
			}
			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Finished SWITCH_ABC_TYPE_CLOSE.\n");
		}
		break;

	case SWITCH_ABC_TYPE_READ:
		return ibm_transcribe_frame(session, bug);

	case SWITCH_ABC_TYPE_WRITE:
	default:
		break;
	}

	return SWITCH_TRUE;
}
|
||||
|
||||
/**
 * Initialize a transcription session and attach the media bug that feeds
 * audio to it.  Any previously attached transcribe bug is stopped first.
 *
 * @param flags    media bug flags; SMBF_STEREO selects 2-channel capture
 * @param lang     language code for the transcription service
 * @param interim  non-zero to request interim results
 * @param bugname  identifier passed through to the glue layer
 * @return SWITCH_STATUS_SUCCESS on success, otherwise a failure status
 */
static switch_status_t start_capture(switch_core_session_t *session, switch_media_bug_flag_t flags,
	char* lang, int interim, char* bugname)
{
	switch_channel_t *channel = switch_core_session_get_channel(session);
	switch_media_bug_t *bug;
	switch_status_t status;
	switch_codec_implementation_t read_impl = { 0 };
	void *pUserData;
	uint32_t samples_per_second;

	/* NOTE(review): the lookup uses the fixed MY_BUG_NAME key while a
	   caller-supplied bugname is accepted -- confirm this is intended */
	if (switch_channel_get_private(channel, MY_BUG_NAME)) {
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing bug from previous transcribe\n");
		do_stop(session, bugname);
	}

	switch_core_session_get_read_impl(session, &read_impl);

	if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
		return SWITCH_STATUS_FALSE;
	}

	/* g722 is the one codec where the nominal and actual rates differ */
	samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;

	if (SWITCH_STATUS_FALSE == ibm_transcribe_session_init(session, samples_per_second, flags & SMBF_STEREO ? 2 : 1, lang, interim, bugname, &pUserData)) {
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing ibm speech session.\n");
		return SWITCH_STATUS_FALSE;
	}
	if ((status = switch_core_media_bug_add(session, "ibm_transcribe", NULL, capture_callback, pUserData, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
		/* Previously failed silently; log so operators can tell why
		   transcription never started.
		   NOTE(review): the session state just created in pUserData is not
		   released on this path -- confirm whether the glue layer exposes a
		   cleanup entry point for a session whose bug was never attached. */
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error adding media bug for ibm transcribe.\n");
		return status;
	}
	switch_channel_set_private(channel, MY_BUG_NAME, bug);
	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "added media bug for ibm transcribe\n");

	return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/**
 * Stop an active transcription on the session, if one is running.
 * Returns SWITCH_STATUS_SUCCESS when no bug is attached (nothing to stop).
 *
 * Fix vs. original: duplicated word in the INFO log message
 * ("command command" -> "command").
 */
static switch_status_t do_stop(switch_core_session_t *session, char* bugname)
{
	switch_status_t status = SWITCH_STATUS_SUCCESS;

	switch_channel_t *channel = switch_core_session_get_channel(session);
	/* NOTE(review): lookup uses the fixed MY_BUG_NAME key, not bugname --
	   confirm whether multiple concurrent bug names are meant to be supported */
	switch_media_bug_t *bug = switch_channel_get_private(channel, MY_BUG_NAME);

	if (bug) {
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Received user command to stop transcribe.\n");
		status = ibm_transcribe_session_stop(session, 0, bugname);
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "stopped transcribe.\n");
	}

	return status;
}
|
||||
|
||||
#define TRANSCRIBE_API_SYNTAX "<uuid> [start|stop] lang-code [interim] [stereo|mono]"
|
||||
SWITCH_STANDARD_API(ibm_transcribe_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[6] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
switch_media_bug_flag_t flags = SMBF_READ_STREAM /* | SMBF_WRITE_STREAM | SMBF_READ_PING */;
|
||||
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) ||
|
||||
(!strcasecmp(argv[1], "stop") && argc < 2) ||
|
||||
(!strcasecmp(argv[1], "start") && argc < 3) ||
|
||||
zstr(argv[0])) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
if (!strcasecmp(argv[1], "stop")) {
|
||||
char *bugname = argc > 2 ? argv[2] : MY_BUG_NAME;
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "stop transcribing\n");
|
||||
status = do_stop(lsession, bugname);
|
||||
} else if (!strcasecmp(argv[1], "start")) {
|
||||
char* lang = argv[2];
|
||||
int interim = argc > 3 && !strcmp(argv[3], "interim");
|
||||
char *bugname = argc > 5 ? argv[5] : MY_BUG_NAME;
|
||||
if (argc > 4 && !strcmp(argv[4], "stereo")) {
|
||||
flags |= SMBF_WRITE_STREAM ;
|
||||
flags |= SMBF_STEREO;
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "start transcribing %s %s\n", lang, interim ? "interim": "complete");
|
||||
status = start_capture(lsession, flags, lang, interim, bugname);
|
||||
}
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_ibm_transcribe_load)
|
||||
{
|
||||
switch_api_interface_t *api_interface;
|
||||
|
||||
/* create/register custom event message type */
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_RESULTS) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_RESULTS);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_NO_AUDIO_DETECTED) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_NO_AUDIO_DETECTED);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_VAD_DETECTED) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_VAD_DETECTED);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_CONNECT_SUCCESS) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_CONNECT_SUCCESS);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_CONNECT_FAIL) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_CONNECT_FAIL);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_BUFFER_OVERRUN) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_BUFFER_OVERRUN);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_DISCONNECT) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_DISCONNECT);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
|
||||
/* connect my internal structure to the blank pointer passed to me */
|
||||
*module_interface = switch_loadable_module_create_module_interface(pool, modname);
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "IBM Speech Transcription API loading..\n");
|
||||
|
||||
if (SWITCH_STATUS_FALSE == ibm_transcribe_init()) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Failed initializing ibm speech interface\n");
|
||||
}
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "IBM Speech Transcription API successfully loaded\n");
|
||||
|
||||
SWITCH_ADD_API(api_interface, "uuid_ibm_transcribe", "IBM Speech Transcription API", ibm_transcribe_function, TRANSCRIBE_API_SYNTAX);
|
||||
switch_console_set_complete("add uuid_ibm_transcribe start lang-code [interim|final] [stereo|mono]");
|
||||
switch_console_set_complete("add uuid_ibm_transcribe stop ");
|
||||
|
||||
/* indicate that the module should continue to be loaded */
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Called when the system shuts down
|
||||
Macro expands to: switch_status_t mod_ibm_transcribe_shutdown() */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_ibm_transcribe_shutdown)
|
||||
{
|
||||
ibm_transcribe_cleanup();
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_NO_AUDIO_DETECTED);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_VAD_DETECTED);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_CONNECT_SUCCESS);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_CONNECT_FAIL);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_BUFFER_OVERRUN);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_DISCONNECT);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user