eliminate support for multiple lws threads as part of fixing valgrind errors

Signed-off-by: Dave Horton <daveh@beachdognet.com>
This commit is contained in:
Dave Horton
2023-12-26 10:57:15 -05:00
parent a2324972eb
commit 420e51eac7
140 changed files with 19851 additions and 0 deletions

View File

@@ -0,0 +1,8 @@
Copyright 2023, Drachtio Communications Services, LLC
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@@ -0,0 +1,9 @@
include $(top_srcdir)/build/modmake.rulesam
MODNAME=mod_assemblyai_transcribe
mod_LTLIBRARIES = mod_assemblyai_transcribe.la
mod_assemblyai_transcribe_la_SOURCES = mod_assemblyai_transcribe.c aai_transcribe_glue.cpp audio_pipe.cpp parser.cpp
mod_assemblyai_transcribe_la_CFLAGS = $(AM_CFLAGS)
mod_assemblyai_transcribe_la_CXXFLAGS = $(AM_CXXFLAGS) -std=c++11
mod_assemblyai_transcribe_la_LIBADD = $(switch_builddir)/libfreeswitch.la
mod_assemblyai_transcribe_la_LDFLAGS = -avoid-version -module -no-undefined -shared `pkg-config --libs libwebsockets`

View File

@@ -0,0 +1,96 @@
# mod_deepgram_transcribe
A Freeswitch module that generates real-time transcriptions on a Freeswitch channel by using Deepgram's streaming transcription API
## API
### Commands
The freeswitch module exposes the following API commands:
```
uuid_deepgram_transcribe <uuid> start <lang-code> [interim]
```
Attaches media bug to channel and performs streaming recognize request.
- `uuid` - unique identifier of Freeswitch channel
- `lang-code` - a valid Deepgram [language code](https://developers.deepgram.com/documentation/features/language/) that is supported for streaming transcription
- `interim` - If the 'interim' keyword is present then both interim and final transcription results will be returned; otherwise only final transcriptions will be returned
```
uuid_deepgram_transcribe <uuid> stop
```
Stop transcription on the channel.
### Channel Variables
| variable | Description |
| --- | ----------- |
| DEEPGRAM_API_KEY | Deepgram API key used to authenticate |
| DEEPGRAM_SPEECH_TIER | https://developers.deepgram.com/documentation/features/tier/ |
| DEEPGRAM_SPEECH_CUSTOM_MODEL | custom model id |
| DEEPGRAM_SPEECH_MODEL | https://developers.deepgram.com/documentation/features/model/ |
| DEEPGRAM_SPEECH_MODEL_VERSION | https://developers.deepgram.com/documentation/features/version/ |
| DEEPGRAM_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION | https://developers.deepgram.com/documentation/features/punctuate/ |
| DEEPGRAM_SPEECH_PROFANITY_FILTER | https://developers.deepgram.com/documentation/features/profanity-filter/ |
| DEEPGRAM_SPEECH_REDACT | https://developers.deepgram.com/documentation/features/redact/ |
| DEEPGRAM_SPEECH_DIARIZE | https://developers.deepgram.com/documentation/features/diarize/ |
| DEEPGRAM_SPEECH_DIARIZE_VERSION | https://developers.deepgram.com/documentation/features/diarize/ |
| DEEPGRAM_SPEECH_NER | https://developers.deepgram.com/documentation/features/named-entity-recognition/ |
| DEEPGRAM_SPEECH_ALTERNATIVES | number of alternative hypotheses to return (default: 1) |
| DEEPGRAM_SPEECH_NUMERALS | https://developers.deepgram.com/documentation/features/numerals/ |
| DEEPGRAM_SPEECH_SEARCH | https://developers.deepgram.com/documentation/features/search/ |
| DEEPGRAM_SPEECH_KEYWORDS | https://developers.deepgram.com/documentation/features/keywords/ |
| DEEPGRAM_SPEECH_REPLACE | https://developers.deepgram.com/documentation/features/replace/ |
| DEEPGRAM_SPEECH_TAG | https://developers.deepgram.com/documentation/features/tag/ |
| DEEPGRAM_SPEECH_ENDPOINTING | https://developers.deepgram.com/documentation/features/endpointing/ |
| DEEPGRAM_SPEECH_VAD_TURNOFF | https://developers.deepgram.com/documentation/features/voice-activity-detection/ |
### Events
`deepgram_transcribe::transcription` - returns an interim or final transcription. The event contains a JSON body describing the transcription result:
```js
{
"channel_index": [0, 1],
"duration": 4.59,
"start": 0.0,
"is_final": true,
"speech_final": true,
"channel": {
"alternatives": [{
"transcript": "hello hello hello",
"confidence": 0.98583984,
"words": [{
"word": "hello",
"start": 3.0865219,
"end": 3.206,
"confidence": 0.99902344
}, {
"word": "hello",
"start": 3.5644348,
"end": 3.644087,
"confidence": 0.9741211
}, {
"word": "hello",
"start": 4.042348,
"end": 4.3609567,
"confidence": 0.98583984
}]
}]
},
"metadata": {
"request_id": "37835678-5d3b-4c77-910e-f8914c882cec",
"model_info": {
"name": "conversationalai",
"version": "2021-11-10.1",
"tier": "base"
},
"model_uuid": "6b28e919-8427-4f32-9847-492e2efd7daf"
}
}
```
## Usage
When using [drachtio-fsmrf](https://www.npmjs.com/package/drachtio-fsmrf), you can access this API command via the api method on the 'endpoint' object.
```js
ep.api('uuid_deepgram_transcribe', `${ep.uuid} start en-US interim`);
```

View File

@@ -0,0 +1,432 @@
#include <switch.h>
#include <switch_json.h>
#include <string.h>
#include <cctype>
#include <cstdlib>
#include <iomanip>
#include <string>
#include <mutex>
#include <thread>
#include <list>
#include <algorithm>
#include <functional>
#include <cassert>
#include <fstream>
#include <sstream>
#include <regex>
#include "mod_assemblyai_transcribe.h"
#include "simple_buffer.h"
#include "parser.hpp"
#include "audio_pipe.hpp"
#define RTP_PACKETIZATION_PERIOD 20
#define FRAME_SIZE_8000 320 /*which means each 20ms frame as 320 bytes at 8 khz (1 channel only)*/
namespace {
static bool hasDefaultCredentials = false;
static const char* defaultApiKey = nullptr;
static const char *requestedBufferSecs = std::getenv("MOD_AUDIO_FORK_BUFFER_SECS");
static int nAudioBufferSecs = std::max(1, std::min(requestedBufferSecs ? ::atoi(requestedBufferSecs) : 2, 5));
static const char *requestedNumServiceThreads = std::getenv("MOD_AUDIO_FORK_SERVICE_THREADS");
static unsigned int idxCallCount = 0;
static uint32_t playCount = 0;
// Hand the AudioPipe off to a detached thread that performs a graceful
// close (finish + waitForClose) without blocking the caller.  Ownership of
// the pipe transfers to the shared_ptr captured by the lambda; it is
// deleted when the thread exits.
static void reaper(private_t *tech_pvt) {
  std::shared_ptr<assemblyai::AudioPipe> pAp;
  pAp.reset((assemblyai::AudioPipe *)tech_pvt->pAudioPipe);
  tech_pvt->pAudioPipe = nullptr;

  // BUG FIX: the detached thread previously captured tech_pvt and logged
  // tech_pvt->sessionId after waitForClose(); tech_pvt is destroyed by the
  // caller (aai_transcribe_session_stop) long before that, a use-after-free.
  // Copy the fields we need by value instead.
  std::string sessionId = tech_pvt->sessionId;
  uint32_t id = tech_pvt->id;

  std::thread t([pAp, sessionId, id]{
    pAp->finish();
    pAp->waitForClose();
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "%s (%u) got remote close\n", sessionId.c_str(), id);
  });
  t.detach();
}
// Release the per-session resources owned by tech_pvt (AudioPipe, resampler).
// The struct itself lives in the session's memory pool and is reclaimed with
// the session, so it is not freed here.
static void destroy_tech_pvt(private_t *tech_pvt) {
  // BUG FIX: the original logged tech_pvt->sessionId *before* the null check,
  // dereferencing a possibly-null pointer.  Check first.
  if (!tech_pvt) return;
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s (%u) destroy_tech_pvt\n", tech_pvt->sessionId, tech_pvt->id);
  if (tech_pvt->pAudioPipe) {
    assemblyai::AudioPipe* p = (assemblyai::AudioPipe *) tech_pvt->pAudioPipe;
    delete p;
    tech_pvt->pAudioPipe = nullptr;
  }
  if (tech_pvt->resampler) {
    speex_resampler_destroy(tech_pvt->resampler);
    tech_pvt->resampler = NULL;
  }
  /*
  if (tech_pvt->vad) {
    switch_vad_destroy(&tech_pvt->vad);
    tech_pvt->vad = nullptr;
  }
  */
}
// Percent-encode a string per JavaScript's encodeURIComponent(): alphanumerics
// and - _ . ! ~ * ' ( ) pass through; every other byte becomes %XX (two
// uppercase hex digits).
//
// BUG FIXES vs the original:
//  - bytes < 0x10 produced a single hex digit (e.g. tab -> "%9" instead of
//    "%09"), yielding malformed percent-encoding; now padded with setw/setfill.
//  - the character class contained the range '*-.', which wrongly whitelisted
//    '+' and ',' (and also ':'), all of which encodeURIComponent must escape.
//  - per-character std::regex matching replaced with a simple table lookup.
std::string encodeURIComponent(std::string decoded)
{
  static const char unreserved[] = "-_.!~*'()";
  std::ostringstream oss;
  for (char c : decoded)
  {
    unsigned char uc = static_cast<unsigned char>(c);
    if (std::isalnum(uc) || (c != '\0' && strchr(unreserved, c)))
    {
      oss << c;
    }
    else
    {
      oss << '%' << std::uppercase << std::setfill('0') << std::setw(2)
          << std::hex << static_cast<int>(uc)
          << std::dec << std::nouppercase;
    }
  }
  return oss.str();
}
// Build the AssemblyAI realtime websocket path, appending an optional
// word_boost parameter from the ASSEMBLYAI_WORD_BOOST channel variable.
// NOTE: channels, language and interim are accepted for signature parity with
// the other transcribe modules but are not used by this endpoint.
std::string& constructPath(switch_core_session_t* session, std::string& path,
  int sampleRate, int channels, const char* language, int interim) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  std::ostringstream oss;

  // FIX: honor the sampleRate argument instead of hard-coding 8000
  // (the caller currently passes 8000, so behavior is unchanged today);
  // also dropped an unused local declaration.
  oss << "v2/realtime/ws?sample_rate=" << sampleRate;

  const char* hints = switch_channel_get_variable(channel, "ASSEMBLYAI_WORD_BOOST");
  if (hints) {
    oss << "&word_boost=";
    oss << encodeURIComponent(hints);
  }
  path = oss.str();
  return path;
}
// Bridge from the AudioPipe back into FreeSWITCH.  Locates the session by
// uuid, finds this module's media bug, and forwards the notification to the
// module's responseHandler as the matching TRANSCRIBE_EVENT_*.
// Runs off the FreeSWITCH media thread (invoked by the AudioPipe), hence the
// locate/rwunlock pairing around all channel access.
static void eventCallback(const char* sessionId, assemblyai::AudioPipe::NotifyEvent_t event, const char* message, bool finished) {
  switch_core_session_t* session = switch_core_session_locate(sessionId);
  if (session) {
    switch_channel_t *channel = switch_core_session_get_channel(session);
    switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, MY_BUG_NAME);
    if (bug) {
      private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
      if (tech_pvt) {
        switch (event) {
          case assemblyai::AudioPipe::CONNECT_SUCCESS:
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "connection successful\n");
            tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_CONNECT_SUCCESS, NULL, tech_pvt->bugname, finished);
            break;
          case assemblyai::AudioPipe::CONNECT_FAIL:
          {
            // first thing: we can no longer access the AudioPipe
            // NOTE(review): message is interpolated into JSON unescaped; if it
            // can contain quotes the payload will be malformed -- confirm.
            std::stringstream json;
            json << "{\"reason\":\"" << message << "\"}";
            tech_pvt->pAudioPipe = nullptr;
            tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_CONNECT_FAIL, (char *) json.str().c_str(), tech_pvt->bugname, finished);
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "connection failed: %s\n", message);
          }
          break;
          case assemblyai::AudioPipe::CONNECTION_DROPPED:
            // first thing: we can no longer access the AudioPipe
            tech_pvt->pAudioPipe = nullptr;
            tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_DISCONNECT, NULL, tech_pvt->bugname, finished);
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection dropped from far end\n");
            break;
          case assemblyai::AudioPipe::CONNECTION_CLOSED_GRACEFULLY:
            // first thing: we can no longer access the AudioPipe
            tech_pvt->pAudioPipe = nullptr;
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection closed gracefully\n");
            break;
          case assemblyai::AudioPipe::MESSAGE:
          {
            // Classify the incoming JSON by substring match on message_type.
            // NOTE(review): the "error" and "SessionBegins" ifs are not chained
            // with else, so a single message could in principle fire more than
            // one handler -- confirm that is intended.
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "assemblyai message: %s\n", message);
            if (strstr(message, "\"error\":")) {
              tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_ERROR, message, tech_pvt->bugname, finished);
            }
            if (strstr(message, "\"message_type\":\"SessionBegins\"")) {
              tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_SESSION_BEGINS, message, tech_pvt->bugname, finished);
            }
            if (strstr(message, "\"message_type\":\"SessionTerminated\"")) {
              tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_SESSION_TERMINATED, message, tech_pvt->bugname, finished);
            }
            else if (strstr(message, "\"message_type\":\"FinalTranscript\"") || strstr(message, "\"message_type\":\"PartialTranscript\"")) {
              /* discard empty partials */
              if (strstr(message, "\"message_type\":\"PartialTranscript\"") &&
                strstr(message, "\"text\":\"\"") && strstr(message, "\"confidence\":0")) {
                switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "discarding empty partial transcript from assemblyai\n");
                break;
              }
              tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_RESULTS, message, tech_pvt->bugname, finished);
            }
          }
          break;
          default:
            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "got unexpected msg from assemblyai %d:%s\n", event, message);
            break;
        }
      }
    }
    switch_core_session_rwunlock(session);
  }
}
// One-time setup of per-session state: builds the websocket path, resolves the
// api key (channel variable first, then the module default), allocates the
// AudioPipe, and creates a resampler when the channel's native rate differs
// from the rate requested from AssemblyAI.
// Returns SWITCH_STATUS_FALSE when no api key is available or the resampler
// cannot be created; the caller is expected to clean up via destroy_tech_pvt.
switch_status_t fork_data_init(private_t *tech_pvt, switch_core_session_t *session,
  int sampling, int desiredSampling, int channels, char *lang, int interim,
  char* bugname, responseHandler_t responseHandler) {
  int err;
  switch_codec_implementation_t read_impl;
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_core_session_get_read_impl(session, &read_impl);

  // Zero the whole struct; the strncpy calls below rely on this for NUL
  // termination (assumes the destination arrays are at least one byte larger
  // than the MAX_* constants -- TODO confirm in mod_assemblyai_transcribe.h).
  memset(tech_pvt, 0, sizeof(private_t));

  std::string path;
  constructPath(session, path, desiredSampling, channels, lang, interim);
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "path: %s\n", path.c_str());

  strncpy(tech_pvt->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
  strncpy(tech_pvt->host, "api.assemblyai.com", MAX_WS_URL_LEN);
  tech_pvt->port = 443;
  strncpy(tech_pvt->path, path.c_str(), MAX_PATH_LEN);
  tech_pvt->sampling = desiredSampling;
  tech_pvt->responseHandler = responseHandler;
  tech_pvt->channels = channels;
  tech_pvt->id = ++idxCallCount;
  tech_pvt->buffer_overrun_notified = 0;
  // NOTE(review): the bugname parameter is never stored; tech_pvt->bugname
  // stays all-zero from the memset above even though eventCallback passes it
  // to the responseHandler -- confirm whether it should be copied here.

  // audio buffer sized for nAudioBufferSecs of audio at the target rate,
  // with LWS_PRE headroom required by libwebsockets
  size_t buflen = LWS_PRE + (FRAME_SIZE_8000 * desiredSampling / 8000 * channels * 1000 / RTP_PACKETIZATION_PERIOD * nAudioBufferSecs);

  // per-channel api key wins; otherwise fall back to the module-level default
  const char* apiKey = switch_channel_get_variable(channel, "ASSEMBLYAI_API_KEY");
  if (!apiKey && defaultApiKey) apiKey = defaultApiKey;
  else if (!apiKey) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "no assemblyai api key provided\n");
    return SWITCH_STATUS_FALSE;
  }

  assemblyai::AudioPipe* ap = new assemblyai::AudioPipe(tech_pvt->sessionId, tech_pvt->host, tech_pvt->port, tech_pvt->path,
    buflen, read_impl.decoded_bytes_per_packet, apiKey, eventCallback);
  // defensive check (operator new throws rather than returning null by default)
  if (!ap) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error allocating AudioPipe\n");
    return SWITCH_STATUS_FALSE;
  }
  tech_pvt->pAudioPipe = static_cast<void *>(ap);

  switch_mutex_init(&tech_pvt->mutex, SWITCH_MUTEX_NESTED, switch_core_session_get_pool(session));

  if (desiredSampling != sampling) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) resampling from %u to %u\n", tech_pvt->id, sampling, desiredSampling);
    tech_pvt->resampler = speex_resampler_init(channels, sampling, desiredSampling, SWITCH_RESAMPLE_QUALITY, &err);
    if (0 != err) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing resampler: %s.\n", speex_resampler_strerror(err));
      return SWITCH_STATUS_FALSE;
    }
  }
  else {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) no resampling needed for this call\n", tech_pvt->id);
  }
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) fork_data_init\n", tech_pvt->id);
  return SWITCH_STATUS_SUCCESS;
}
// Adapter passed to AudioPipe::initialize: maps libwebsockets log levels to
// the closest FreeSWITCH level and forwards the line to the FS logger.
void lws_logger(int level, const char *line) {
  switch_log_level_t llevel = SWITCH_LOG_DEBUG;
  switch (level) {
    case LLL_ERR: llevel = SWITCH_LOG_ERROR; break;
    case LLL_WARN: llevel = SWITCH_LOG_WARNING; break;
    case LLL_NOTICE: llevel = SWITCH_LOG_NOTICE; break;
    case LLL_INFO: llevel = SWITCH_LOG_INFO; break;
    default: break;   // everything else stays at SWITCH_LOG_DEBUG
  }
  // BUG FIX: the original computed llevel but then logged at a hard-coded
  // SWITCH_LOG_NOTICE (and had a stray unreachable break in the switch).
  switch_log_printf(SWITCH_CHANNEL_LOG, llevel, "%s\n", line);
}
}
extern "C" {
// One-time module initialization: configure libwebsockets logging, start the
// AudioPipe machinery, and capture a default api key from the environment.
switch_status_t aai_transcribe_init() {
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_assemblyai_transcribe: audio buffer (in secs): %d secs\n", nAudioBufferSecs);

  // BUG FIX: the original used a logical OR (||) between LLL_NOTICE and
  // LLL_INFO, which collapsed the whole expression to the integer 1
  // (== LLL_ERR), silently disabling every other lws log category.
  int logs = LLL_ERR | LLL_WARN | LLL_NOTICE | LLL_INFO | LLL_PARSER | LLL_HEADER | LLL_EXT | LLL_CLIENT | LLL_LATENCY | LLL_DEBUG;
  assemblyai::AudioPipe::initialize(logs, lws_logger);
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "AudioPipe::initialize completed\n");

  // BUG FIX: this is the AssemblyAI module; the default credential must come
  // from ASSEMBLYAI_API_KEY (the original read DEEPGRAM_API_KEY, copied from
  // the deepgram module) to match the ASSEMBLYAI_API_KEY channel variable.
  const char* apiKey = std::getenv("ASSEMBLYAI_API_KEY");
  if (NULL == apiKey) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE,
      "\"ASSEMBLYAI_API_KEY\" env var not set; authentication will expect channel variables of same names to be set\n");
  }
  else {
    hasDefaultCredentials = true;
    defaultApiKey = apiKey;
  }
  return SWITCH_STATUS_SUCCESS;
}
// One-time module teardown: shut down the shared AudioPipe machinery and
// report whether deinitialization succeeded.
switch_status_t aai_transcribe_cleanup() {
  return assemblyai::AudioPipe::deinitialize()
    ? SWITCH_STATUS_SUCCESS
    : SWITCH_STATUS_FALSE;
}
// Create per-session transcription state and start the websocket connect.
// On success *ppUserData receives the private_t that the media-bug callbacks
// will be handed.  The connect itself is asynchronous; the responseHandler is
// told the outcome via eventCallback.
switch_status_t aai_transcribe_session_init(switch_core_session_t *session,
  responseHandler_t responseHandler, uint32_t samples_per_second, uint32_t channels,
  char* lang, int interim, char* bugname, void **ppUserData)
{
  // allocate per-session data structure from the session's memory pool
  // (removed an unused local `int err` from the original)
  private_t* tech_pvt = (private_t *) switch_core_session_alloc(session, sizeof(private_t));
  if (!tech_pvt) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "error allocating memory!\n");
    return SWITCH_STATUS_FALSE;
  }
  // desired sampling is fixed at 8000; constructPath requests that rate
  if (SWITCH_STATUS_SUCCESS != fork_data_init(tech_pvt, session, samples_per_second, 8000, channels, lang, interim, bugname, responseHandler)) {
    destroy_tech_pvt(tech_pvt);
    return SWITCH_STATUS_FALSE;
  }
  *ppUserData = tech_pvt;

  assemblyai::AudioPipe *pAudioPipe = static_cast<assemblyai::AudioPipe *>(tech_pvt->pAudioPipe);
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connecting now\n");
  pAudioPipe->connect();
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "connection in progress\n");
  return SWITCH_STATUS_SUCCESS;
}
// Stop transcription on the channel: detach the media bug, hand the websocket
// to the reaper thread for an orderly close, and free per-session resources.
switch_status_t aai_transcribe_session_stop(switch_core_session_t *session,int channelIsClosing, char* bugname) {
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, MY_BUG_NAME);
  if (!bug) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "aai_transcribe_session_stop: no bug - websocket conection already closed\n");
    return SWITCH_STATUS_FALSE;
  }
  private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);

  // BUG FIX: the original read tech_pvt->id *before* this null check
  if (!tech_pvt) return SWITCH_STATUS_FALSE;
  uint32_t id = tech_pvt->id;
  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) aai_transcribe_session_stop\n", id);

  // close connection and get final responses
  switch_mutex_lock(tech_pvt->mutex);
  // NOTE(review): the bug was looked up under MY_BUG_NAME but cleared under
  // bugname here -- confirm the two are always the same value.
  switch_channel_set_private(channel, bugname, NULL);
  if (!channelIsClosing) switch_core_media_bug_remove(session, &bug);

  assemblyai::AudioPipe *pAudioPipe = static_cast<assemblyai::AudioPipe *>(tech_pvt->pAudioPipe);
  if (pAudioPipe) {
    //TODO: I think here we should call a method on pAudioPipe to send a terminate session message to assemblyai
    //see: https://www.assemblyai.com/docs/guides/real-time-streaming-transcription#terminating-a-session
    reaper(tech_pvt);   // takes ownership of the pipe and nulls tech_pvt->pAudioPipe
  }
  destroy_tech_pvt(tech_pvt);
  switch_mutex_unlock(tech_pvt->mutex);
  switch_mutex_destroy(tech_pvt->mutex);
  tech_pvt->mutex = nullptr;

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%u) aai_transcribe_session_stop\n", id);
  return SWITCH_STATUS_SUCCESS;
}
// Media-bug read callback: drains available audio from the bug and appends it
// to the AudioPipe's binary buffer, resampling when the channel's native rate
// differs from the target.  Uses trylock so the RTP thread never blocks behind
// a session teardown holding the mutex; a missed tick just waits for the next
// frame.  Always returns SWITCH_TRUE so the bug stays attached.
switch_bool_t aai_transcribe_frame(switch_core_session_t *session, switch_media_bug_t *bug) {
  private_t* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
  size_t inuse = 0;
  bool dirty = false;
  // NOTE(review): p is never used in this function -- candidate for removal
  char *p = (char *) "{\"msg\": \"buffer overrun\"}";

  if (!tech_pvt) return SWITCH_TRUE;

  if (switch_mutex_trylock(tech_pvt->mutex) == SWITCH_STATUS_SUCCESS) {
    // pAudioPipe is nulled by eventCallback on connect-fail/drop; nothing to do
    if (!tech_pvt->pAudioPipe) {
      switch_mutex_unlock(tech_pvt->mutex);
      return SWITCH_TRUE;
    }
    assemblyai::AudioPipe *pAudioPipe = static_cast<assemblyai::AudioPipe *>(tech_pvt->pAudioPipe);
    // discard audio until the websocket is actually connected
    if (pAudioPipe->getLwsState() != assemblyai::AudioPipe::LWS_CLIENT_CONNECTED) {
      switch_mutex_unlock(tech_pvt->mutex);
      return SWITCH_TRUE;
    }

    pAudioPipe->lockAudioBuffer();
    size_t available = pAudioPipe->binarySpaceAvailable();
    if (NULL == tech_pvt->resampler) {
      // Fast path: channel rate already matches -- read straight into the
      // pipe's buffer, no intermediate copy.
      switch_frame_t frame = { 0 };
      frame.data = pAudioPipe->binaryWritePtr();
      frame.buflen = available;
      while (true) {
        // check if buffer would be overwritten; dump packets if so
        if (available < pAudioPipe->binaryMinSpace()) {
          if (!tech_pvt->buffer_overrun_notified) {
            tech_pvt->buffer_overrun_notified = 1;
            tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_BUFFER_OVERRUN, NULL, tech_pvt->bugname, 0);
          }
          switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%u) dropping packets!\n",
            tech_pvt->id);
          // drop all buffered-but-unsent audio and start over
          pAudioPipe->binaryWritePtrResetToZero();
          frame.data = pAudioPipe->binaryWritePtr();
          frame.buflen = available = pAudioPipe->binarySpaceAvailable();
        }
        switch_status_t rv = switch_core_media_bug_read(bug, &frame, SWITCH_TRUE);
        if (rv != SWITCH_STATUS_SUCCESS) break;   // bug drained
        if (frame.datalen) {
          pAudioPipe->binaryWritePtrAdd(frame.datalen);
          frame.buflen = available = pAudioPipe->binarySpaceAvailable();
          frame.data = pAudioPipe->binaryWritePtr();
          dirty = true;
        }
      }
    }
    else {
      // Resampling path: read into a stack buffer, resample into the pipe.
      uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
      switch_frame_t frame = { 0 };
      frame.data = data;
      frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;
      while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS) {
        if (frame.datalen) {
          // NOTE(review): out_len is derived from bytes available / 2 without
          // dividing by channel count -- confirm this cannot overrun the
          // buffer for multi-channel audio.
          spx_uint32_t out_len = available >> 1;  // space for samples which are 2 bytes
          spx_uint32_t in_len = frame.samples;
          speex_resampler_process_interleaved_int(tech_pvt->resampler,
            (const spx_int16_t *) frame.data,
            (spx_uint32_t *) &in_len,
            (spx_int16_t *) ((char *) pAudioPipe->binaryWritePtr()),
            &out_len);
          if (out_len > 0) {
            // bytes written = num samples * 2 * num channels
            // NOTE(review): `out_len << channels` equals 2*channels only for
            // 1 or 2 channels -- correct for mono/stereo, wrong beyond that.
            size_t bytes_written = out_len << tech_pvt->channels;
            pAudioPipe->binaryWritePtrAdd(bytes_written);
            available = pAudioPipe->binarySpaceAvailable();
            dirty = true;
          }
          if (available < pAudioPipe->binaryMinSpace()) {
            if (!tech_pvt->buffer_overrun_notified) {
              tech_pvt->buffer_overrun_notified = 1;
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%u) dropping packets!\n",
                tech_pvt->id);
              tech_pvt->responseHandler(session, TRANSCRIBE_EVENT_BUFFER_OVERRUN, NULL, tech_pvt->bugname, 0);
            }
            break;
          }
        }
      }
    }

    pAudioPipe->unlockAudioBuffer();
    switch_mutex_unlock(tech_pvt->mutex);
  }
  return SWITCH_TRUE;
}
}

View File

@@ -0,0 +1,11 @@
#ifndef __AAI_GLUE_H__
#define __AAI_GLUE_H__

/* Glue layer between mod_assemblyai_transcribe.c and the C++ AudioPipe
 * implementation.  NOTE: relies on switch.h and the module's own header
 * (which declares responseHandler_t) being included first.
 *
 * FIX: include guard renamed from __DG_GLUE_H__, a copy-paste left over from
 * the deepgram module, to avoid colliding with that module's guard.
 */

/* one-time module init: configures lws logging and reads default credentials */
switch_status_t aai_transcribe_init();

/* one-time module teardown; SWITCH_STATUS_FALSE if AudioPipe deinit failed */
switch_status_t aai_transcribe_cleanup();

/* allocate per-session transcription state and begin the websocket connect */
switch_status_t aai_transcribe_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
  uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char* bugname, void **ppUserData);

/* stop transcription and tear down per-session state */
switch_status_t aai_transcribe_session_stop(switch_core_session_t *session, int channelIsClosing, char* bugname);

/* media-bug read callback: feed audio frames to the websocket */
switch_bool_t aai_transcribe_frame(switch_core_session_t *session, switch_media_bug_t *bug);

#endif

View File

@@ -0,0 +1,521 @@
#include <cassert>
#include <iostream>
#include <sstream>
#include "audio_pipe.hpp"
#include "base64.hpp"
/* discard incoming text messages over the socket that are longer than this */
#define MAX_RECV_BUF_SIZE (65 * 1024 * 10)
#define RECV_BUF_REALLOC_SIZE (8 * 1024)
using namespace assemblyai;
namespace {
static const char *requestedTcpKeepaliveSecs = std::getenv("MOD_AUDIO_FORK_TCP_KEEPALIVE_SECS");
static int nTcpKeepaliveSecs = requestedTcpKeepaliveSecs ? ::atoi(requestedTcpKeepaliveSecs) : 55;
}
// Write "Token <apiKey>" (NUL-terminated) into buf.
// Returns 0 on success, 1 if buf (of capacity len) is too small to hold the
// prefix, the key, and the terminator.
static int dch_lws_http_basic_auth_gen(const char *apiKey, char *buf, size_t len) {
  static const char prefix[] = "Token ";
  const size_t keyLen = strlen(apiKey);
  // need sizeof(prefix)-1 prefix chars + keyLen chars + 1 NUL
  if (len < keyLen + sizeof(prefix))
    return 1;
  memcpy(buf, prefix, sizeof(prefix) - 1);
  memcpy(buf + sizeof(prefix) - 1, apiKey, keyLen + 1);
  return 0;
}
int AudioPipe::lws_callback(struct lws *wsi,
enum lws_callback_reasons reason,
void *user, void *in, size_t len) {
struct AudioPipe::lws_per_vhost_data *vhd =
(struct AudioPipe::lws_per_vhost_data *) lws_protocol_vh_priv_get(lws_get_vhost(wsi), lws_get_protocol(wsi));
struct lws_vhost* vhost = lws_get_vhost(wsi);
AudioPipe ** ppAp = (AudioPipe **) user;
switch (reason) {
case LWS_CALLBACK_PROTOCOL_INIT:
vhd = (struct AudioPipe::lws_per_vhost_data *) lws_protocol_vh_priv_zalloc(lws_get_vhost(wsi), lws_get_protocol(wsi), sizeof(struct AudioPipe::lws_per_vhost_data));
vhd->context = lws_get_context(wsi);
vhd->protocol = lws_get_protocol(wsi);
vhd->vhost = lws_get_vhost(wsi);
break;
case LWS_CALLBACK_CLIENT_APPEND_HANDSHAKE_HEADER:
{
AudioPipe* ap = findPendingConnect(wsi);
if (ap) {
std::string apiKey = ap->getApiKey();
unsigned char **p = (unsigned char **)in, *end = (*p) + len;
char b[256];
memset(b, 0, sizeof(b));
strcpy(b, apiKey.c_str());
if (lws_add_http_header_by_token(wsi, WSI_TOKEN_HTTP_AUTHORIZATION, (unsigned char *)b, strlen(b), p, end)) return -1;
}
}
break;
case LWS_CALLBACK_EVENT_WAIT_CANCELLED:
processPendingConnects(vhd);
processPendingDisconnects(vhd);
processPendingWrites();
break;
case LWS_CALLBACK_CLIENT_CONNECTION_ERROR:
{
AudioPipe* ap = findAndRemovePendingConnect(wsi);
int rc = lws_http_client_http_response(wsi);
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CONNECTION_ERROR: %s, response status %d\n", in ? (char *)in : "(null)", rc);
if (ap) {
ap->m_state = LWS_CLIENT_FAILED;
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECT_FAIL, (char *) in, ap->isFinished());
}
else {
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CONNECTION_ERROR unable to find wsi %p..\n", wsi);
}
}
break;
case LWS_CALLBACK_CLIENT_ESTABLISHED:
{
AudioPipe* ap = findAndRemovePendingConnect(wsi);
if (ap) {
*ppAp = ap;
ap->m_vhd = vhd;
ap->m_state = LWS_CLIENT_CONNECTED;
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECT_SUCCESS, NULL, ap->isFinished());
}
else {
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_ESTABLISHED %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
}
}
break;
case LWS_CALLBACK_CLIENT_CLOSED:
{
AudioPipe* ap = *ppAp;
if (!ap) {
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_CLOSED %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
return 0;
}
if (ap->m_state == LWS_CLIENT_DISCONNECTING) {
// closed by us
lwsl_debug("%s socket closed by us\n", ap->m_uuid.c_str());
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECTION_CLOSED_GRACEFULLY, NULL, ap->isFinished());
}
else if (ap->m_state == LWS_CLIENT_CONNECTED) {
// closed by far end
lwsl_info("%s socket closed by far end\n", ap->m_uuid.c_str());
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::CONNECTION_DROPPED, NULL, ap->isFinished());
}
ap->m_state = LWS_CLIENT_DISCONNECTED;
ap->setClosed();
//NB: after receiving any of the events above, any holder of a
//pointer or reference to this object must treat is as no longer valid
//*ppAp = NULL;
//delete ap;
}
break;
case LWS_CALLBACK_CLIENT_RECEIVE:
{
AudioPipe* ap = *ppAp;
if (!ap) {
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
return 0;
}
if (lws_frame_is_binary(wsi)) {
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE received binary frame, discarding.\n");
return 0;
}
if (lws_is_first_fragment(wsi)) {
// allocate a buffer for the entire chunk of memory needed
assert(nullptr == ap->m_recv_buf);
ap->m_recv_buf_len = len + lws_remaining_packet_payload(wsi);
ap->m_recv_buf = (uint8_t*) malloc(ap->m_recv_buf_len);
ap->m_recv_buf_ptr = ap->m_recv_buf;
}
size_t write_offset = ap->m_recv_buf_ptr - ap->m_recv_buf;
size_t remaining_space = ap->m_recv_buf_len - write_offset;
if (remaining_space < len) {
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE buffer realloc needed.\n");
size_t newlen = ap->m_recv_buf_len + RECV_BUF_REALLOC_SIZE;
if (newlen > MAX_RECV_BUF_SIZE) {
free(ap->m_recv_buf);
ap->m_recv_buf = ap->m_recv_buf_ptr = nullptr;
ap->m_recv_buf_len = 0;
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_RECEIVE max buffer exceeded, truncating message.\n");
}
else {
ap->m_recv_buf = (uint8_t*) realloc(ap->m_recv_buf, newlen);
if (nullptr != ap->m_recv_buf) {
ap->m_recv_buf_len = newlen;
ap->m_recv_buf_ptr = ap->m_recv_buf + write_offset;
}
}
}
if (nullptr != ap->m_recv_buf) {
if (len > 0) {
memcpy(ap->m_recv_buf_ptr, in, len);
ap->m_recv_buf_ptr += len;
}
if (lws_is_final_fragment(wsi)) {
if (nullptr != ap->m_recv_buf) {
std::string msg((char *)ap->m_recv_buf, ap->m_recv_buf_ptr - ap->m_recv_buf);
ap->m_callback(ap->m_uuid.c_str(), AudioPipe::MESSAGE, msg.c_str(), ap->isFinished());
if (nullptr != ap->m_recv_buf) free(ap->m_recv_buf);
}
ap->m_recv_buf = ap->m_recv_buf_ptr = nullptr;
ap->m_recv_buf_len = 0;
}
}
}
break;
case LWS_CALLBACK_CLIENT_WRITEABLE:
{
AudioPipe* ap = *ppAp;
if (!ap) {
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_WRITEABLE %s unable to find wsi %p..\n", ap->m_uuid.c_str(), wsi);
return 0;
}
// check for text frames to send
{
std::lock_guard<std::mutex> lk(ap->m_text_mutex);
if (ap->m_metadata.length() > 0) {
uint8_t buf[ap->m_metadata.length() + LWS_PRE];
memcpy(buf + LWS_PRE, ap->m_metadata.c_str(), ap->m_metadata.length());
int n = ap->m_metadata.length();
int m = lws_write(wsi, buf + LWS_PRE, n, LWS_WRITE_TEXT);
ap->m_metadata.clear();
if (m < n) {
return -1;
}
// there may be audio data, but only one write per writeable event
// get it next time
lws_callback_on_writable(wsi);
return 0;
}
}
if (ap->m_state == LWS_CLIENT_DISCONNECTING) {
lws_close_reason(wsi, LWS_CLOSE_STATUS_NORMAL, NULL, 0);
return -1;
}
// check for audio packets
{
std::lock_guard<std::mutex> lk(ap->m_audio_mutex);
//TODO: we need to have at least 100ms buffered which is 5 packets at 320 bytes per packet
if (ap->m_audio_buffer_write_offset > LWS_PRE) {
size_t datalen = ap->m_audio_buffer_write_offset - LWS_PRE;
if (datalen >= 1600) {
std::ostringstream oss;
oss << "{\"audio_data\":\"" << drachtio::base64_encode((unsigned char const *) ap->m_audio_buffer + LWS_PRE, datalen) << "\"}";
std::string result = oss.str();
uint8_t buf[result.length() + LWS_PRE];
memcpy(buf + LWS_PRE,result.c_str(), result.length());
int n = result.length();
int m = lws_write(wsi, buf + LWS_PRE, n, LWS_WRITE_TEXT);
if (m < n) {
lwsl_err("AudioPipe::lws_service_thread LWS_CALLBACK_CLIENT_WRITEABLE attemped to send %lu bytes only sent %d wsi %p..\n",
n, m, wsi);
}
ap->m_audio_buffer_write_offset = LWS_PRE;
}
}
}
return 0;
}
break;
default:
break;
}
return lws_callback_http_dummy(wsi, reason, user, in, len);
}
// static members
// lws retry/keepalive policy: no retry table and UINT16_MAX validity windows,
// which appears to disable lws's automatic retry and ping-validity checks.
static const lws_retry_bo_t retry = {
  nullptr, // retry_ms_table
  0, // retry_ms_table_count
  0, // conceal_count
  UINT16_MAX, // secs_since_valid_ping
  UINT16_MAX, // secs_since_valid_hangup
  0 // jitter_percent
};

// Class-wide state shared by all AudioPipe instances.
struct lws_context *AudioPipe::context = nullptr;     // the single lws context
std::string AudioPipe::protocolName;                  // websocket subprotocol name
std::mutex AudioPipe::mutex_connects;                 // guards pendingConnects
std::mutex AudioPipe::mutex_disconnects;              // guards pendingDisconnects
std::mutex AudioPipe::mutex_writes;                   // guards pendingWrites
std::list<AudioPipe*> AudioPipe::pendingConnects;     // queued for connect_client()
std::list<AudioPipe*> AudioPipe::pendingDisconnects;  // queued for graceful close
std::list<AudioPipe*> AudioPipe::pendingWrites;       // queued for a writeable callback
AudioPipe::log_emit_function AudioPipe::logger;       // host-supplied log sink
std::mutex AudioPipe::mapMutex;
// NOTE(review): stopFlag has no initializer here -- zero-initialized as a
// static, but confirm it is set before the service loop reads it.
bool AudioPipe::stopFlag;
// Runs on the lws service thread after lws_cancel_service(): collect every
// idle pipe and mark it connecting while holding the lock, then kick off the
// client connects outside the lock.
void AudioPipe::processPendingConnects(lws_per_vhost_data *vhd) {
  std::list<AudioPipe*> ready;
  {
    std::lock_guard<std::mutex> guard(mutex_connects);
    for (AudioPipe* pipe : pendingConnects) {
      if (pipe->m_state == LWS_CLIENT_IDLE) {
        pipe->m_state = LWS_CLIENT_CONNECTING;
        ready.push_back(pipe);
      }
    }
  }
  for (AudioPipe* pipe : ready) {
    pipe->connect_client(vhd);
  }
}
// Runs on the lws service thread: drain the disconnect queue and request a
// writeable callback for each pipe so the close handshake can be initiated
// from the WRITEABLE handler.
void AudioPipe::processPendingDisconnects(lws_per_vhost_data *vhd) {
  std::list<AudioPipe*> ready;
  {
    std::lock_guard<std::mutex> guard(mutex_disconnects);
    for (AudioPipe* pipe : pendingDisconnects) {
      if (pipe->m_state == LWS_CLIENT_DISCONNECTING) ready.push_back(pipe);
    }
    pendingDisconnects.clear();
  }
  for (AudioPipe* pipe : ready) {
    lws_callback_on_writable(pipe->m_wsi);
  }
}
void AudioPipe::processPendingWrites() {
std::list<AudioPipe*> writes;
{
std::lock_guard<std::mutex> guard(mutex_writes);
for (auto it = pendingWrites.begin(); it != pendingWrites.end(); ++it) {
if ((*it)->m_state == LWS_CLIENT_CONNECTED) writes.push_back(*it);
}
pendingWrites.clear();
}
for (auto it = writes.begin(); it != writes.end(); ++it) {
AudioPipe* ap = *it;
lws_callback_on_writable(ap->m_wsi);
}
}
// Find the pending connect whose wsi matches (and whose connect is in
// flight), remove it from the list, and return it.  Entries whose wsi was
// never assigned are pruned as they are encountered.  Returns NULL if no
// match is found.
AudioPipe* AudioPipe::findAndRemovePendingConnect(struct lws *wsi) {
  std::lock_guard<std::mutex> guard(mutex_connects);
  AudioPipe* match = NULL;
  std::list<AudioPipe*> stale;
  for (AudioPipe* candidate : pendingConnects) {
    if (candidate->m_wsi == nullptr) stale.push_back(candidate);
    if (candidate->m_state == LWS_CLIENT_CONNECTING && candidate->m_wsi == wsi) {
      match = candidate;
      break;
    }
  }
  for (AudioPipe* dead : stale) pendingConnects.remove(dead);
  if (match) pendingConnects.remove(match);
  return match;
}
// Like findAndRemovePendingConnect but non-destructive: return the pipe whose
// in-flight connection owns `wsi`, or nullptr.
AudioPipe* AudioPipe::findPendingConnect(struct lws *wsi) {
  std::lock_guard<std::mutex> guard(mutex_connects);
  for (AudioPipe* pipe : pendingConnects) {
    if (pipe->m_state == LWS_CLIENT_CONNECTING && pipe->m_wsi == wsi) return pipe;
  }
  return nullptr;
}
// Queue a pipe for connection, then wake the lws event loop so the request
// is serviced promptly (lws_cancel_service interrupts the blocking
// lws_service() call in lws_service_thread).
void AudioPipe::addPendingConnect(AudioPipe* ap) {
  {
    std::lock_guard<std::mutex> guard(mutex_connects);
    pendingConnects.push_back(ap);
    lwsl_debug("%s after adding connect there are %lu pending connects\n",
      ap->m_uuid.c_str(), pendingConnects.size());
  }
  lws_cancel_service(context);
}
// Mark the pipe as disconnecting, queue it, and wake the lws event loop.
// The state change happens before queueing so processPendingDisconnects()
// recognizes the entry.
void AudioPipe::addPendingDisconnect(AudioPipe* ap) {
  ap->m_state = LWS_CLIENT_DISCONNECTING;
  {
    std::lock_guard<std::mutex> guard(mutex_disconnects);
    pendingDisconnects.push_back(ap);
    lwsl_debug("%s after adding disconnect there are %lu pending disconnects\n",
      ap->m_uuid.c_str(), pendingDisconnects.size());
  }
  lws_cancel_service(ap->m_vhd->context);
}
// Queue a pipe that has buffered data to send, and wake the lws event loop
// so processPendingWrites() can request a writable callback.
void AudioPipe::addPendingWrite(AudioPipe* ap) {
  {
    std::lock_guard<std::mutex> guard(mutex_writes);
    pendingWrites.push_back(ap);
  }
  lws_cancel_service(ap->m_vhd->context);
}
// Body of the single lws service thread: create the (client-only) lws
// context, then run the event loop until deinitialize() sets stopFlag or
// lws_service() reports an error.
// NOTE(review): `retry` and `nTcpKeepaliveSecs` are defined elsewhere in
// this file (not visible in this chunk).
bool AudioPipe::lws_service_thread() {
  struct lws_context_creation_info info;

  // One unnamed protocol; per-connection user data is a single pointer
  // (the AudioPipe*), rx buffer size 1024 bytes.
  const struct lws_protocols protocols[] = {
    {
      "",
      AudioPipe::lws_callback,
      sizeof(void *),
      1024,
    },
    { NULL, NULL, 0, 0 }
  };

  memset(&info, 0, sizeof info);
  info.port = CONTEXT_PORT_NO_LISTEN; // client connections only; no listen socket
  info.options = LWS_SERVER_OPTION_DO_SSL_GLOBAL_INIT;
  info.protocols = protocols;
  info.ka_time = nTcpKeepaliveSecs; // tcp keep-alive timer
  info.ka_probes = 4; // number of times to try ka before closing connection
  info.ka_interval = 5; // time between ka's
  info.timeout_secs = 10; // doc says timeout for "various processes involving network roundtrips"
  info.keepalive_timeout = 5; // seconds to allow remote client to hold on to an idle HTTP/1.1 connection
  info.timeout_secs_ah_idle = 10; // secs to allow a client to hold an ah without using it
  info.retry_and_idle_policy = &retry;

  lwsl_notice("AudioPipe::lws_service_thread creating context\n");

  context = lws_create_context(&info);
  if (!context) {
    lwsl_err("AudioPipe::lws_service_thread failed creating context\n");
    return false;
  }

  int n;
  do {
    // Blocks inside lws; lws_cancel_service() (called by the addPending*
    // helpers) wakes it early so queued work is picked up promptly.
    n = lws_service(context, 0);
  } while (n >= 0 && !stopFlag);

  lwsl_notice("AudioPipe::lws_service_thread ending\n");
  lws_context_destroy(context);

  return true;
}
// Configure lws logging and start the single lws service thread.
//
// Fix: stopFlag must be cleared *before* the thread is launched.  The
// original wrote `stopFlag = false` after constructing std::thread, which is
// an unsynchronized write racing with the new thread's read of stopFlag in
// its service loop.
void AudioPipe::initialize(int loglevel, log_emit_function logger) {
  lws_set_log_level(loglevel, logger);
  lwsl_notice("AudioPipe::initialize starting\n");
  std::lock_guard<std::mutex> lock(mapMutex);
  stopFlag = false;
  std::thread t(&AudioPipe::lws_service_thread);
  t.detach();
}
// Ask the service thread to exit after its current lws_service() iteration.
// NOTE(review): stopFlag is a plain bool read by the service thread without
// synchronization; consider std::atomic<bool>.  Also confirm callers wait
// long enough for the detached thread to destroy the lws context.
bool AudioPipe::deinitialize() {
  lwsl_notice("AudioPipe::deinitialize\n");
  std::lock_guard<std::mutex> lock(mapMutex);
  stopFlag = true;
  return true;
}
// instance members
// Construct a pipe bound to one FreeSWITCH session (identified by uuid).
// No network activity happens here; call connect() to start the websocket.
//
// Fix (valgrind): m_sslFlags, m_recv_buf_len and m_logger were left
// uninitialized, producing "use of uninitialised value" reports; all are now
// zero-initialized.  The initializer list is also ordered to match the
// member declaration order in the header (members initialize in declaration
// order regardless of list order), silencing -Wreorder.
AudioPipe::AudioPipe(const char* uuid, const char* host, unsigned int port, const char* path,
  size_t bufLen, size_t minFreespace, const char* apiKey, notifyHandler_t callback) :
  m_state(LWS_CLIENT_IDLE), m_uuid(uuid), m_host(host), m_port(port), m_path(path),
  m_sslFlags(0), m_wsi(nullptr),
  m_audio_buffer_max_len(bufLen), m_audio_buffer_write_offset(LWS_PRE),
  m_audio_buffer_min_freespace(minFreespace),
  m_recv_buf(nullptr), m_recv_buf_ptr(nullptr), m_recv_buf_len(0),
  m_vhd(nullptr), m_callback(callback), m_logger(nullptr), m_apiKey(apiKey),
  m_gracefulShutdown(false), m_finished(false) {

  // LWS_PRE bytes of padding at the front are required by lws for writes.
  m_audio_buffer = new uint8_t[m_audio_buffer_max_len];
}
// Release the audio and receive buffers; delete[] on a null pointer is a
// well-defined no-op, so no guard is needed.
AudioPipe::~AudioPipe() {
  delete [] m_audio_buffer;
  delete [] m_recv_buf;
}
// Queue this pipe for connection; the actual connect happens asynchronously
// on the lws service thread (see processPendingConnects).
void AudioPipe::connect(void) {
  addPendingConnect(this);
}
// Start the actual lws client connection.  Must run on the lws service
// thread (called from processPendingConnects).  Returns true if lws accepted
// the attempt; success/failure is reported later through lws_callback.
bool AudioPipe::connect_client(struct lws_per_vhost_data *vhd) {
  assert(m_audio_buffer != nullptr);
  assert(m_vhd == nullptr);

  struct lws_client_connect_info i;
  memset(&i, 0, sizeof(i));
  i.context = vhd->context;
  i.port = m_port;
  i.address = m_host.c_str();
  i.path = m_path.c_str();
  i.host = i.address;
  i.origin = i.address;
  i.ssl_connection = LCCSCF_USE_SSL;  // always connect over TLS
  //i.protocol = protocolName.c_str();
  i.pwsi = &(m_wsi);

  m_state = LWS_CLIENT_CONNECTING;
  m_vhd = vhd;

  // lws also writes the wsi through i.pwsi; the return value doubles as the
  // success indicator checked below.
  m_wsi = lws_client_connect_via_info(&i);
  lwsl_debug("%s attempting connection, wsi is %p\n", m_uuid.c_str(), m_wsi);

  return nullptr != m_wsi;
}
// Append a text (JSON) payload to the outbound metadata buffer and schedule
// a writable callback.  Silently dropped when not connected.
void AudioPipe::bufferForSending(const char* text) {
  if (m_state != LWS_CLIENT_CONNECTED) return;
  {
    std::lock_guard<std::mutex> lk(m_text_mutex);
    m_metadata.append(text);
  }
  addPendingWrite(this);
}
// Counterpart of lockAudioBuffer(): if the caller wrote any audio past the
// LWS_PRE padding, request a writable callback before releasing the lock.
void AudioPipe::unlockAudioBuffer() {
  if (m_audio_buffer_write_offset > LWS_PRE) addPendingWrite(this);
  m_audio_mutex.unlock();
}
// Request an orderly disconnect; no-op unless currently connected.
void AudioPipe::close() {
  if (m_state != LWS_CLIENT_CONNECTED) return;
  addPendingDisconnect(this);
}
// Ask the remote end to end the transcription session by sending the
// terminate_session message; idempotent, and a no-op if not connected.
void AudioPipe::finish() {
  if (m_finished || m_state != LWS_CLIENT_CONNECTED) return;
  m_finished = true;
  bufferForSending("{\"terminate_session\": true}");
}
// Block the calling thread until setClosed() fulfills m_promise (i.e. the
// websocket has fully closed).
void AudioPipe::waitForClose() {
  m_promise.get_future().wait();
}

View File

@@ -0,0 +1,143 @@
#ifndef __AAI_AUDIO_PIPE_HPP__
#define __AAI_AUDIO_PIPE_HPP__
#include <string>
#include <list>
#include <mutex>
#include <future>
#include <queue>
#include <unordered_map>
#include <thread>
#include <libwebsockets.h>
namespace assemblyai {
// Client websocket connection to the AssemblyAI realtime API for one
// FreeSWITCH session.  All network I/O happens on a single lws service
// thread; other threads interact by queueing work (connect/disconnect/write)
// that the service thread drains.
class AudioPipe {
public:
  // Connection lifecycle, driven by the lws service thread.
  enum LwsState_t {
    LWS_CLIENT_IDLE,
    LWS_CLIENT_CONNECTING,
    LWS_CLIENT_CONNECTED,
    LWS_CLIENT_FAILED,
    LWS_CLIENT_DISCONNECTING,
    LWS_CLIENT_DISCONNECTED
  };
  // Events surfaced to the owner through notifyHandler_t.
  enum NotifyEvent_t {
    CONNECT_SUCCESS,
    CONNECT_FAIL,
    CONNECTION_DROPPED,
    CONNECTION_CLOSED_GRACEFULLY,
    MESSAGE
  };
  typedef void (*log_emit_function)(int level, const char *line);
  typedef void (*notifyHandler_t)(const char *sessionId, NotifyEvent_t event, const char* message, bool finished);

  struct lws_per_vhost_data {
    struct lws_context *context;
    struct lws_vhost *vhost;
    const struct lws_protocols *protocol;
  };

  // Start/stop the shared lws service thread (module-wide, call once).
  static void initialize(int loglevel, log_emit_function logger);
  static bool deinitialize();
  static bool lws_service_thread();

  // constructor
  AudioPipe(const char* uuid, const char* host, unsigned int port, const char* path,
    size_t bufLen, size_t minFreespace, const char* apiKey, notifyHandler_t callback);
  ~AudioPipe();

  LwsState_t getLwsState(void) { return m_state; }
  std::string& getApiKey(void) {
    return m_apiKey;
  }
  // Queue an async connect; completion is reported via the notify handler.
  void connect(void);
  // Queue a text payload for sending (no-op unless connected).
  void bufferForSending(const char* text);

  // Audio buffer accessors -- callers must hold the lock taken by
  // lockAudioBuffer() while using the write pointer.
  size_t binarySpaceAvailable(void) {
    return m_audio_buffer_max_len - m_audio_buffer_write_offset;
  }
  size_t binaryMinSpace(void) {
    return m_audio_buffer_min_freespace;
  }
  char * binaryWritePtr(void) {
    return (char *) m_audio_buffer + m_audio_buffer_write_offset;
  }
  void binaryWritePtrAdd(size_t len) {
    m_audio_buffer_write_offset += len;
  }
  void binaryWritePtrResetToZero(void) {
    m_audio_buffer_write_offset = 0;
  }
  void lockAudioBuffer(void) {
    m_audio_mutex.lock();
  }
  void unlockAudioBuffer(void) ;

  void close() ;
  void finish();
  // Block until the websocket has fully closed (setClosed() was called).
  void waitForClose();
  void setClosed() { m_promise.set_value(); }
  bool isFinished() { return m_finished;}

  // no default constructor or copying
  AudioPipe() = delete;
  AudioPipe(const AudioPipe&) = delete;
  void operator=(const AudioPipe&) = delete;

private:
  static int lws_callback(struct lws *wsi, enum lws_callback_reasons reason, void *user, void *in, size_t len);

  static struct lws_context *context;
  static std::string protocolName;
  // Work queues shared between session threads and the lws service thread;
  // each guarded by its own mutex.
  static std::mutex mutex_connects;
  static std::mutex mutex_disconnects;
  static std::mutex mutex_writes;
  static std::list<AudioPipe*> pendingConnects;
  static std::list<AudioPipe*> pendingDisconnects;
  static std::list<AudioPipe*> pendingWrites;
  static log_emit_function logger;
  static std::mutex mapMutex;
  // Signals the service thread to exit (set by deinitialize()).
  static bool stopFlag;

  static AudioPipe* findAndRemovePendingConnect(struct lws *wsi);
  static AudioPipe* findPendingConnect(struct lws *wsi);
  static void addPendingConnect(AudioPipe* ap);
  static void addPendingDisconnect(AudioPipe* ap);
  static void addPendingWrite(AudioPipe* ap);
  static void processPendingConnects(lws_per_vhost_data *vhd);
  static void processPendingDisconnects(lws_per_vhost_data *vhd);
  static void processPendingWrites(void);

  bool connect_client(struct lws_per_vhost_data *vhd);

  LwsState_t m_state;
  std::string m_uuid;
  std::string m_host;
  unsigned int m_port;
  std::string m_path;
  std::string m_metadata;          // outbound text payloads, guarded by m_text_mutex
  std::mutex m_text_mutex;
  std::mutex m_audio_mutex;        // guards the audio buffer write pointer/offset
  int m_sslFlags;
  struct lws *m_wsi;
  uint8_t *m_audio_buffer;         // outbound audio staging (LWS_PRE padded)
  size_t m_audio_buffer_max_len;
  size_t m_audio_buffer_write_offset;
  size_t m_audio_buffer_min_freespace;
  uint8_t* m_recv_buf;             // accumulates a (possibly fragmented) incoming message
  uint8_t* m_recv_buf_ptr;
  size_t m_recv_buf_len;
  struct lws_per_vhost_data* m_vhd;
  notifyHandler_t m_callback;
  log_emit_function m_logger;
  std::string m_apiKey;
  bool m_gracefulShutdown;
  bool m_finished;
  std::string m_bugname;
  std::promise<void> m_promise;    // fulfilled by setClosed(), awaited by waitForClose()
};
} // namespace assemblyai
#endif

View File

@@ -0,0 +1,178 @@
/*
******
base64.hpp is a repackaging of the base64.cpp and base64.h files into a
single header suitable for use as a header only library. This conversion was
done by Peter Thorson (webmaster@zaphoyd.com) in 2012. All modifications to
the code are redistributed under the same license as the original, which is
listed below.
******
base64.cpp and base64.h
Copyright (C) 2004-2008 René Nyffenegger
This source code is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this source code must not be misrepresented; you must not
claim that you wrote the original source code. If you use this source code
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original source code.
3. This notice may not be removed or altered from any source distribution.
René Nyffenegger rene.nyffenegger@adp-gmbh.ch
*/
#ifndef _BASE64_HPP_
#define _BASE64_HPP_
#include <string>
namespace drachtio {
static std::string const base64_chars =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  "abcdefghijklmnopqrstuvwxyz"
  "0123456789+/";

/// Test whether a character is a valid base64 character
/**
 * @param c The character to test
 * @return true if c is a valid base64 character
 */
static inline bool is_base64(unsigned char c) {
  if (c == '+') return true;
  if (c >= '/' && c <= '9') return true;  // '/' followed by digits 0-9
  if (c >= 'A' && c <= 'Z') return true;
  return (c >= 'a' && c <= 'z');
}

/// Encode a char buffer into a base64 string
/**
 * @param input The input data
 * @param len The length of input in bytes
 * @return A base64 encoded string representing input
 */
inline std::string base64_encode(unsigned char const * input, size_t len) {
  std::string encoded;
  encoded.reserve(((len + 2) / 3) * 4);

  size_t pos = 0;
  // Full 3-byte groups map to 4 output characters.
  while (pos + 3 <= len) {
    unsigned b0 = input[pos], b1 = input[pos + 1], b2 = input[pos + 2];
    encoded += base64_chars[(b0 & 0xfc) >> 2];
    encoded += base64_chars[((b0 & 0x03) << 4) | ((b1 & 0xf0) >> 4)];
    encoded += base64_chars[((b1 & 0x0f) << 2) | ((b2 & 0xc0) >> 6)];
    encoded += base64_chars[b2 & 0x3f];
    pos += 3;
  }

  // Remaining 1 or 2 bytes produce a padded final quantum.
  size_t rem = len - pos;
  if (rem == 1) {
    unsigned b0 = input[pos];
    encoded += base64_chars[(b0 & 0xfc) >> 2];
    encoded += base64_chars[(b0 & 0x03) << 4];
    encoded += "==";
  } else if (rem == 2) {
    unsigned b0 = input[pos], b1 = input[pos + 1];
    encoded += base64_chars[(b0 & 0xfc) >> 2];
    encoded += base64_chars[((b0 & 0x03) << 4) | ((b1 & 0xf0) >> 4)];
    encoded += base64_chars[(b1 & 0x0f) << 2];
    encoded += '=';
  }
  return encoded;
}

/// Encode a string into a base64 string
/**
 * @param input The input data
 * @return A base64 encoded string representing input
 */
inline std::string base64_encode(std::string const & input) {
  return base64_encode(
    reinterpret_cast<const unsigned char *>(input.data()),
    input.size()
  );
}

/// Decode a base64 encoded string into a string of raw bytes
/**
 * Decoding stops at the first '=' or non-base64 character.
 *
 * @param input The base64 encoded input data
 * @return A string representing the decoded raw bytes
 */
inline std::string base64_decode(std::string const & input) {
  std::string decoded;
  unsigned char quad[4];
  int q = 0;

  for (size_t pos = 0; pos < input.size(); ++pos) {
    unsigned char c = static_cast<unsigned char>(input[pos]);
    if (c == '=' || !is_base64(c)) break;
    quad[q++] = static_cast<unsigned char>(base64_chars.find(c));
    if (q == 4) {
      decoded += static_cast<char>((quad[0] << 2) | ((quad[1] & 0x30) >> 4));
      decoded += static_cast<char>(((quad[1] & 0x0f) << 4) | ((quad[2] & 0x3c) >> 2));
      decoded += static_cast<char>(((quad[2] & 0x03) << 6) | quad[3]);
      q = 0;
    }
  }

  // A trailing partial quantum of q chars yields q-1 bytes.
  if (q) {
    for (int j = q; j < 4; j++) quad[j] = 0;
    unsigned char bytes[3];
    bytes[0] = static_cast<unsigned char>((quad[0] << 2) | ((quad[1] & 0x30) >> 4));
    bytes[1] = static_cast<unsigned char>(((quad[1] & 0x0f) << 4) | ((quad[2] & 0x3c) >> 2));
    bytes[2] = static_cast<unsigned char>(((quad[2] & 0x03) << 6) | quad[3]);
    for (int j = 0; j < q - 1; j++) {
      decoded += static_cast<std::string::value_type>(bytes[j]);
    }
  }
  return decoded;
}
} // namespace drachtio
#endif // _BASE64_HPP_

View File

@@ -0,0 +1,211 @@
/*
*
* mod_assemblyai_transcribe.c -- Freeswitch module for using assemblyai streaming transcribe api
*
*/
#include "mod_assemblyai_transcribe.h"
#include "aai_transcribe_glue.h"
/* Prototypes */
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_assemblyai_transcribe_shutdown);
SWITCH_MODULE_LOAD_FUNCTION(mod_assemblyai_transcribe_load);
SWITCH_MODULE_DEFINITION(mod_assemblyai_transcribe, mod_assemblyai_transcribe_load, mod_assemblyai_transcribe_shutdown, NULL);
static switch_status_t do_stop(switch_core_session_t *session, char* bugname);
/* Fire a custom FreeSWITCH event carrying a transcription result (or other
 * notification) back to event consumers.
 *
 * Fix: the return value of switch_event_create_subclass was ignored; on
 * allocation failure the code went on to dereference an indeterminate event
 * pointer.  Bail out (with a log) instead.
 */
static void responseHandler(switch_core_session_t* session,
  const char* eventName, const char * json, const char* bugname, int finished) {
  switch_event_t *event;
  switch_channel_t *channel = switch_core_session_get_channel(session);

  if (switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, eventName) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "responseHandler failed creating event %s\n", eventName);
    return;
  }
  switch_channel_event_set_data(channel, event);
  switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "assemblyai");
  switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-session-finished", finished ? "true" : "false");
  if (finished) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "responseHandler returning event %s, from finished recognition session\n", eventName);
  }
  if (json) switch_event_add_body(event, "%s", json);
  if (bugname) switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "media-bugname", bugname);
  switch_event_fire(&event);
}
/* Media-bug callback invoked by FreeSWITCH for bug lifecycle events and
 * captured audio frames:
 *   INIT  -- nothing to do (the session was set up in start_capture).
 *   CLOSE -- stop the transcribe session; second argument is 1 here versus 0
 *            in do_stop -- presumably "channel is closing"; confirm against
 *            aai_transcribe_session_stop.
 *   READ  -- forward the captured frame to the websocket pipe.
 */
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
{
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);

  switch (type) {
  case SWITCH_ABC_TYPE_INIT:
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_INIT.\n");
    break;

  case SWITCH_ABC_TYPE_CLOSE:
    {
      private_t *tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug);
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got SWITCH_ABC_TYPE_CLOSE.\n");

      aai_transcribe_session_stop(session, 1, tech_pvt->bugname);
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Finished SWITCH_ABC_TYPE_CLOSE.\n");
    }
    break;

  case SWITCH_ABC_TYPE_READ:
    /* return value from aai_transcribe_frame decides whether the bug stays */
    return aai_transcribe_frame(session, bug);
    break;

  case SWITCH_ABC_TYPE_WRITE:
  default:
    break;
  }

  return SWITCH_TRUE;
}
/* Initialize an AssemblyAI transcribe session on the channel and attach a
 * media bug that streams captured audio to it.  Tears down any transcribe
 * bug left over from a previous start on the same channel.
 */
static switch_status_t start_capture(switch_core_session_t *session, switch_media_bug_flag_t flags,
  char* lang, int interim, char* bugname)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug;
  switch_status_t status;
  switch_codec_implementation_t read_impl = { 0 };
  void *pUserData;
  uint32_t samples_per_second;

  if (switch_channel_get_private(channel, MY_BUG_NAME)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing bug from previous transcribe\n");
    do_stop(session, bugname);
  }

  switch_core_session_get_read_impl(session, &read_impl);

  if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
    return SWITCH_STATUS_FALSE;
  }

  /* g722 reports 16kHz actual but 8kHz nominal; use the actual rate there */
  samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;

  if (SWITCH_STATUS_FALSE == aai_transcribe_session_init(session, responseHandler, samples_per_second, flags & SMBF_STEREO ? 2 : 1, lang, interim, bugname, &pUserData)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing assemblyai speech session.\n");
    return SWITCH_STATUS_FALSE;
  }

  if ((status = switch_core_media_bug_add(session, "aai_transcribe", NULL, capture_callback, pUserData, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
    return status;
  }
  /* stash the bug on the channel so do_stop / restart can find it */
  switch_channel_set_private(channel, MY_BUG_NAME, bug);

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "added media bug for assemblyai transcribe\n");

  return SWITCH_STATUS_SUCCESS;
}
/* Stop an active transcribe session on the channel, if any.
 *
 * Fix: the info log read "Received user command command to stop" (doubled
 * word).
 *
 * NOTE(review): the bug is looked up under the fixed key MY_BUG_NAME; the
 * bugname argument is only forwarded to aai_transcribe_session_stop --
 * confirm this is intended if multiple bugs per channel are expected.
 */
static switch_status_t do_stop(switch_core_session_t *session, char* bugname)
{
  switch_status_t status = SWITCH_STATUS_SUCCESS;
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = switch_channel_get_private(channel, MY_BUG_NAME);

  if (bug) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Received user command to stop transcribe.\n");
    status = aai_transcribe_session_stop(session, 0, bugname);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "stopped transcribe.\n");
  }

  return status;
}
#define TRANSCRIBE_API_SYNTAX "<uuid> [start|stop] lang-code [interim] [stereo|mono]"
SWITCH_STANDARD_API(aai_transcribe_function)
{
char *mycmd = NULL, *argv[6] = { 0 };
int argc = 0;
switch_status_t status = SWITCH_STATUS_FALSE;
switch_media_bug_flag_t flags = SMBF_READ_STREAM /* | SMBF_WRITE_STREAM | SMBF_READ_PING */;
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
}
if (zstr(cmd) ||
(!strcasecmp(argv[1], "stop") && argc < 2) ||
(!strcasecmp(argv[1], "start") && argc < 3) ||
zstr(argv[0])) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_SYNTAX);
goto done;
} else {
switch_core_session_t *lsession = NULL;
if ((lsession = switch_core_session_locate(argv[0]))) {
if (!strcasecmp(argv[1], "stop")) {
char *bugname = argc > 2 ? argv[2] : MY_BUG_NAME;
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "stop transcribing\n");
status = do_stop(lsession, bugname);
} else if (!strcasecmp(argv[1], "start")) {
char* lang = argv[2];
int interim = argc > 3 && !strcmp(argv[3], "interim");
char *bugname = argc > 5 ? argv[5] : MY_BUG_NAME;
if (argc > 4 && !strcmp(argv[4], "stereo")) {
flags |= SMBF_WRITE_STREAM ;
flags |= SMBF_STEREO;
}
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "start transcribing %s %s\n", lang, interim ? "interim": "complete");
status = start_capture(lsession, flags, lang, interim, bugname);
}
switch_core_session_rwunlock(lsession);
}
}
if (status == SWITCH_STATUS_SUCCESS) {
stream->write_function(stream, "+OK Success\n");
} else {
stream->write_function(stream, "-ERR Operation Failed\n");
}
done:
switch_safe_free(mycmd);
return SWITCH_STATUS_SUCCESS;
}
/* Module load: register the custom event subclass, initialize the speech
 * interface, and expose the uuid_assemblyai_transcribe API command.
 *
 * Fix: log messages and the API description said "Deepgram" -- copy/paste
 * residue from the mod_deepgram_transcribe module this one was derived from.
 */
SWITCH_MODULE_LOAD_FUNCTION(mod_assemblyai_transcribe_load)
{
  switch_api_interface_t *api_interface;

  /* create/register custom event message type */
  if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_RESULTS) != SWITCH_STATUS_SUCCESS) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_RESULTS);
    return SWITCH_STATUS_TERM;
  }

  /* connect my internal structure to the blank pointer passed to me */
  *module_interface = switch_loadable_module_create_module_interface(pool, modname);

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "AssemblyAI Speech Transcription API loading..\n");

  /* NOTE(review): init failure is logged but the module still loads --
     confirm best-effort loading is intended. */
  if (SWITCH_STATUS_FALSE == aai_transcribe_init()) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Failed initializing assemblyai speech interface\n");
  }
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "AssemblyAI Speech Transcription API successfully loaded\n");

  SWITCH_ADD_API(api_interface, "uuid_assemblyai_transcribe", "AssemblyAI Speech Transcription API", aai_transcribe_function, TRANSCRIBE_API_SYNTAX);
  switch_console_set_complete("add uuid_assemblyai_transcribe start lang-code [interim|final] [stereo|mono]");
  switch_console_set_complete("add uuid_assemblyai_transcribe stop ");

  /* indicate that the module should continue to be loaded */
  return SWITCH_STATUS_SUCCESS;
}
/*
  Called when the system shuts down: tear down the speech interface and
  release the custom event subclass registered at load time.
  Macro expands to: switch_status_t mod_assemblyai_transcribe_shutdown() */
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_assemblyai_transcribe_shutdown)
{
  aai_transcribe_cleanup();
  switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS);
  return SWITCH_STATUS_SUCCESS;
}

View File

@@ -0,0 +1,50 @@
/* Shared declarations for mod_assemblyai_transcribe.
 *
 * Fix: the include guard was __MOD_AWS_TRANSCRIBE_H__ (copy/paste from the
 * aws transcribe module), risking a guard collision; renamed to match this
 * module.  All event-name strings are left byte-identical -- note that
 * "session_termanated" is a typo, but it is the published event subclass
 * name and renaming it would break existing subscribers.
 */
#ifndef __MOD_ASSEMBLYAI_TRANSCRIBE_H__
#define __MOD_ASSEMBLYAI_TRANSCRIBE_H__

#include <switch.h>
#include <speex/speex_resampler.h>

#include <unistd.h>

#define MY_BUG_NAME "assemblyai_transcribe"
#define TRANSCRIBE_EVENT_RESULTS "assemblyai_transcribe::transcription"
#define TRANSCRIBE_EVENT_SESSION_BEGINS "assemblyai_transcribe::session_begins"
#define TRANSCRIBE_EVENT_SESSION_TERMINATED "assemblyai_transcribe::session_termanated"
#define TRANSCRIBE_EVENT_ERROR "assemblyai_transcribe::error"
#define TRANSCRIBE_EVENT_NO_AUDIO_DETECTED "assemblyai_transcribe::no_audio_detected"
#define TRANSCRIBE_EVENT_VAD_DETECTED "assemblyai_transcribe::vad_detected"
#define TRANSCRIBE_EVENT_CONNECT_SUCCESS "assemblyai_transcribe::connect"
#define TRANSCRIBE_EVENT_CONNECT_FAIL "assemblyai_transcribe::connect_failed"
#define TRANSCRIBE_EVENT_BUFFER_OVERRUN "assemblyai_transcribe::buffer_overrun"
#define TRANSCRIBE_EVENT_DISCONNECT "assemblyai_transcribe::disconnect"

#define MAX_LANG (12)
#define MAX_SESSION_ID (256)
#define MAX_WS_URL_LEN (512)
#define MAX_PATH_LEN (4096)
#define MAX_BUG_LEN (64)

typedef void (*responseHandler_t)(switch_core_session_t* session, const char* eventName, const char* json, const char* bugname, int finished);

/* Per-session transcription state, attached to the media bug as user data. */
struct private_data {
  switch_mutex_t *mutex;
  char sessionId[MAX_SESSION_ID];
  SpeexResamplerState *resampler;        /* resamples channel audio for the vendor */
  responseHandler_t responseHandler;     /* fires FreeSWITCH events with results */
  void *pAudioPipe;                      /* opaque AudioPipe* (websocket to vendor) */
  int ws_state;
  char host[MAX_WS_URL_LEN];
  unsigned int port;
  char path[MAX_PATH_LEN];
  char bugname[MAX_BUG_LEN+1];
  int sampling;
  int channels;
  unsigned int id;
  int buffer_overrun_notified:1;         /* rate-limits buffer_overrun events */
  int is_finished:1;
};

typedef struct private_data private_t;

#endif

View File

@@ -0,0 +1,21 @@
#include "parser.hpp"
#include <switch.h>
// Parse an incoming websocket text frame as JSON.  On success returns the
// cJSON object (caller owns it) and sets `type` from the message's "type"
// field, defaulting to "json" when absent; on parse failure logs and
// returns NULL.
cJSON* parse_json(switch_core_session_t* session, const std::string& data, std::string& type) {
  cJSON* json = cJSON_Parse(data.c_str());
  if (nullptr == json) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "parse - failed parsing incoming msg as JSON: %s\n", data.c_str());
    return NULL;
  }

  const char* szType = cJSON_GetObjectCstr(json, "type");
  type.assign(szType ? szType : "json");

  return json;
}

View File

@@ -0,0 +1,9 @@
#ifndef __PARSER_H__
#define __PARSER_H__

#include <string>
#include <switch_json.h>

// Parse `data` as JSON.  On success returns the cJSON object (caller must
// cJSON_Delete it) and sets `type` to the message's "type" field ("json"
// when absent); on failure logs against the session and returns NULL.
cJSON* parse_json(switch_core_session_t* session, const std::string& data, std::string& type) ;

#endif

View File

@@ -0,0 +1,51 @@
/**
 * (very) simple and limited circular buffer, supporting only the use case
 * of doing all of the adds and then subsequently all of the retrieves.
 *
 * Retrieval starts at the current write position, so reads return the
 * oldest data only once the buffer has been completely filled (wrapped).
 *
 * Fixes:
 * - getNextChunk() used `if (numItems--)`, which underflowed the unsigned
 *   counter to 0xFFFFFFFF when called on an empty buffer, corrupting
 *   getNumItems() and allowing reads of stale/uninitialized memory.
 * - the class owned a raw heap buffer but was copyable; a shallow copy
 *   would double-delete m_pData, so copying is now deleted (Rule of Three).
 */
class SimpleBuffer {
public:
  SimpleBuffer(uint32_t chunkSize, uint32_t numChunks) : numItems(0),
    m_numChunks(numChunks), m_chunkSize(chunkSize) {
    m_pData = new char[chunkSize * numChunks];
    m_pNextWrite = m_pData;
  }
  ~SimpleBuffer() {
    delete [] m_pData;
  }

  // non-copyable: shallow copy would double-delete m_pData
  SimpleBuffer(const SimpleBuffer&) = delete;
  SimpleBuffer& operator=(const SimpleBuffer&) = delete;

  // Append whole chunks; data whose length is not a multiple of the chunk
  // size is silently dropped.  Wraps and overwrites oldest data when full.
  void add(void *data, uint32_t datalen) {
    if (datalen % m_chunkSize != 0) return;

    int numChunks = datalen / m_chunkSize;
    for (int i = 0; i < numChunks; i++) {
      memcpy(m_pNextWrite, data, m_chunkSize);
      data = static_cast<char*>(data) + m_chunkSize;
      if (numItems < m_numChunks) numItems++;

      // advance the write pointer, wrapping at the end of the buffer
      uint32_t offset = (m_pNextWrite - m_pData) / m_chunkSize;
      if (offset >= m_numChunks - 1) m_pNextWrite = m_pData;
      else m_pNextWrite += m_chunkSize;
    }
  }

  // Return a pointer to the next chunk, or nullptr when the buffer is empty.
  char* getNextChunk() {
    if (numItems == 0) return nullptr;  // fix: was `if (numItems--)` -> underflow
    numItems--;

    char *p = m_pNextWrite;
    uint32_t offset = (m_pNextWrite - m_pData) / m_chunkSize;
    if (offset >= m_numChunks - 1) m_pNextWrite = m_pData;
    else m_pNextWrite += m_chunkSize;
    return p;
  }

  uint32_t getNumItems() { return numItems;}

private:
  char *m_pData;         // owned chunk storage (chunkSize * numChunks bytes)
  uint32_t numItems;     // chunks currently held, capped at m_numChunks
  uint32_t m_chunkSize;
  uint32_t m_numChunks;
  char* m_pNextWrite;    // next slot to write (and, post-wrap, next to read)
};