mirror of
https://github.com/jambonz/freeswitch-modules.git
synced 2025-12-19 08:37:44 +00:00
* enable audio logging if env AZURE_AUDIO_LOGGING is set * wip * per discussion with microsoft, add nullptr to creation of speechSynthesizer to ensure it knows we do not want it to play to device Signed-off-by: Dave Horton <daveh@beachdognet.com> * logging * fix bug in creation of config string * fix ticket 230 - Microsoft TTS having configuration data as part of audio generation * azure transcribe, resuse existing cap_cb if azure configuration is changed Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com> * clean up azure code for how to re-create gsstream when configuration is changed Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com> * fix review comments Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com> * fix review comment Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com> * fix review comment Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com> * wrap function in try catch --------- Signed-off-by: Dave Horton <daveh@beachdognet.com> Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com> Co-authored-by: Hoan HL <quan.luuhoang8@gmail.com>
691 lines
27 KiB
C++
691 lines
27 KiB
C++
#include <cstdlib>
|
|
|
|
#include <switch.h>
|
|
#include <switch_json.h>
|
|
|
|
#include <string.h>
|
|
#include <mutex>
|
|
#include <thread>
|
|
#include <condition_variable>
|
|
#include <string>
|
|
#include <sstream>
|
|
#include <deque>
|
|
#include <memory>
|
|
|
|
#include <speechapi_cxx.h>
|
|
|
|
#include "mod_azure_transcribe.h"
|
|
#include "simple_buffer.h"
|
|
|
|
#define CHUNKSIZE (320)
|
|
#define DEFAULT_SPEECH_TIMEOUT "180000"
|
|
|
|
using namespace Microsoft::CognitiveServices::Speech;
|
|
using namespace Microsoft::CognitiveServices::Speech::Audio;
|
|
|
|
const char ALLOC_TAG[] = "drachtio";
|
|
|
|
static bool hasDefaultCredentials = false;
|
|
static bool sdkInitialized = false;
|
|
static const char* sdkLog = std::getenv("AZURE_SDK_LOGFILE");
|
|
static const char* proxyIP = std::getenv("JAMBONES_HTTP_PROXY_IP");
|
|
static const char* proxyPort = std::getenv("JAMBONES_HTTP_PROXY_PORT");
|
|
static const char* proxyUsername = std::getenv("JAMBONES_HTTP_PROXY_USERNAME");
|
|
static const char* proxyPassword = std::getenv("JAMBONES_HTTP_PROXY_PASSWORD");
|
|
static const bool use_single_connection = switch_true(std::getenv("AZURE_SPEECH_USE_SINGLE_CONNECTION"));
|
|
|
|
class GStreamer {
|
|
public:
|
|
GStreamer(
|
|
const char *sessionId,
|
|
const char *bugname,
|
|
u_int16_t channels,
|
|
char *lang,
|
|
int interim,
|
|
uint32_t samples_per_second,
|
|
const char* region,
|
|
const char* subscriptionKey,
|
|
responseHandler_t responseHandler
|
|
) : m_sessionId(sessionId), m_bugname(bugname), m_finished(false), m_stopped(false), m_interim(interim),
|
|
m_connected(false), m_connecting(false), m_audioBuffer(320 * (samples_per_second == 8000 ? 1 : 2), 15),
|
|
m_responseHandler(responseHandler) {
|
|
|
|
switch_core_session_t* psession = switch_core_session_locate(sessionId);
|
|
if (!psession) throw std::invalid_argument( "session id no longer active" );
|
|
switch_channel_t *channel = switch_core_session_get_channel(psession);
|
|
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::GStreamer(%p) region %s, language %s\n",
|
|
this, region, lang);
|
|
|
|
|
|
const char* endpoint = switch_channel_get_variable(channel, "AZURE_SERVICE_ENDPOINT");
|
|
const char* endpointId = switch_channel_get_variable(channel, "AZURE_SERVICE_ENDPOINT_ID");
|
|
|
|
auto sourceLanguageConfig = SourceLanguageConfig::FromLanguage(lang);
|
|
auto format = AudioStreamFormat::GetWaveFormatPCM(8000, 16, channels);
|
|
auto options = AudioProcessingOptions::Create(AUDIO_INPUT_PROCESSING_ENABLE_DEFAULT);
|
|
auto speechConfig = nullptr != endpoint ?
|
|
(nullptr != subscriptionKey ?
|
|
SpeechConfig::FromEndpoint(endpoint, subscriptionKey) :
|
|
SpeechConfig::FromEndpoint(endpoint)) :
|
|
SpeechConfig::FromSubscription(subscriptionKey, region);
|
|
if (switch_true(switch_channel_get_variable(channel, "AZURE_USE_OUTPUT_FORMAT_DETAILED"))) {
|
|
speechConfig->SetOutputFormat(OutputFormat::Detailed);
|
|
}
|
|
if (nullptr != endpointId) {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "setting endpoint id: %s\n", endpointId);
|
|
speechConfig->SetEndpointId(endpointId);
|
|
}
|
|
if (!sdkInitialized && sdkLog) {
|
|
sdkInitialized = true;
|
|
speechConfig->SetProperty(PropertyId::Speech_LogFilename, sdkLog);
|
|
}
|
|
if (switch_true(switch_channel_get_variable(channel, "AZURE_AUDIO_LOGGING"))) {
|
|
speechConfig->EnableAudioLogging();
|
|
}
|
|
|
|
if (nullptr != proxyIP && nullptr != proxyPort) {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "setting proxy: %s:%s\n", proxyIP, proxyPort);
|
|
speechConfig->SetProxy(proxyIP, atoi(proxyPort), proxyUsername, proxyPassword);
|
|
}
|
|
|
|
m_pushStream = AudioInputStream::CreatePushStream(format);
|
|
auto audioConfig = AudioConfig::FromStreamInput(m_pushStream);
|
|
|
|
// alternative language
|
|
const char* var;
|
|
if (var = switch_channel_get_variable(channel, "AZURE_SPEECH_ALTERNATIVE_LANGUAGE_CODES")) {
|
|
std::vector<std::string> languages;
|
|
char *alt_langs[3] = { 0 };
|
|
int argc = switch_separate_string((char *) var, ',', alt_langs, 3);
|
|
|
|
languages.push_back(lang); // primary language
|
|
for (int i = 0; i < argc; i++) {
|
|
languages.push_back( alt_langs[i]);
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "added alternative lang %s\n", alt_langs[i]);
|
|
}
|
|
auto autoDetectSourceLanguageConfig = AutoDetectSourceLanguageConfig::FromLanguages(languages);
|
|
m_recognizer = SpeechRecognizer::FromConfig(speechConfig, autoDetectSourceLanguageConfig, audioConfig);
|
|
}
|
|
else {
|
|
auto sourceLanguageConfig = SourceLanguageConfig::FromLanguage(lang);
|
|
m_recognizer = SpeechRecognizer::FromConfig(speechConfig, sourceLanguageConfig, audioConfig);
|
|
}
|
|
|
|
|
|
// set properties
|
|
auto &properties = m_recognizer->Properties;
|
|
|
|
// profanity options: Allowed values are "masked", "removed", and "raw".
|
|
const char* profanity = switch_channel_get_variable(channel, "AZURE_PROFANITY_OPTION");
|
|
if (profanity) {
|
|
properties.SetProperty(PropertyId::SpeechServiceResponse_ProfanityOption, profanity);
|
|
}
|
|
// report signal-to-noise ratio
|
|
if (switch_true(switch_channel_get_variable(channel, "AZURE_REQUEST_SNR"))) {
|
|
properties.SetProperty(PropertyId::SpeechServiceResponse_RequestSnr, TrueString);
|
|
}
|
|
// initial speech timeout in milliseconds
|
|
const char* timeout = switch_channel_get_variable(channel, "AZURE_INITIAL_SPEECH_TIMEOUT_MS");
|
|
if (timeout) properties.SetProperty(PropertyId::SpeechServiceConnection_InitialSilenceTimeoutMs, timeout);
|
|
else properties.SetProperty(PropertyId::SpeechServiceConnection_InitialSilenceTimeoutMs, DEFAULT_SPEECH_TIMEOUT);
|
|
|
|
const char* segmentationInterval = switch_channel_get_variable(channel, "AZURE_SPEECH_SEGMENTATION_SILENCE_TIMEOUT_MS");
|
|
if (segmentationInterval) {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "setting segmentation interval to %s ms\n", segmentationInterval);
|
|
properties.SetProperty(PropertyId::Speech_SegmentationSilenceTimeoutMs, segmentationInterval);
|
|
}
|
|
|
|
//https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-identification?tabs=once&pivots=programming-language-cpp#at-start-and-continuous-language-identification
|
|
const char* languageIdMode = switch_channel_get_variable(channel, "AZURE_LANGUAGE_ID_MODE");
|
|
if (languageIdMode) {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "setting SpeechServiceConnection_LanguageIdMode to %s \n", languageIdMode);
|
|
properties.SetProperty(PropertyId::SpeechServiceConnection_LanguageIdMode, languageIdMode);
|
|
}
|
|
|
|
// recognition mode - readonly according to Azure docs:
|
|
// https://docs.microsoft.com/en-us/javascript/api/microsoft-cognitiveservices-speech-sdk/propertyid?view=azure-node-latest
|
|
/*
|
|
const char* recoMode = switch_channel_get_variable(channel, "AZURE_RECOGNITION_MODE");
|
|
if (recoMode) {
|
|
properties.SetProperty(PropertyId::SpeechServiceConnection_RecoMode, recoMode);
|
|
}
|
|
*/
|
|
|
|
// hints
|
|
const char* hints = switch_channel_get_variable(channel, "AZURE_SPEECH_HINTS");
|
|
if (hints) {
|
|
auto grammar = PhraseListGrammar::FromRecognizer(m_recognizer);
|
|
char *phrases[500] = { 0 };
|
|
int argc = switch_separate_string((char *)hints, ',', phrases, 500);
|
|
for (int i = 0; i < argc; i++) {
|
|
grammar->AddPhrase(phrases[i]);
|
|
}
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "added %d hints\n", argc);
|
|
}
|
|
|
|
auto onSessionStopped = [this](const SessionEventArgs& args) {
|
|
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
|
|
m_stopped = true;
|
|
if (psession) {
|
|
auto sessionId = args.SessionId;
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer: got session stopped from microsoft\n");
|
|
switch_core_session_rwunlock(psession);
|
|
}
|
|
};
|
|
auto onSpeechStartDetected = [this, responseHandler](const RecognitionEventArgs& args) {
|
|
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
|
|
if (psession) {
|
|
auto sessionId = args.SessionId;
|
|
responseHandler(psession, TRANSCRIBE_EVENT_START_OF_UTTERANCE, NULL, m_bugname.c_str(), m_finished);
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer start of speech\n");
|
|
switch_core_session_rwunlock(psession);
|
|
}
|
|
};
|
|
auto onSpeechEndDetected = [this, responseHandler](const RecognitionEventArgs& args) {
|
|
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
|
|
if (psession) {
|
|
auto sessionId = args.SessionId;
|
|
responseHandler(psession, TRANSCRIBE_EVENT_END_OF_UTTERANCE, NULL, m_bugname.c_str(), m_finished);
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer end of speech\n");
|
|
switch_core_session_rwunlock(psession);
|
|
}
|
|
};
|
|
auto onRecognitionEvent = [this, responseHandler](const SpeechRecognitionEventArgs& args) {
|
|
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
|
|
if (psession) {
|
|
auto result = args.Result;
|
|
auto reason = result->Reason;
|
|
const auto& properties = result->Properties;
|
|
auto json = properties.GetProperty(PropertyId::SpeechServiceResponse_JsonResult);
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer onRecognitionEvent reason %d results: %s,\n", reason, json.c_str());
|
|
|
|
switch (reason) {
|
|
case ResultReason::RecognizingSpeech:
|
|
case ResultReason::RecognizedSpeech:
|
|
// note: interim results don't have "RecognitionStatus": "Success"
|
|
responseHandler(psession, TRANSCRIBE_EVENT_RESULTS, json.c_str(), m_bugname.c_str(), m_finished);
|
|
break;
|
|
case ResultReason::NoMatch:
|
|
responseHandler(psession, TRANSCRIBE_EVENT_NO_SPEECH_DETECTED, json.c_str(), m_bugname.c_str(), m_finished);
|
|
break;
|
|
|
|
default:
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "GStreamer unexpected result '%s': reason %d\n",
|
|
json.c_str(), reason);
|
|
responseHandler(psession, TRANSCRIBE_EVENT_ERROR, json.c_str(), m_bugname.c_str(), m_finished);
|
|
|
|
break;
|
|
}
|
|
switch_core_session_rwunlock(psession);
|
|
}
|
|
};
|
|
|
|
auto onCanceled = [this, responseHandler](const SpeechRecognitionCanceledEventArgs& args) {
|
|
if (m_finished) return;
|
|
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
|
|
if (psession) {
|
|
auto result = args.Result;
|
|
auto details = args.ErrorDetails;
|
|
auto code = args.ErrorCode;
|
|
cJSON* json = cJSON_CreateObject();
|
|
cJSON_AddStringToObject(json, "type", "error");
|
|
cJSON_AddStringToObject(json, "error", details.c_str());
|
|
char* jsonString = cJSON_PrintUnformatted(json);
|
|
responseHandler(psession, TRANSCRIBE_EVENT_ERROR, jsonString, m_bugname.c_str(), m_finished);
|
|
free(jsonString);
|
|
cJSON_Delete(json);
|
|
switch_core_session_rwunlock(psession);
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer recognition canceled, error %d: %s\n", code, details.c_str());
|
|
}
|
|
};
|
|
|
|
m_recognizer->SessionStopped += onSessionStopped;
|
|
m_recognizer->SpeechStartDetected += onSpeechStartDetected;
|
|
m_recognizer->SpeechEndDetected += onSpeechEndDetected;
|
|
if (interim) m_recognizer->Recognizing += onRecognitionEvent;
|
|
m_recognizer->Recognized += onRecognitionEvent;
|
|
m_recognizer->Canceled += onCanceled;
|
|
|
|
// Store the final configuration string
|
|
m_configuration_string = createConfigurationStr(channels, lang, interim, samples_per_second, region, subscriptionKey, psession);
|
|
|
|
switch_core_session_rwunlock(psession);
|
|
}
|
|
|
|
~GStreamer() {
|
|
//switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::~GStreamer %p\n", this);
|
|
}
|
|
|
|
const char* configuration() {
|
|
return m_configuration_string.c_str();
|
|
}
|
|
|
|
void connect() {
|
|
if (m_connecting) return;
|
|
m_connecting = true;
|
|
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer:connect %p connecting to azure speech..\n", this);
|
|
|
|
auto onSessionStarted = [this](const SessionEventArgs& args) {
|
|
m_connected = true;
|
|
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
|
|
if (psession) {
|
|
auto sessionId = args.SessionId;
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer got session started from microsoft\n");
|
|
|
|
// send any buffered audio
|
|
int nFrames = m_audioBuffer.getNumItems();
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p got session started from azure, %d buffered frames\n", this, nFrames);
|
|
if (nFrames) {
|
|
char *p;
|
|
do {
|
|
p = m_audioBuffer.getNextChunk();
|
|
if (p) {
|
|
write(p, CHUNKSIZE);
|
|
}
|
|
} while (p);
|
|
}
|
|
switch_core_session_rwunlock(psession);
|
|
}
|
|
};
|
|
m_recognizer->SessionStarted += onSessionStarted;
|
|
m_recognizer->StartContinuousRecognitionAsync();
|
|
|
|
}
|
|
|
|
bool write(void* data, uint32_t datalen) {
|
|
if (m_finished) {
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::write not writing because we are finished, %p\n", this);
|
|
return false;
|
|
}
|
|
if (!m_connected) {
|
|
if (datalen % CHUNKSIZE == 0) {
|
|
m_audioBuffer.add(data, datalen);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
m_pushStream->Write(static_cast<uint8_t*>(data), datalen);
|
|
return true;
|
|
}
|
|
|
|
void finish() {
|
|
if (m_finished) return;
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::finish - calling StopContinuousRecognitionAsync (%p)\n", this);
|
|
m_finished = true;
|
|
m_recognizer->StopContinuousRecognitionAsync().get();
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "GStreamer::finish - recognition has completed (%p)\n", this);
|
|
}
|
|
|
|
bool isStopped() {
|
|
return m_stopped;
|
|
}
|
|
|
|
bool isConnecting() {
|
|
return m_connecting;
|
|
}
|
|
|
|
bool hasConfigurationChanged(
|
|
u_int16_t channels,
|
|
char *lang,
|
|
int interim,
|
|
uint32_t samples_per_second,
|
|
const char* region,
|
|
const char* subscriptionKey) {
|
|
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
|
|
if (!psession) throw std::invalid_argument( "session id no longer active" );
|
|
|
|
std::string newConfiguration = createConfigurationStr(channels, lang, interim, samples_per_second, region, subscriptionKey, psession);
|
|
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG,
|
|
"hasConfigurationChanged: old configurattion: %s, new configuration: %s\n", configuration(), newConfiguration.c_str());
|
|
|
|
switch_core_session_rwunlock(psession);
|
|
|
|
return strcmp(newConfiguration.c_str(), configuration());
|
|
}
|
|
|
|
private:
|
|
std::string m_sessionId;
|
|
std::string m_bugname;
|
|
std::string m_region;
|
|
std::shared_ptr<SpeechRecognizer> m_recognizer;
|
|
std::shared_ptr<PushAudioInputStream> m_pushStream;
|
|
std::string m_configuration_string;
|
|
responseHandler_t m_responseHandler;
|
|
bool m_interim;
|
|
bool m_finished;
|
|
bool m_connected;
|
|
bool m_connecting;
|
|
bool m_stopped;
|
|
SimpleBuffer m_audioBuffer;
|
|
|
|
std::string createConfigurationStr(
|
|
u_int16_t channels,
|
|
char *lang,
|
|
int interim,
|
|
uint32_t samples_per_second,
|
|
const char* region,
|
|
const char* subscriptionKey,
|
|
switch_core_session_t* psession
|
|
) {
|
|
switch_channel_t *channel = switch_core_session_get_channel(psession);
|
|
std::ostringstream configuration_stream;
|
|
configuration_stream <<
|
|
channels << ";" <<
|
|
lang << ";" <<
|
|
interim << ";" <<
|
|
samples_per_second << ";" <<
|
|
region << ";" <<
|
|
subscriptionKey << ";";
|
|
|
|
const char* endpoint = switch_channel_get_variable(channel, "AZURE_SERVICE_ENDPOINT");
|
|
const char* endpointId = switch_channel_get_variable(channel, "AZURE_SERVICE_ENDPOINT_ID");
|
|
configuration_stream <<
|
|
endpoint << ";" <<
|
|
endpointId << ";";
|
|
if (switch_true(switch_channel_get_variable(channel, "AZURE_USE_OUTPUT_FORMAT_DETAILED"))) {
|
|
configuration_stream << "output_format_detailed;";
|
|
}
|
|
if (switch_true(switch_channel_get_variable(channel, "AZURE_AUDIO_LOGGING"))) {
|
|
configuration_stream << "audio_logging;";
|
|
}
|
|
if (nullptr != proxyIP && nullptr != proxyPort) {
|
|
configuration_stream <<
|
|
proxyIP << ";" <<
|
|
proxyPort << ";" <<
|
|
proxyUsername << ";" <<
|
|
proxyPassword << ";";
|
|
}
|
|
const char* var;
|
|
if (var = switch_channel_get_variable(channel, "AZURE_SPEECH_ALTERNATIVE_LANGUAGE_CODES")) {
|
|
configuration_stream << var << ";";
|
|
}
|
|
if (var = switch_channel_get_variable(channel, "AZURE_PROFANITY_OPTION")) {
|
|
configuration_stream << var << ";";
|
|
}
|
|
if (var = switch_channel_get_variable(channel, "AZURE_REQUEST_SNR")) {
|
|
configuration_stream << var << ";";
|
|
}
|
|
if (var = switch_channel_get_variable(channel, "AZURE_INITIAL_SPEECH_TIMEOUT_MS")) {
|
|
configuration_stream << var << ";";
|
|
}
|
|
if (var = switch_channel_get_variable(channel, "AZURE_SPEECH_SEGMENTATION_SILENCE_TIMEOUT_MS")) {
|
|
configuration_stream << var << ";";
|
|
}
|
|
if (var = switch_channel_get_variable(channel, "AZURE_LANGUAGE_ID_MODE")) {
|
|
configuration_stream << var << ";";
|
|
}
|
|
if (var = switch_channel_get_variable(channel, "AZURE_SPEECH_HINTS")) {
|
|
configuration_stream << var << ";";
|
|
}
|
|
return configuration_stream.str();
|
|
}
|
|
};
|
|
|
|
static void reaper(struct cap_cb *cb) {
|
|
std::shared_ptr<GStreamer> pStreamer;
|
|
pStreamer.reset((GStreamer *)cb->streamer);
|
|
cb->streamer = nullptr;
|
|
|
|
std::thread t([pStreamer]{
|
|
pStreamer->finish();
|
|
});
|
|
t.detach();
|
|
}
|
|
|
|
static void killcb(struct cap_cb* cb) {
|
|
if (cb) {
|
|
if (cb->streamer) {
|
|
GStreamer* p = (GStreamer *) cb->streamer;
|
|
delete p;
|
|
cb->streamer = NULL;
|
|
}
|
|
if (cb->resampler) {
|
|
speex_resampler_destroy(cb->resampler);
|
|
cb->resampler = NULL;
|
|
}
|
|
if (cb->vad) {
|
|
switch_vad_destroy(&cb->vad);
|
|
cb->vad = nullptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
extern "C" {
|
|
switch_status_t azure_transcribe_init() {
|
|
const char* subscriptionKey = std::getenv("AZURE_SUBSCRIPTION_KEY");
|
|
const char* region = std::getenv("AZURE_REGION");
|
|
if (NULL == subscriptionKey) {
|
|
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE,
|
|
"\"AZURE_SUBSCRIPTION_KEY\" env var not set; authentication will expect channel variables of same names to be set\n");
|
|
}
|
|
else {
|
|
hasDefaultCredentials = true;
|
|
}
|
|
return SWITCH_STATUS_SUCCESS;
|
|
}
|
|
|
|
switch_status_t azure_transcribe_cleanup() {
|
|
return SWITCH_STATUS_SUCCESS;
|
|
}
|
|
|
|
// start transcribe on a channel
|
|
switch_status_t azure_transcribe_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
|
|
uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char* bugname, void **ppUserData
|
|
) {
|
|
GStreamer *streamer = NULL;
|
|
switch_status_t status = SWITCH_STATUS_SUCCESS;
|
|
switch_channel_t *channel = switch_core_session_get_channel(session);
|
|
switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);
|
|
const char* subscriptionKey = switch_channel_get_variable(channel, "AZURE_SUBSCRIPTION_KEY");
|
|
const char* region = switch_channel_get_variable(channel, "AZURE_REGION");
|
|
const char* sessionId = switch_core_session_get_uuid(session);
|
|
auto read_codec = switch_core_session_get_read_codec(session);
|
|
uint32_t sampleRate = read_codec->implementation->actual_samples_per_second;
|
|
if (bug) {
|
|
struct cap_cb* existing_cb = (struct cap_cb*) switch_core_media_bug_get_user_data(bug);
|
|
GStreamer* existing_streamer = (GStreamer*) existing_cb->streamer;
|
|
existing_cb->is_keep_alive = 0;
|
|
if (!existing_streamer->hasConfigurationChanged(channels, lang, interim, sampleRate, region, subscriptionKey)) {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Reuse active azure connection.\n");
|
|
return SWITCH_STATUS_SUCCESS;
|
|
}
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Azure configuration is changed, destroy old and create new azure connection\n");
|
|
reaper(existing_cb);
|
|
streamer = new GStreamer(sessionId, bugname, channels, lang, interim, sampleRate, region, subscriptionKey, responseHandler);
|
|
if (!existing_cb->vad) streamer->connect();
|
|
existing_cb->streamer = streamer;
|
|
*ppUserData = existing_cb;
|
|
return SWITCH_STATUS_SUCCESS;
|
|
}
|
|
int err;
|
|
switch_threadattr_t *thd_attr = NULL;
|
|
switch_memory_pool_t *pool = switch_core_session_get_pool(session);
|
|
struct cap_cb* cb = (struct cap_cb *) switch_core_session_alloc(session, sizeof(*cb));
|
|
memset(cb, sizeof(cb), 0);
|
|
cb->channels = channels;
|
|
strncpy(cb->sessionId, sessionId, MAX_SESSION_ID);
|
|
strncpy(cb->bugname, bugname, MAX_BUG_LEN);
|
|
|
|
if (subscriptionKey && region) {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Using channel vars for azure authentication\n");
|
|
strncpy(cb->subscriptionKey, subscriptionKey, MAX_SUBSCRIPTION_KEY_LEN);
|
|
strncpy(cb->region, region, MAX_REGION);
|
|
}
|
|
else if (std::getenv("AZURE_SUBSCRIPTION_KEY") && std::getenv("AZURE_REGION")) {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Using env vars for azure authentication\n");
|
|
strncpy(cb->subscriptionKey, std::getenv("AZURE_SUBSCRIPTION_KEY"), MAX_SUBSCRIPTION_KEY_LEN);
|
|
strncpy(cb->region, std::getenv("AZURE_REGION"), MAX_REGION);
|
|
}
|
|
else {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "No channel vars or env vars for azure authentication..will use default profile if found\n");
|
|
}
|
|
|
|
cb->responseHandler = responseHandler;
|
|
|
|
cb->interim = interim;
|
|
strncpy(cb->lang, lang, MAX_LANG);
|
|
|
|
try {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: initializing gstreamer with %s\n",
|
|
switch_channel_get_name(channel), bugname);
|
|
streamer = new GStreamer(sessionId, bugname, channels, lang, interim, sampleRate, cb->region, subscriptionKey, responseHandler);
|
|
cb->streamer = streamer;
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "azure_transcribe_session_init: config: %s\n", streamer->configuration());
|
|
} catch (std::exception& e) {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing gstreamer: %s.\n",
|
|
switch_channel_get_name(channel), e.what());
|
|
return SWITCH_STATUS_FALSE;
|
|
}
|
|
|
|
if (switch_mutex_init(&cb->mutex, SWITCH_MUTEX_NESTED, pool) != SWITCH_STATUS_SUCCESS) {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing mutex\n");
|
|
status = SWITCH_STATUS_FALSE;
|
|
goto done;
|
|
}
|
|
|
|
/* determine if we need to resample the audio to 16-bit 8khz */
|
|
if (sampleRate != 8000) {
|
|
cb->resampler = speex_resampler_init(1, sampleRate, 8000, SWITCH_RESAMPLE_QUALITY, &err);
|
|
if (0 != err) {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing resampler: %s.\n",
|
|
switch_channel_get_name(channel), speex_resampler_strerror(err));
|
|
status = SWITCH_STATUS_FALSE;
|
|
goto done;
|
|
}
|
|
}
|
|
|
|
// allocate vad if we are delaying connecting to the recognizer until we detect speech
|
|
if (switch_channel_var_true(channel, "START_RECOGNIZING_ON_VAD")) {
|
|
cb->vad = switch_vad_init(sampleRate, 1);
|
|
if (cb->vad) {
|
|
const char* var;
|
|
int mode = 2;
|
|
int silence_ms = 150;
|
|
int voice_ms = 250;
|
|
int debug = 0;
|
|
|
|
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_MODE")) {
|
|
mode = atoi(var);
|
|
}
|
|
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_SILENCE_MS")) {
|
|
silence_ms = atoi(var);
|
|
}
|
|
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_VOICE_MS")) {
|
|
voice_ms = atoi(var);
|
|
}
|
|
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_DEBUG")) {
|
|
debug = atoi(var);
|
|
}
|
|
switch_vad_set_mode(cb->vad, mode);
|
|
switch_vad_set_param(cb->vad, "silence_ms", silence_ms);
|
|
switch_vad_set_param(cb->vad, "voice_ms", voice_ms);
|
|
switch_vad_set_param(cb->vad, "debug", debug);
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: delaying connection until vad, voice_ms %d, mode %d\n",
|
|
switch_channel_get_name(channel), voice_ms, mode);
|
|
}
|
|
}
|
|
if (!cb->vad) streamer->connect();
|
|
done:
|
|
*ppUserData = cb;
|
|
cb->is_keep_alive = 0;
|
|
return status;
|
|
}
|
|
|
|
switch_status_t azure_transcribe_session_stop(switch_core_session_t *session, int channelIsClosing, char* bugname) {
|
|
switch_channel_t *channel = switch_core_session_get_channel(session);
|
|
switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);
|
|
|
|
if (bug) {
|
|
struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);
|
|
switch_status_t st;
|
|
|
|
if (use_single_connection && !channelIsClosing) {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "azure_transcribe_session_stop: call is running, use_single_connection is true, keep alive is activated\n");
|
|
cb->is_keep_alive = 1;
|
|
return SWITCH_STATUS_SUCCESS;
|
|
}
|
|
|
|
// close connection and get final responses
|
|
switch_mutex_lock(cb->mutex);
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "azure_transcribe_session_stop: locked session\n");
|
|
|
|
switch_channel_set_private(channel, bugname, NULL);
|
|
if (!channelIsClosing) switch_core_media_bug_remove(session, &bug);
|
|
|
|
GStreamer* streamer = (GStreamer *) cb->streamer;
|
|
if (streamer) reaper(cb);
|
|
killcb(cb);
|
|
switch_mutex_unlock(cb->mutex);
|
|
switch_mutex_destroy(cb->mutex);
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "azure_transcribe_session_stop: unlocked session\n");
|
|
|
|
return SWITCH_STATUS_SUCCESS;
|
|
}
|
|
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
|
|
return SWITCH_STATUS_FALSE;
|
|
}
|
|
|
|
switch_bool_t azure_transcribe_frame(switch_media_bug_t *bug, void* user_data) {
|
|
switch_core_session_t *session = switch_core_media_bug_get_session(bug);
|
|
uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
|
|
switch_frame_t frame = {};
|
|
struct cap_cb *cb = (struct cap_cb *) user_data;
|
|
|
|
frame.data = data;
|
|
frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;
|
|
if (cb->is_keep_alive) {
|
|
// remove media bug buffered data
|
|
while (true) {
|
|
unsigned char data[SWITCH_RECOMMENDED_BUFFER_SIZE] = {0};
|
|
switch_frame_t frame = { 0 };
|
|
frame.data = data;
|
|
frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;
|
|
switch_status_t rv = switch_core_media_bug_read(bug, &frame, SWITCH_TRUE);
|
|
if (rv != SWITCH_STATUS_SUCCESS) break;
|
|
}
|
|
return SWITCH_TRUE;
|
|
}
|
|
if (switch_mutex_trylock(cb->mutex) == SWITCH_STATUS_SUCCESS) {
|
|
GStreamer* streamer = (GStreamer *) cb->streamer;
|
|
if (streamer) {
|
|
while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS && !switch_test_flag((&frame), SFF_CNG)) {
|
|
if (frame.datalen) {
|
|
if (cb->vad && !streamer->isConnecting()) {
|
|
switch_vad_state_t state = switch_vad_process(cb->vad, (int16_t*) frame.data, frame.samples);
|
|
if (state == SWITCH_VAD_STATE_START_TALKING) {
|
|
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "detected speech, connect to azure speech now\n");
|
|
streamer->connect();
|
|
cb->responseHandler(session, TRANSCRIBE_EVENT_VAD_DETECTED, NULL, cb->bugname, 0);
|
|
}
|
|
}
|
|
|
|
if (cb->resampler) {
|
|
spx_int16_t out[SWITCH_RECOMMENDED_BUFFER_SIZE];
|
|
spx_uint32_t out_len = SWITCH_RECOMMENDED_BUFFER_SIZE;
|
|
spx_uint32_t in_len = frame.samples;
|
|
size_t written;
|
|
|
|
speex_resampler_process_interleaved_int(
|
|
cb->resampler,
|
|
(const spx_int16_t *) frame.data,
|
|
(spx_uint32_t *) &in_len,
|
|
&out[0],
|
|
&out_len);
|
|
streamer->write( &out[0], sizeof(spx_int16_t) * out_len);
|
|
}
|
|
else {
|
|
streamer->write( frame.data, frame.datalen);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
switch_mutex_unlock(cb->mutex);
|
|
}
|
|
return SWITCH_TRUE;
|
|
}
|
|
}
|