// freeswitch-modules/mod_azure_transcribe/azure_transcribe_glue.cpp

#include <cstdlib>

#include <switch.h>
#include <switch_json.h>

#include <string.h>
#include <mutex>
#include <thread>
#include <condition_variable>
#include <string>
#include <sstream>
#include <deque>
#include <memory>
#include <exception>
#include <stdexcept>
#include <vector>

#include <speechapi_cxx.h>

#include "mod_azure_transcribe.h"
#include "simple_buffer.h"

// Size in bytes of one audio chunk: used when flushing pre-connection audio to Azure and
// as the granularity check for audio buffered before the recognizer session has started.
#define CHUNKSIZE (320)
// Initial-silence timeout (milliseconds, as a string) applied when the channel does not
// set AZURE_INITIAL_SPEECH_TIMEOUT_MS.
#define DEFAULT_SPEECH_TIMEOUT "180000"

using namespace Microsoft::CognitiveServices::Speech;
using namespace Microsoft::CognitiveServices::Speech::Audio;

// NOTE(review): not referenced anywhere in the visible part of this file.
const char ALLOC_TAG[] = "drachtio";

// Set by azure_transcribe_init() when AZURE_SUBSCRIPTION_KEY is present in the environment.
static bool hasDefaultCredentials = false;
// Ensures the SDK log filename property is applied only once (on the first streamer created).
static bool sdkInitialized = false;
// Optional SDK log file path; logging is only configured when this is set.
static const char* sdkLog = std::getenv("AZURE_SDK_LOGFILE");
// Optional HTTP proxy settings; the proxy is only used when both IP and port are set.
static const char* proxyIP = std::getenv("JAMBONES_HTTP_PROXY_IP");
static const char* proxyPort = std::getenv("JAMBONES_HTTP_PROXY_PORT");
static const char* proxyUsername = std::getenv("JAMBONES_HTTP_PROXY_USERNAME");
static const char* proxyPassword = std::getenv("JAMBONES_HTTP_PROXY_PASSWORD");
// When true, stopping transcription on a live call keeps the streamer alive (see
// azure_transcribe_session_stop) instead of tearing it down.
static const bool use_single_connection = switch_true(std::getenv("AZURE_SPEECH_USE_SINGLE_CONNECTION"));

class GStreamer {
public:
	GStreamer(
		const char *sessionId,
		const char *bugname,
		u_int16_t channels,
    char *lang,
    int interim,
		uint32_t samples_per_second,
		const char* region,
		const char* subscriptionKey,
		responseHandler_t responseHandler
  ) : m_sessionId(sessionId), m_bugname(bugname), m_finished(false), m_stopped(false), m_interim(interim),
	 m_connected(false), m_connecting(false), m_audioBuffer(320 * (samples_per_second == 8000 ? 1 : 2), 15),
	m_responseHandler(responseHandler) {

		switch_core_session_t* psession = switch_core_session_locate(sessionId);
		if (!psession) throw std::invalid_argument( "session id no longer active" );
		switch_channel_t *channel = switch_core_session_get_channel(psession);

		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::GStreamer(%p) region %s, language %s\n",
			this, region, lang);


		const char* endpoint = switch_channel_get_variable(channel, "AZURE_SERVICE_ENDPOINT");
		const char* endpointId = switch_channel_get_variable(channel, "AZURE_SERVICE_ENDPOINT_ID");

		auto sourceLanguageConfig = SourceLanguageConfig::FromLanguage(lang);
		auto format = AudioStreamFormat::GetWaveFormatPCM(8000, 16, channels);
		auto options = AudioProcessingOptions::Create(AUDIO_INPUT_PROCESSING_ENABLE_DEFAULT);
		auto speechConfig = nullptr != endpoint ?
			(nullptr != subscriptionKey ?
				SpeechConfig::FromEndpoint(endpoint, subscriptionKey) :
				SpeechConfig::FromEndpoint(endpoint)) :
			SpeechConfig::FromSubscription(subscriptionKey, region);
		if (switch_true(switch_channel_get_variable(channel, "AZURE_USE_OUTPUT_FORMAT_DETAILED"))) {
			speechConfig->SetOutputFormat(OutputFormat::Detailed);
		}
		if (nullptr != endpointId) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "setting endpoint id: %s\n", endpointId);
			speechConfig->SetEndpointId(endpointId);
		}
		if (!sdkInitialized && sdkLog) {
			sdkInitialized = true;
			speechConfig->SetProperty(PropertyId::Speech_LogFilename, sdkLog);
		}
		if (switch_true(switch_channel_get_variable(channel, "AZURE_AUDIO_LOGGING"))) {
			speechConfig->EnableAudioLogging();
		}

    if (nullptr != proxyIP && nullptr != proxyPort) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "setting proxy: %s:%s\n", proxyIP, proxyPort);
      speechConfig->SetProxy(proxyIP, atoi(proxyPort), proxyUsername, proxyPassword);
    }

		m_pushStream = AudioInputStream::CreatePushStream(format);
		auto audioConfig = AudioConfig::FromStreamInput(m_pushStream);

    // alternative language
		const char* var;
    if (var = switch_channel_get_variable(channel, "AZURE_SPEECH_ALTERNATIVE_LANGUAGE_CODES")) {
			std::vector<std::string> languages;
			char *alt_langs[3] = { 0 };
      int argc = switch_separate_string((char *) var, ',', alt_langs, 3);

			languages.push_back(lang); // primary language
      for (int i = 0; i < argc; i++) {
				languages.push_back( alt_langs[i]);
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "added alternative lang %s\n", alt_langs[i]);
      }
			auto autoDetectSourceLanguageConfig = AutoDetectSourceLanguageConfig::FromLanguages(languages);
			m_recognizer = SpeechRecognizer::FromConfig(speechConfig, autoDetectSourceLanguageConfig, audioConfig);
    }
		else {
			auto sourceLanguageConfig = SourceLanguageConfig::FromLanguage(lang);
			m_recognizer = SpeechRecognizer::FromConfig(speechConfig, sourceLanguageConfig, audioConfig);
		}


		// set properties
		auto &properties = m_recognizer->Properties;

		// profanity options: Allowed values are "masked", "removed", and "raw".
		const char* profanity = switch_channel_get_variable(channel, "AZURE_PROFANITY_OPTION");
		if (profanity) {
			properties.SetProperty(PropertyId::SpeechServiceResponse_ProfanityOption, profanity);
		}
		// report signal-to-noise ratio
		if (switch_true(switch_channel_get_variable(channel, "AZURE_REQUEST_SNR"))) {
			properties.SetProperty(PropertyId::SpeechServiceResponse_RequestSnr, TrueString);
		}
		// initial speech timeout in milliseconds
		const char* timeout = switch_channel_get_variable(channel, "AZURE_INITIAL_SPEECH_TIMEOUT_MS");
		if (timeout) properties.SetProperty(PropertyId::SpeechServiceConnection_InitialSilenceTimeoutMs, timeout);
		else properties.SetProperty(PropertyId::SpeechServiceConnection_InitialSilenceTimeoutMs, DEFAULT_SPEECH_TIMEOUT);

    const char* segmentationInterval = switch_channel_get_variable(channel, "AZURE_SPEECH_SEGMENTATION_SILENCE_TIMEOUT_MS");
    if (segmentationInterval) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "setting segmentation interval to %s ms\n", segmentationInterval);
      properties.SetProperty(PropertyId::Speech_SegmentationSilenceTimeoutMs, segmentationInterval);
    }

		//https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-identification?tabs=once&pivots=programming-language-cpp#at-start-and-continuous-language-identification
		const char* languageIdMode = switch_channel_get_variable(channel, "AZURE_LANGUAGE_ID_MODE");
		if (languageIdMode) {
			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "setting SpeechServiceConnection_LanguageIdMode to %s \n", languageIdMode);
			properties.SetProperty(PropertyId::SpeechServiceConnection_LanguageIdMode, languageIdMode);
		}

		// recognition mode - readonly according to Azure docs:
		// https://docs.microsoft.com/en-us/javascript/api/microsoft-cognitiveservices-speech-sdk/propertyid?view=azure-node-latest
		/*
		const char* recoMode = switch_channel_get_variable(channel, "AZURE_RECOGNITION_MODE");
		if (recoMode) {
			properties.SetProperty(PropertyId::SpeechServiceConnection_RecoMode, recoMode);
		}
		*/

		// hints
		const char* hints = switch_channel_get_variable(channel, "AZURE_SPEECH_HINTS");
		if (hints) {
			auto grammar = PhraseListGrammar::FromRecognizer(m_recognizer);
			char *phrases[500] = { 0 };
      int argc = switch_separate_string((char *)hints, ',', phrases, 500);
      for (int i = 0; i < argc; i++) {
        grammar->AddPhrase(phrases[i]);
      }
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG, "added %d hints\n", argc);
		}

		auto onSessionStopped = [this](const SessionEventArgs& args) {
			switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
			m_stopped = true;
			if (psession) {
				auto sessionId = args.SessionId;
				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer: got session stopped from microsoft\n");
				switch_core_session_rwunlock(psession);
			}
		};
		auto onSpeechStartDetected = [this, responseHandler](const RecognitionEventArgs& args) {
			switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
			if (psession) {
				auto sessionId = args.SessionId;
				responseHandler(psession, TRANSCRIBE_EVENT_START_OF_UTTERANCE, NULL, m_bugname.c_str(), m_finished);
				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer start of speech\n");
				switch_core_session_rwunlock(psession);
			}
		};
		auto onSpeechEndDetected = [this, responseHandler](const RecognitionEventArgs& args) {
			switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
			if (psession) {
				auto sessionId = args.SessionId;
				responseHandler(psession, TRANSCRIBE_EVENT_END_OF_UTTERANCE, NULL, m_bugname.c_str(), m_finished);
				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer end of speech\n");
				switch_core_session_rwunlock(psession);
			}
		};
		auto onRecognitionEvent = [this, responseHandler](const SpeechRecognitionEventArgs& args) {
			switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
			if (psession) {
				auto result = args.Result;
				auto reason = result->Reason;
				const auto& properties = result->Properties;
				auto json = properties.GetProperty(PropertyId::SpeechServiceResponse_JsonResult);
				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer onRecognitionEvent reason %d results: %s,\n", reason, json.c_str());

				switch (reason) {
					case ResultReason::RecognizingSpeech:
					case ResultReason::RecognizedSpeech:
						// note: interim results don't have "RecognitionStatus": "Success"
						responseHandler(psession, TRANSCRIBE_EVENT_RESULTS, json.c_str(), m_bugname.c_str(), m_finished);
					break;
					case ResultReason::NoMatch:
						responseHandler(psession, TRANSCRIBE_EVENT_NO_SPEECH_DETECTED, json.c_str(), m_bugname.c_str(), m_finished);
					break;

					default:
						switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "GStreamer unexpected result '%s': reason %d\n",
							json.c_str(), reason);
            responseHandler(psession, TRANSCRIBE_EVENT_ERROR, json.c_str(), m_bugname.c_str(), m_finished);

					break;
				}
				switch_core_session_rwunlock(psession);
			}
		};

		auto onCanceled = [this, responseHandler](const SpeechRecognitionCanceledEventArgs& args) {
      if (m_finished) return;
			switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
			if (psession) {
        auto result = args.Result;
        auto details = args.ErrorDetails;
        auto code = args.ErrorCode;
        cJSON* json = cJSON_CreateObject();
        cJSON_AddStringToObject(json, "type", "error");
        cJSON_AddStringToObject(json, "error", details.c_str());
        char* jsonString = cJSON_PrintUnformatted(json);
        responseHandler(psession, TRANSCRIBE_EVENT_ERROR, jsonString, m_bugname.c_str(), m_finished);
        free(jsonString);
        cJSON_Delete(json);
				switch_core_session_rwunlock(psession);
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer recognition canceled, error %d: %s\n", code, details.c_str());
      }
		};

		m_recognizer->SessionStopped += onSessionStopped;
		m_recognizer->SpeechStartDetected += onSpeechStartDetected;
		m_recognizer->SpeechEndDetected += onSpeechEndDetected;
		if (interim) m_recognizer->Recognizing += onRecognitionEvent;
		m_recognizer->Recognized += onRecognitionEvent;
		m_recognizer->Canceled += onCanceled;

		// Store the final configuration string
    m_configuration_string = createConfigurationStr(channels, lang, interim, samples_per_second, region, subscriptionKey, psession);

		switch_core_session_rwunlock(psession);
	}

	~GStreamer() {
		//switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::~GStreamer %p\n", this);
	}

	const char* configuration() {
		return m_configuration_string.c_str();
	}

	void connect() {
		if (m_connecting) return;
		m_connecting = true;

		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer:connect %p connecting to azure speech..\n", this);

		auto onSessionStarted = [this](const SessionEventArgs& args) {
			m_connected = true;
			switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
			if (psession) {
				auto sessionId = args.SessionId;
				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer got session started from microsoft\n");

				// send any buffered audio
				int nFrames = m_audioBuffer.getNumItems();
				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p got session started from azure, %d buffered frames\n", this, nFrames);
				if (nFrames) {
					char *p;
					do {
						p = m_audioBuffer.getNextChunk();
						if (p) {
							write(p, CHUNKSIZE);
						}
					} while (p);
				}
				switch_core_session_rwunlock(psession);
			}
		};
		m_recognizer->SessionStarted += onSessionStarted;
		m_recognizer->StartContinuousRecognitionAsync();

	}

	bool write(void* data, uint32_t datalen) {
		if (m_finished) {
			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::write not writing because we are finished, %p\n", this);
			return false;
		}
		if (!m_connected) {
      if (datalen % CHUNKSIZE == 0) {
        m_audioBuffer.add(data, datalen);
      }
      return true;
    }

    m_pushStream->Write(static_cast<uint8_t*>(data), datalen);
		return true;
	}

	void finish() {
		if (m_finished) return;
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer::finish - calling  StopContinuousRecognitionAsync (%p)\n", this);
		m_finished = true;
		m_recognizer->StopContinuousRecognitionAsync().get();
		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "GStreamer::finish - recognition has completed (%p)\n", this);
	}

	bool isStopped() {
		return m_stopped;
	}

	bool isConnecting() {
    return m_connecting;
  }

	bool hasConfigurationChanged(
		u_int16_t channels,
		char *lang,
		int interim,
		uint32_t samples_per_second,
		const char* region,
		const char* subscriptionKey) {
		switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
		if (!psession) throw std::invalid_argument( "session id no longer active" );

		std::string newConfiguration = createConfigurationStr(channels, lang, interim, samples_per_second, region, subscriptionKey, psession);

		switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG,
			"hasConfigurationChanged: old configurattion: %s, new configuration: %s\n", configuration(),  newConfiguration.c_str());

		switch_core_session_rwunlock(psession);

		return strcmp(newConfiguration.c_str(), configuration());
	}

private:
	std::string m_sessionId;
	std::string m_bugname;
	std::string  m_region;
	std::shared_ptr<SpeechRecognizer> m_recognizer;
	std::shared_ptr<PushAudioInputStream> m_pushStream;
  std::string m_configuration_string;
	responseHandler_t m_responseHandler;
	bool m_interim;
	bool m_finished;
	bool m_connected;
	bool m_connecting;
	bool m_stopped;
	SimpleBuffer m_audioBuffer;

	std::string createConfigurationStr(
		u_int16_t channels,
		char *lang,
		int interim,
		uint32_t samples_per_second,
		const char* region,
		const char* subscriptionKey,
		switch_core_session_t* psession
	) {
		switch_channel_t *channel = switch_core_session_get_channel(psession);
		std::ostringstream configuration_stream;
		configuration_stream <<
			channels << ";" <<
			lang << ";" <<
			interim << ";" <<
			samples_per_second << ";" <<
			region << ";" <<
			subscriptionKey << ";";

		const char* endpoint = switch_channel_get_variable(channel, "AZURE_SERVICE_ENDPOINT");
		const char* endpointId = switch_channel_get_variable(channel, "AZURE_SERVICE_ENDPOINT_ID");
		configuration_stream <<
			endpoint << ";" <<
			endpointId << ";";
		if (switch_true(switch_channel_get_variable(channel, "AZURE_USE_OUTPUT_FORMAT_DETAILED"))) {
			configuration_stream << "output_format_detailed;";
		}
		if (switch_true(switch_channel_get_variable(channel, "AZURE_AUDIO_LOGGING"))) {
			configuration_stream << "audio_logging;";
		}
		if (nullptr != proxyIP && nullptr != proxyPort) {
			configuration_stream <<
				proxyIP << ";" <<
				proxyPort << ";" <<
				proxyUsername << ";" <<
				proxyPassword << ";";
		}
		const char* var;
		if (var = switch_channel_get_variable(channel, "AZURE_SPEECH_ALTERNATIVE_LANGUAGE_CODES")) {
			configuration_stream << var << ";";
		}
		if (var = switch_channel_get_variable(channel, "AZURE_PROFANITY_OPTION")) {
			configuration_stream << var << ";";
		}
		if (var = switch_channel_get_variable(channel, "AZURE_REQUEST_SNR")) {
			configuration_stream << var << ";";
		}
		if (var = switch_channel_get_variable(channel, "AZURE_INITIAL_SPEECH_TIMEOUT_MS")) {
			configuration_stream << var << ";";
		}
		if (var = switch_channel_get_variable(channel, "AZURE_SPEECH_SEGMENTATION_SILENCE_TIMEOUT_MS")) {
			configuration_stream << var << ";";
		}
		if (var = switch_channel_get_variable(channel, "AZURE_LANGUAGE_ID_MODE")) {
			configuration_stream << var << ";";
		}
		if (var = switch_channel_get_variable(channel, "AZURE_SPEECH_HINTS")) {
			configuration_stream << var << ";";
		}
		return configuration_stream.str();
	}
};

static void reaper(struct cap_cb *cb) {
	std::shared_ptr<GStreamer> pStreamer;
	pStreamer.reset((GStreamer *)cb->streamer);
	cb->streamer = nullptr;

	std::thread t([pStreamer]{
		pStreamer->finish();
	});
	t.detach();
}

static void killcb(struct cap_cb* cb) {
	if (cb) {
		if (cb->streamer) {
			GStreamer* p = (GStreamer *) cb->streamer;
			delete p;
			cb->streamer = NULL;
		}
		if (cb->resampler) {
				speex_resampler_destroy(cb->resampler);
				cb->resampler = NULL;
		}
		if (cb->vad) {
			switch_vad_destroy(&cb->vad);
			cb->vad = nullptr;
		}
	}
}

extern "C" {
	/**
	 * Module-level initialization: records whether a default subscription key is
	 * available from the environment and logs a notice when it is not.
	 * Always returns SWITCH_STATUS_SUCCESS.
	 */
	switch_status_t azure_transcribe_init() {
		if (nullptr != std::getenv("AZURE_SUBSCRIPTION_KEY")) {
			hasDefaultCredentials = true;
		}
		else {
			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE,
				"\"AZURE_SUBSCRIPTION_KEY\"  env var not set; authentication will expect channel variables of same names to be set\n");
		}
		return SWITCH_STATUS_SUCCESS;
	}

	// Module-level teardown hook; currently performs no work and always reports success.
	switch_status_t azure_transcribe_cleanup() {
		return SWITCH_STATUS_SUCCESS;
	}

	/**
	 * Start transcribe on a channel.
	 *
	 * Credentials are resolved once: channel variables AZURE_SUBSCRIPTION_KEY/AZURE_REGION
	 * when both are set, otherwise the environment variables of the same names when both
	 * are set. The same resolved values are used for a new streamer and for the
	 * configuration comparison on the reuse path.
	 *
	 * If a media bug named bugname is already attached to the channel, its streamer is
	 * reused when the configuration is unchanged; otherwise a new streamer is built and
	 * swapped in, and the old one is finished in the background. If building the
	 * replacement fails, the existing state is left untouched.
	 *
	 * @param ppUserData receives the control block to associate with the media bug
	 * @return SWITCH_STATUS_SUCCESS, or SWITCH_STATUS_FALSE on any initialization error
	 */
	switch_status_t azure_transcribe_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
          uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char* bugname, void **ppUserData
	) {
		GStreamer *streamer = NULL;
		switch_status_t status = SWITCH_STATUS_SUCCESS;
		switch_channel_t *channel = switch_core_session_get_channel(session);
		switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);
		const char* subscriptionKey = switch_channel_get_variable(channel, "AZURE_SUBSCRIPTION_KEY");
		const char* region = switch_channel_get_variable(channel, "AZURE_REGION");
		const char* envKey = std::getenv("AZURE_SUBSCRIPTION_KEY");
		const char* envRegion = std::getenv("AZURE_REGION");
		const char* sessionId = switch_core_session_get_uuid(session);
		auto read_codec = switch_core_session_get_read_codec(session);
		uint32_t sampleRate = read_codec->implementation->actual_samples_per_second;

		// Resolve the credentials actually handed to the streamer. The key stays null when
		// none is available (endpoint-only configuration); the region falls back to "".
		const bool useChannelCreds = subscriptionKey && region;
		const bool useEnvCreds = !useChannelCreds && envKey && envRegion;
		const char* effectiveKey = subscriptionKey ? subscriptionKey : (useEnvCreds ? envKey : nullptr);
		const char* effectiveRegion = useChannelCreds ? region : (useEnvCreds ? envRegion : "");

		if (bug) {
			struct cap_cb* existing_cb = (struct cap_cb*) switch_core_media_bug_get_user_data(bug);
			GStreamer* existing_streamer = (GStreamer*) existing_cb->streamer;

			// C++ exceptions must not escape through this extern "C" entry point.
			try {
				if (existing_streamer &&
					!existing_streamer->hasConfigurationChanged(channels, lang, interim, sampleRate, effectiveRegion, effectiveKey)) {
					existing_cb->is_keep_alive = 0;
					switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Reuse active azure connection.\n");
					return SWITCH_STATUS_SUCCESS;
				}
				switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Azure configuration is changed, destroy old and create new azure connection\n");
				// build the replacement first so that a failure leaves the old streamer in place
				streamer = new GStreamer(sessionId, bugname, channels, lang, interim, sampleRate, effectiveRegion, effectiveKey, responseHandler);
			} catch (const std::exception& e) {
				switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing gstreamer: %s.\n",
					switch_channel_get_name(channel), e.what());
				return SWITCH_STATUS_FALSE;
			}

			// Swap under the control-block mutex: the frame callback uses cb->streamer
			// while holding it, and the reaped streamer is destroyed asynchronously.
			switch_mutex_lock(existing_cb->mutex);
			if (existing_cb->streamer) reaper(existing_cb);
			existing_cb->streamer = streamer;
			existing_cb->is_keep_alive = 0;
			switch_mutex_unlock(existing_cb->mutex);

			if (!existing_cb->vad) streamer->connect();
			*ppUserData = existing_cb;
			return SWITCH_STATUS_SUCCESS;
		}

		int err;
		switch_memory_pool_t *pool = switch_core_session_get_pool(session);
		struct cap_cb* cb = (struct cap_cb *) switch_core_session_alloc(session, sizeof(*cb));
		memset(cb, 0, sizeof(*cb));
		cb->channels = channels;
		strncpy(cb->sessionId, sessionId, MAX_SESSION_ID);
		strncpy(cb->bugname, bugname, MAX_BUG_LEN);

		if (useChannelCreds) {
			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Using channel vars for azure authentication\n");
			strncpy(cb->subscriptionKey, subscriptionKey, MAX_SUBSCRIPTION_KEY_LEN);
			strncpy(cb->region, region, MAX_REGION);
		}
		else if (useEnvCreds) {
			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Using env vars for azure authentication\n");
			strncpy(cb->subscriptionKey, envKey, MAX_SUBSCRIPTION_KEY_LEN);
			strncpy(cb->region, envRegion, MAX_REGION);
		}
		else {
			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "No channel vars or env vars for azure authentication..will use default profile if found\n");
		}

		cb->responseHandler = responseHandler;

		cb->interim = interim;
		strncpy(cb->lang, lang, MAX_LANG);

		try {
			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: initializing gstreamer with %s\n",
					switch_channel_get_name(channel), bugname);
			streamer = new GStreamer(sessionId, bugname, channels, lang, interim, sampleRate, effectiveRegion, effectiveKey, responseHandler);
			cb->streamer = streamer;
			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "azure_transcribe_session_init: config: %s\n", streamer->configuration());
		} catch (const std::exception& e) {
			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing gstreamer: %s.\n",
				switch_channel_get_name(channel), e.what());
			return SWITCH_STATUS_FALSE;
		}

		if (switch_mutex_init(&cb->mutex, SWITCH_MUTEX_NESTED, pool) != SWITCH_STATUS_SUCCESS) {
			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing mutex\n");
			status = SWITCH_STATUS_FALSE;
			goto done;
		}

		/* determine if we need to resample the audio to 16-bit 8khz */
		if (sampleRate != 8000) {
			cb->resampler = speex_resampler_init(1, sampleRate, 8000, SWITCH_RESAMPLE_QUALITY, &err);
			if (0 != err) {
				switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing resampler: %s.\n",
							switch_channel_get_name(channel), speex_resampler_strerror(err));
				status = SWITCH_STATUS_FALSE;
				goto done;
			}
		}

		// allocate vad if we are delaying connecting to the recognizer until we detect speech
		if (switch_channel_var_true(channel, "START_RECOGNIZING_ON_VAD")) {
			cb->vad = switch_vad_init(sampleRate, 1);
			if (cb->vad) {
				const char* var;
				int mode = 2;
				int silence_ms = 150;
				int voice_ms = 250;
				int debug = 0;

				if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_MODE"))) {
					mode = atoi(var);
				}
				if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_SILENCE_MS"))) {
					silence_ms = atoi(var);
				}
				if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_VOICE_MS"))) {
					voice_ms = atoi(var);
				}
				if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_DEBUG"))) {
					debug = atoi(var);
				}
				switch_vad_set_mode(cb->vad, mode);
				switch_vad_set_param(cb->vad, "silence_ms", silence_ms);
				switch_vad_set_param(cb->vad, "voice_ms", voice_ms);
				switch_vad_set_param(cb->vad, "debug", debug);
				switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: delaying connection until vad, voice_ms %d, mode %d\n",
					switch_channel_get_name(channel), voice_ms, mode);
			}
		}
		if (!cb->vad) streamer->connect();
	done:
		// on failure release the streamer/resampler created above so they are not leaked
		if (status != SWITCH_STATUS_SUCCESS) killcb(cb);
		*ppUserData = cb;
		cb->is_keep_alive = 0;
		return status;
	}

	/**
	 * Stops transcription for the media bug named bugname on this session.
	 *
	 * In single-connection mode, while the call is still up, the streamer is kept and the
	 * control block is merely flagged keep-alive. Otherwise the bug is detached, the
	 * streamer is finished in the background and the control block's resources are freed.
	 *
	 * @param channelIsClosing non-zero when invoked because the channel is going away
	 * @return SWITCH_STATUS_SUCCESS, or SWITCH_STATUS_FALSE when no such bug is attached
	 */
	switch_status_t azure_transcribe_session_stop(switch_core_session_t *session, int channelIsClosing, char* bugname) {
		switch_channel_t *channel = switch_core_session_get_channel(session);
		switch_media_bug_t *bug = (switch_media_bug_t*) switch_channel_get_private(channel, bugname);

		if (!bug) {
			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
			return SWITCH_STATUS_FALSE;
		}

		struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);

		const bool keepConnection = use_single_connection && !channelIsClosing;
		if (keepConnection) {
			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "azure_transcribe_session_stop: call is running, use_single_connection is true, keep alive is activated\n");
			cb->is_keep_alive = 1;
			return SWITCH_STATUS_SUCCESS;
		}

		// close connection and get final responses
		switch_mutex_lock(cb->mutex);
		switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "azure_transcribe_session_stop: locked session\n");

		// forget the bug on the channel first, then detach it unless the channel is closing
		switch_channel_set_private(channel, bugname, NULL);
		if (!channelIsClosing) {
			switch_core_media_bug_remove(session, &bug);
		}

		// hand the streamer to the reaper (if any), then release the remaining resources
		if (cb->streamer) {
			reaper(cb);
		}
		killcb(cb);

		switch_mutex_unlock(cb->mutex);
		switch_mutex_destroy(cb->mutex);
		switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "azure_transcribe_session_stop: unlocked session\n");

		return SWITCH_STATUS_SUCCESS;
	}

	/**
	 * Media bug read callback: drains the audio available on the bug and forwards it to
	 * the streamer, resampling first when a resampler is configured.
	 *
	 * In keep-alive mode the audio is read and discarded. When VAD is configured, the
	 * streamer is connected on the first detected start of speech. If the control-block
	 * mutex is busy the call does nothing. Always returns SWITCH_TRUE.
	 */
	switch_bool_t azure_transcribe_frame(switch_media_bug_t *bug, void* user_data) {
		switch_core_session_t *session = switch_core_media_bug_get_session(bug);
		struct cap_cb *cb = (struct cap_cb *) user_data;

		if (cb->is_keep_alive) {
			// remove media bug buffered data
			for (;;) {
				unsigned char discard[SWITCH_RECOMMENDED_BUFFER_SIZE] = {0};
				switch_frame_t scratch = { 0 };
				scratch.data = discard;
				scratch.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;
				if (switch_core_media_bug_read(bug, &scratch, SWITCH_TRUE) != SWITCH_STATUS_SUCCESS) break;
			}
			return SWITCH_TRUE;
		}

		// never block the media path: skip this round if someone else holds the mutex
		if (switch_mutex_trylock(cb->mutex) != SWITCH_STATUS_SUCCESS) {
			return SWITCH_TRUE;
		}

		GStreamer* streamer = (GStreamer *) cb->streamer;
		if (streamer) {
			uint8_t pcm[SWITCH_RECOMMENDED_BUFFER_SIZE];
			switch_frame_t frame = {};
			frame.data = pcm;
			frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;

			while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS && !switch_test_flag((&frame), SFF_CNG)) {
				if (!frame.datalen) continue;

				// with vad enabled, connect only once speech has been detected
				if (cb->vad && !streamer->isConnecting()) {
					const switch_vad_state_t vadState = switch_vad_process(cb->vad, (int16_t*) frame.data, frame.samples);
					if (vadState == SWITCH_VAD_STATE_START_TALKING) {
						switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "detected speech, connect to azure speech now\n");
						streamer->connect();
						cb->responseHandler(session, TRANSCRIBE_EVENT_VAD_DETECTED, NULL, cb->bugname, 0);
					}
				}

				if (!cb->resampler) {
					streamer->write( frame.data, frame.datalen);
					continue;
				}

				spx_int16_t resampled[SWITCH_RECOMMENDED_BUFFER_SIZE];
				spx_uint32_t resampledLen = SWITCH_RECOMMENDED_BUFFER_SIZE;
				spx_uint32_t inputLen = frame.samples;

				speex_resampler_process_interleaved_int(
					cb->resampler,
					(const spx_int16_t *) frame.data,
					(spx_uint32_t *) &inputLen,
					&resampled[0],
					&resampledLen);
				streamer->write( &resampled[0], sizeof(spx_int16_t) * resampledLen);
			}
		}

		switch_mutex_unlock(cb->mutex);
		return SWITCH_TRUE;
	}
}