eliminate support for multiple lws threads as part of fixing valgrind errors

Signed-off-by: Dave Horton <daveh@beachdognet.com>
This commit is contained in:
Dave Horton
2023-12-26 10:57:15 -05:00
parent a2324972eb
commit 420e51eac7
140 changed files with 19851 additions and 0 deletions

BIN
mod_google_transcribe/.DS_Store vendored Normal file

Binary file not shown.

View File mod_google_transcribe/LICENSE

@@ -0,0 +1,8 @@
Copyright 2023, Drachtio Communications Services, LLC
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File mod_google_transcribe/Makefile.am

@@ -0,0 +1,10 @@
include $(top_srcdir)/build/modmake.rulesam
MODNAME=mod_google_transcribe
mod_LTLIBRARIES = mod_google_transcribe.la
mod_google_transcribe_la_SOURCES = mod_google_transcribe.c google_glue.cpp
mod_google_transcribe_la_CFLAGS = $(AM_CFLAGS)
mod_google_transcribe_la_CXXFLAGS = -I $(top_srcdir)/libs/googleapis/gens $(AM_CXXFLAGS) -std=c++17
mod_google_transcribe_la_LIBADD = $(switch_builddir)/libfreeswitch.la
mod_google_transcribe_la_LDFLAGS = -avoid-version -module -no-undefined -shared `pkg-config --libs grpc++ grpc`

View File mod_google_transcribe/README.md

@@ -0,0 +1,101 @@
# mod_google_transcribe
A Freeswitch module that generates real-time transcriptions on a Freeswitch channel by using Google's Speech-to-Text API.
Optionally, the connection to the google cloud recognizer can be delayed until voice activity has been detected. This can be useful when you want to minimize the cost of streaming audio for transcription. This behavior is governed by the channel variables starting with `RECOGNIZER_VAD`, as described below.
## API
### Commands
The Freeswitch module exposes two versions of an API command to transcribe speech:
#### version 1
```bash
uuid_google_transcribe <uuid> start <lang-code> [interim]
```
When using this command, additional speech processing options can be provided through Freeswitch channel variables, described [below](#command-variables).
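For example, options can be set with `uuid_setvar` before starting transcription (an illustrative `fs_cli` session; the uuid and values are examples only):
```bash
bgapi uuid_setvar 312033b6-4b2a-48d8-be0c-5f161aec2b3e GOOGLE_SPEECH_MODEL phone_call
bgapi uuid_setvar 312033b6-4b2a-48d8-be0c-5f161aec2b3e GOOGLE_SPEECH_HINTS yes,no,maybe
bgapi uuid_google_transcribe 312033b6-4b2a-48d8-be0c-5f161aec2b3e start en-US interim
```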
#### version 2
```bash
uuid_google_transcribe2 <uuid> start <lang-code> [interim](bool) \
[single-utterance](bool) [separate-recognition](bool) [max-alternatives](int) \
[profanity-filter](bool) [word-time](bool) [punctuation](bool) \
[sample-rate](int) [model](string) [enhanced](bool) [hints](words separated by , with no spaces) \
[play-file](path of audio file to play)
```
This command allows speech processing options to be provided on the command line, and has the ability to optionally play an audio file as a prompt.
Example:
```bash
bgapi uuid_google_transcribe2 312033b6-4b2a-48d8-be0c-5f161aec2b3e start en-US \
true true true 5 true true true 8000 command_and_search true \
yes,no,hello https://www2.cs.uic.edu/~i101/SoundFiles/CantinaBand60.wav
```
Attaches a media bug to the channel and performs a streaming recognize request.
- `uuid` - unique identifier of Freeswitch channel
- `lang-code` - a valid Google [language code](https://cloud.google.com/speech-to-text/docs/languages) to use for speech recognition
- `interim` - If the 'interim' keyword is present, both interim and final transcription results are returned; otherwise only final transcriptions are returned
```bash
uuid_google_transcribe <uuid> stop
```
Stop transcription on the channel.
### Command Variables
Additional google speech options can be set through Freeswitch channel variables when using `uuid_google_transcribe` (some can alternatively be set on the command line with `uuid_google_transcribe2`).
| Variable | Description |
| --- | ----------- |
| GOOGLE_SPEECH_SINGLE_UTTERANCE | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.StreamingRecognitionConfig.FIELDS.bool.google.cloud.speech.v1.StreamingRecognitionConfig.single_utterance) |
| GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.enable_separate_recognition_per_channel) |
| GOOGLE_SPEECH_MAX_ALTERNATIVES | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.int32.google.cloud.speech.v1.RecognitionConfig.max_alternatives) |
| GOOGLE_SPEECH_PROFANITY_FILTER | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.profanity_filter) |
| GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.enable_word_time_offsets) |
| GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.enable_automatic_punctuation) |
| GOOGLE_SPEECH_MODEL | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.string.google.cloud.speech.v1.RecognitionConfig.model) |
| GOOGLE_SPEECH_USE_ENHANCED | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.use_enhanced) |
| GOOGLE_SPEECH_HINTS | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.PhraseSet) |
| GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES | a comma-separated list of language codes, [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionConfig.FIELDS.repeated.string.google.cloud.speech.v1p1beta1.RecognitionConfig.alternative_language_codes) |
| GOOGLE_SPEECH_SPEAKER_DIARIZATION | set to 1 to enable [speaker diarization](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.SpeakerDiarizationConfig) |
| GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.SpeakerDiarizationConfig) |
| GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.SpeakerDiarizationConfig) |
| GOOGLE_SPEECH_METADATA_INTERACTION_TYPE | set to 'discussion', 'presentation', 'phone_call', 'voicemail', 'professionally_produced', 'voice_search', 'voice_command', or 'dictation' [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType) |
| GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata) |
| GOOGLE_SPEECH_METADATA_MICROPHONE_DISTANCE | set to 'nearfield', 'midfield', or 'farfield' [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance) |
| GOOGLE_SPEECH_METADATA_ORIGINAL_MEDIA_TYPE | set to 'audio', or 'video' [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata.OriginalMediaType) |
| GOOGLE_SPEECH_METADATA_RECORDING_DEVICE_TYPE | set to 'smartphone', 'pc', 'phone_line', 'vehicle', 'other_outdoor_device', or 'other_indoor_device' [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType)|
| START_RECOGNIZING_ON_VAD | if set to 1 or true, do not begin streaming audio to google cloud until voice activity is detected (see the example below).|
| RECOGNIZER_VAD_MODE | An integer value 0-3, from least to most aggressive VAD detection (default: 2).|
| RECOGNIZER_VAD_VOICE_MS | The number of milliseconds of voice activity required to trigger the connection to google cloud, when START_RECOGNIZING_ON_VAD is set (default: 250).|
| RECOGNIZER_VAD_SILENCE_MS | The number of milliseconds of silence before the VAD decides that speech has ended (default: 150).|
| RECOGNIZER_VAD_DEBUG | if > 0, VAD debug logs are generated (default: 0).|
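For example, to defer the connection to google cloud until roughly half a second of speech has been heard (an illustrative `fs_cli` session; the values are examples, not recommendations):
```bash
bgapi uuid_setvar <uuid> START_RECOGNIZING_ON_VAD true
bgapi uuid_setvar <uuid> RECOGNIZER_VAD_MODE 3
bgapi uuid_setvar <uuid> RECOGNIZER_VAD_VOICE_MS 500
bgapi uuid_google_transcribe <uuid> start en-US interim
```
When speech is detected, the module fires a `google_transcribe::vad_detected` event and begins streaming audio to the recognizer.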
### Events
**google_transcribe::transcription** - returns an interim or final transcription. The event contains a JSON body describing the transcription result:
```js
{
"stability": 0,
"is_final": true,
"alternatives": [{
"confidence": 0.96471,
"transcript": "Donny was a good bowler, and a good man"
}]
}
```
**google_transcribe::end_of_utterance** - returns an indication that an utterance has been detected. This may be returned prior to a final transcription. This event is only returned when GOOGLE_SPEECH_SINGLE_UTTERANCE is set to true.
**google_transcribe::end_of_transcript** - returned when a transcription operation has completed. If a final transcription has not been returned by now, it won't be. This event is only returned when GOOGLE_SPEECH_SINGLE_UTTERANCE is set to true.
**google_transcribe::no_audio_detected** - returned when google has returned an error indicating that no audio was received for a lengthy period of time.
**google_transcribe::max_duration_exceeded** - returned when google has returned an indication that a long-running transcription has been stopped due to a max duration limit (305 seconds) on their side. It is the application's responsibility to respond by starting a new transcription session, if desired.
## Usage
When using [drachtio-fsmrf](https://www.npmjs.com/package/drachtio-fsmrf), you can access this API command via the `api` method on the 'endpoint' object.
```js
ep.api('uuid_google_transcribe', `${ep.uuid} start en-US`);
```
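The transcription can later be stopped the same way:
```js
ep.api('uuid_google_transcribe', `${ep.uuid} stop`);
```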
## Examples
[google_transcribe.js](../../examples/google_transcribe.js)

View File mod_google_transcribe/google_glue.cpp

@@ -0,0 +1,727 @@
#include <cstdlib>
#include <algorithm>
#include <future>
#include <switch.h>
#include <switch_json.h>
#include <grpc++/grpc++.h>
#include "google/cloud/speech/v1p1beta1/cloud_speech.grpc.pb.h"
#include "mod_google_transcribe.h"
#include "simple_buffer.h"
using google::cloud::speech::v1p1beta1::RecognitionConfig;
using google::cloud::speech::v1p1beta1::Speech;
using google::cloud::speech::v1p1beta1::SpeechContext;
using google::cloud::speech::v1p1beta1::StreamingRecognizeRequest;
using google::cloud::speech::v1p1beta1::StreamingRecognizeResponse;
using google::cloud::speech::v1p1beta1::SpeakerDiarizationConfig;
using google::cloud::speech::v1p1beta1::SpeechAdaptation;
using google::cloud::speech::v1p1beta1::PhraseSet;
using google::cloud::speech::v1p1beta1::PhraseSet_Phrase;
using google::cloud::speech::v1p1beta1::RecognitionMetadata;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_DISCUSSION;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_PRESENTATION;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_PHONE_CALL;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_VOICEMAIL;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_PROFESSIONALLY_PRODUCED;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_VOICE_SEARCH;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_VOICE_COMMAND;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_DICTATION;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_MicrophoneDistance_NEARFIELD;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_MicrophoneDistance_MIDFIELD;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_MicrophoneDistance_FARFIELD;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_OriginalMediaType_AUDIO;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_OriginalMediaType_VIDEO;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_SMARTPHONE;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_PC;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_PHONE_LINE;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_VEHICLE;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_OTHER_OUTDOOR_DEVICE;
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_OTHER_INDOOR_DEVICE;
using google::cloud::speech::v1p1beta1::StreamingRecognizeResponse_SpeechEventType_END_OF_SINGLE_UTTERANCE;
using google::rpc::Status;
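// CHUNKSIZE: 320 bytes of LINEAR16 audio = 160 samples, i.e. one 20ms frame at the module's default 8kHz sample rate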
#define CHUNKSIZE (320)
namespace {
int case_insensitive_match(std::string s1, std::string s2) {
std::transform(s1.begin(), s1.end(), s1.begin(), ::tolower);
std::transform(s2.begin(), s2.end(), s2.begin(), ::tolower);
if (s1.compare(s2) == 0)
return 1; // the strings are the same
return 0; // not matched
}
}
class GStreamer;
class GStreamer {
public:
GStreamer(
switch_core_session_t *session,
uint32_t channels,
char* lang,
int interim,
uint32_t config_sample_rate,
uint32_t samples_per_second,
int single_utterance,
int separate_recognition,
int max_alternatives,
int profanity_filter,
int word_time_offset,
int punctuation,
const char* model,
int enhanced,
const char* hints) : m_session(session), m_writesDone(false), m_connected(false),
m_audioBuffer(CHUNKSIZE, 15) {
const char* var;
const char* google_uri;
switch_channel_t *channel = switch_core_session_get_channel(session);
if (!(google_uri = switch_channel_get_variable(channel, "GOOGLE_SPEECH_TO_TEXT_URI"))) {
google_uri = "speech.googleapis.com";
}
if (var = switch_channel_get_variable(channel, "GOOGLE_APPLICATION_CREDENTIALS")) {
auto channelCreds = grpc::SslCredentials(grpc::SslCredentialsOptions());
auto callCreds = grpc::ServiceAccountJWTAccessCredentials(var);
auto creds = grpc::CompositeChannelCredentials(channelCreds, callCreds);
m_channel = grpc::CreateChannel(google_uri, creds);
}
else {
auto creds = grpc::GoogleDefaultCredentials();
m_channel = grpc::CreateChannel(google_uri, creds);
}
m_stub = Speech::NewStub(m_channel);
auto* streaming_config = m_request.mutable_streaming_config();
RecognitionConfig* config = streaming_config->mutable_config();
streaming_config->set_interim_results(interim);
if (single_utterance == 1) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_single_utterance\n");
streaming_config->set_single_utterance(true);
}
else {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_single_utterance is FALSE\n");
streaming_config->set_single_utterance(false);
}
config->set_language_code(lang);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "transcribe language %s \n", lang);
config->set_sample_rate_hertz(config_sample_rate);
config->set_encoding(RecognitionConfig::LINEAR16);
// the rest of config comes from channel vars
// number of channels in the audio stream (default: 1)
if (channels > 1) {
config->set_audio_channel_count(channels);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "audio_channel_count %d\n", channels);
// transcribe each separately?
if (separate_recognition == 1) {
config->set_enable_separate_recognition_per_channel(true);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_separate_recognition_per_channel on\n");
}
}
// max alternatives
if (max_alternatives > 1) {
config->set_max_alternatives(max_alternatives);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "max_alternatives %d\n", max_alternatives);
}
// profanity filter
if (profanity_filter == 1) {
config->set_profanity_filter(true);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "profanity_filter\n");
}
// enable word offsets
if (word_time_offset == 1) {
config->set_enable_word_time_offsets(true);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_word_time_offsets\n");
}
// enable automatic punctuation
if (punctuation == 1) {
config->set_enable_automatic_punctuation(true);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_automatic_punctuation\n");
}
else {
config->set_enable_automatic_punctuation(false);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "disable_automatic_punctuation\n");
}
// speech model
if (model != NULL) {
config->set_model(model);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "speech model %s\n", model);
}
// use enhanced model
if (enhanced == 1) {
config->set_use_enhanced(true);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "use_enhanced\n");
}
// hints
if (hints != NULL) {
auto* adaptation = config->mutable_adaptation();
auto* phrase_set = adaptation->add_phrase_sets();
auto *context = config->add_speech_contexts();
float boost = -1;
// get boost setting for the phrase set in its entirety
if ((var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_HINTS_BOOST"))) {
boost = (float) atof(var);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "boost value: %f\n", boost);
phrase_set->set_boost(boost);
}
// hints are either a simple comma-separated list of phrases, or a json array of objects
// containing a phrase and a boost value
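// e.g. GOOGLE_SPEECH_HINTS='[{"phrase": "albany", "boost": 10}, {"phrase": "saratoga"}]'
// or GOOGLE_SPEECH_HINTS='albany,saratoga,utica' (the phrases here are illustrative)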
auto *jHint = cJSON_Parse((char *) hints);
if (jHint) {
int i = 0;
cJSON *jPhrase = NULL;
cJSON_ArrayForEach(jPhrase, jHint) {
auto* phrase = phrase_set->add_phrases();
cJSON *jItem = cJSON_GetObjectItem(jPhrase, "phrase");
if (jItem) {
phrase->set_value(cJSON_GetStringValue(jItem));
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "phrase: %s\n", phrase->value().c_str());
if (cJSON_GetObjectItem(jPhrase, "boost")) {
phrase->set_boost((float) cJSON_GetObjectItem(jPhrase, "boost")->valuedouble);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "boost value: %f\n", phrase->boost());
}
i++;
}
}
cJSON_Delete(jHint);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "added %d hints\n", i);
}
else {
char *phrases[500] = { 0 };
int argc = switch_separate_string((char *) hints, ',', phrases, 500);
for (int i = 0; i < argc; i++) {
auto* phrase = phrase_set->add_phrases();
phrase->set_value(phrases[i]);
}
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "added %d hints\n", argc);
}
}
// alternative language
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES")) {
char *alt_langs[3] = { 0 };
int argc = switch_separate_string((char *) var, ',', alt_langs, 3);
for (int i = 0; i < argc; i++) {
config->add_alternative_language_codes(alt_langs[i]);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "added alternative lang %s\n", alt_langs[i]);
}
}
// speaker diarization
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_SPEAKER_DIARIZATION")) {
auto* diarization_config = config->mutable_diarization_config();
diarization_config->set_enable_speaker_diarization(true);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enabling speaker diarization\n", var);
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT")) {
int count = std::max(atoi(var), 1);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "setting min speaker count to %d\n", count);
diarization_config->set_min_speaker_count(count);
}
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT")) {
int count = std::max(atoi(var), 2);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "setting max speaker count to %d\n", count);
diarization_config->set_max_speaker_count(count);
}
}
// recognition metadata
auto* metadata = config->mutable_metadata();
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_INTERACTION_TYPE")) {
if (case_insensitive_match("discussion", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_DISCUSSION);
if (case_insensitive_match("presentation", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_PRESENTATION);
if (case_insensitive_match("phone_call", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_PHONE_CALL);
if (case_insensitive_match("voicemail", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_VOICEMAIL);
if (case_insensitive_match("professionally_produced", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_PROFESSIONALLY_PRODUCED);
if (case_insensitive_match("voice_search", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_VOICE_SEARCH);
if (case_insensitive_match("voice_command", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_VOICE_COMMAND);
if (case_insensitive_match("dictation", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_DICTATION);
}
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE")) {
metadata->set_industry_naics_code_of_audio(atoi(var));
}
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_MICROPHONE_DISTANCE")) {
if (case_insensitive_match("nearfield", var)) metadata->set_microphone_distance(RecognitionMetadata_MicrophoneDistance_NEARFIELD);
if (case_insensitive_match("midfield", var)) metadata->set_microphone_distance(RecognitionMetadata_MicrophoneDistance_MIDFIELD);
if (case_insensitive_match("farfield", var)) metadata->set_microphone_distance(RecognitionMetadata_MicrophoneDistance_FARFIELD);
}
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_ORIGINAL_MEDIA_TYPE")) {
if (case_insensitive_match("audio", var)) metadata->set_original_media_type(RecognitionMetadata_OriginalMediaType_AUDIO);
if (case_insensitive_match("video", var)) metadata->set_original_media_type(RecognitionMetadata_OriginalMediaType_VIDEO);
}
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_RECORDING_DEVICE_TYPE")) {
if (case_insensitive_match("smartphone", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_SMARTPHONE);
if (case_insensitive_match("pc", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_PC);
if (case_insensitive_match("phone_line", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_PHONE_LINE);
if (case_insensitive_match("vehicle", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_VEHICLE);
if (case_insensitive_match("other_outdoor_device", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_OTHER_OUTDOOR_DEVICE);
if (case_insensitive_match("other_indoor_device", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_OTHER_INDOOR_DEVICE);
}
}
~GStreamer() {
//switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_INFO, "GStreamer::~GStreamer - deleting channel and stub: %p\n", (void*)this);
}
void connect() {
assert(!m_connected);
// Begin a stream.
m_streamer = m_stub->StreamingRecognize(&m_context);
m_connected = true;
// read thread is waiting on this
m_promise.set_value();
// Write the first request, containing the config only.
m_streamer->Write(m_request);
// send any buffered audio
int nFrames = m_audioBuffer.getNumItems();
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p got stream ready, %d buffered frames\n", this, nFrames);
if (nFrames) {
char *p;
do {
p = m_audioBuffer.getNextChunk();
if (p) {
write(p, CHUNKSIZE);
}
} while (p);
}
}
bool write(void* data, uint32_t datalen) {
if (!m_connected) {
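// not connected yet: buffer the audio so it can be replayed once the stream opens;
// only exact multiples of CHUNKSIZE are kept, so the buffered data replays in clean chunks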
if (datalen % CHUNKSIZE == 0) {
m_audioBuffer.add(data, datalen);
}
return true;
}
m_request.set_audio_content(data, datalen);
bool ok = m_streamer->Write(m_request);
return ok;
}
uint32_t nextMessageSize(void) {
uint32_t size = 0;
m_streamer->NextMessageSize(&size);
return size;
}
bool read(StreamingRecognizeResponse* response) {
return m_streamer->Read(response);
}
grpc::Status finish() {
return m_streamer->Finish();
}
void writesDone() {
// grpc crashes if we call this twice on a stream
if (!m_connected) {
cancelConnect();
}
else if (!m_writesDone) {
m_streamer->WritesDone();
m_writesDone = true;
}
}
bool waitForConnect() {
std::shared_future<void> sf(m_promise.get_future());
sf.wait();
return m_connected;
}
void cancelConnect() {
assert(!m_connected);
m_promise.set_value();
}
bool isConnected() {
return m_connected;
}
private:
switch_core_session_t* m_session;
grpc::ClientContext m_context;
std::shared_ptr<grpc::Channel> m_channel;
std::unique_ptr<Speech::Stub> m_stub;
std::unique_ptr< grpc::ClientReaderWriterInterface<StreamingRecognizeRequest, StreamingRecognizeResponse> > m_streamer;
StreamingRecognizeRequest m_request;
bool m_writesDone;
bool m_connected;
std::promise<void> m_promise;
SimpleBuffer m_audioBuffer;
};
static void *SWITCH_THREAD_FUNC grpc_read_thread(switch_thread_t *thread, void *obj) {
static int count;
struct cap_cb *cb = (struct cap_cb *) obj;
GStreamer* streamer = (GStreamer *) cb->streamer;
bool connected = streamer->waitForConnect();
if (!connected) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "google transcribe grpc read thread exiting since we didnt connect\n") ;
return nullptr;
}
// Read responses.
StreamingRecognizeResponse response;
while (streamer->read(&response)) { // Returns false when no more to read.
switch_core_session_t* session = switch_core_session_locate(cb->sessionId);
if (!session) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "grpc_read_thread: session %s is gone!\n", cb->sessionId) ;
return nullptr;
}
count++;
auto speech_event_type = response.speech_event_type();
if (response.has_error()) {
Status status = response.error();
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "grpc_read_thread: error %s (%d)\n", status.message().c_str(), status.code()) ;
cJSON* json = cJSON_CreateObject();
cJSON_AddStringToObject(json, "type", "error");
cJSON_AddStringToObject(json, "error", status.message().c_str());
char* jsonString = cJSON_PrintUnformatted(json);
cb->responseHandler(session, jsonString, cb->bugname);
free(jsonString);
cJSON_Delete(json);
}
if (cb->play_file == 1){
cb->responseHandler(session, "play_interrupt", cb->bugname);
}
for (int r = 0; r < response.results_size(); ++r) {
auto result = response.results(r);
cJSON * jResult = cJSON_CreateObject();
cJSON * jAlternatives = cJSON_CreateArray();
cJSON * jStability = cJSON_CreateNumber(result.stability());
cJSON * jIsFinal = cJSON_CreateBool(result.is_final());
cJSON * jLanguageCode = cJSON_CreateString(result.language_code().c_str());
cJSON * jChannelTag = cJSON_CreateNumber(result.channel_tag());
auto duration = result.result_end_time();
int32_t seconds = duration.seconds();
int64_t nanos = duration.nanos();
int span = (int) trunc(seconds * 1000. + ((float) nanos / 1000000.));
cJSON * jResultEndTime = cJSON_CreateNumber(span);
cJSON_AddItemToObject(jResult, "stability", jStability);
cJSON_AddItemToObject(jResult, "is_final", jIsFinal);
cJSON_AddItemToObject(jResult, "alternatives", jAlternatives);
cJSON_AddItemToObject(jResult, "language_code", jLanguageCode);
cJSON_AddItemToObject(jResult, "channel_tag", jChannelTag);
cJSON_AddItemToObject(jResult, "result_end_time", jResultEndTime);
for (int a = 0; a < result.alternatives_size(); ++a) {
auto alternative = result.alternatives(a);
cJSON* jAlt = cJSON_CreateObject();
cJSON* jConfidence = cJSON_CreateNumber(alternative.confidence());
cJSON* jTranscript = cJSON_CreateString(alternative.transcript().c_str());
cJSON_AddItemToObject(jAlt, "confidence", jConfidence);
cJSON_AddItemToObject(jAlt, "transcript", jTranscript);
if (alternative.words_size() > 0) {
cJSON * jWords = cJSON_CreateArray();
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: %d words\n", alternative.words_size()) ;
for (int b = 0; b < alternative.words_size(); b++) {
auto words = alternative.words(b);
cJSON* jWord = cJSON_CreateObject();
cJSON_AddItemToObject(jWord, "word", cJSON_CreateString(words.word().c_str()));
if (words.has_start_time()) {
cJSON_AddItemToObject(jWord, "start_time", cJSON_CreateNumber(words.start_time().seconds()));
}
if (words.has_end_time()) {
cJSON_AddItemToObject(jWord, "end_time", cJSON_CreateNumber(words.end_time().seconds()));
}
int speaker_tag = words.speaker_tag();
if (speaker_tag > 0) {
cJSON_AddItemToObject(jWord, "speaker_tag", cJSON_CreateNumber(speaker_tag));
}
float confidence = words.confidence();
if (confidence > 0.0) {
cJSON_AddItemToObject(jWord, "confidence", cJSON_CreateNumber(confidence));
}
cJSON_AddItemToArray(jWords, jWord);
}
cJSON_AddItemToObject(jAlt, "words", jWords);
}
cJSON_AddItemToArray(jAlternatives, jAlt);
}
char* json = cJSON_PrintUnformatted(jResult);
cb->responseHandler(session, (const char *) json, cb->bugname);
free(json);
cJSON_Delete(jResult);
}
if (speech_event_type == StreamingRecognizeResponse_SpeechEventType_END_OF_SINGLE_UTTERANCE) {
// we only get this when we have requested it, and recognition stops after we get this
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: got end_of_utterance\n") ;
cb->got_end_of_utterance = 1;
cb->responseHandler(session, "end_of_utterance", cb->bugname);
if (cb->wants_single_utterance) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: sending writesDone because we want only a single utterance\n") ;
streamer->writesDone();
}
}
switch_core_session_rwunlock(session);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: got %d responses\n", response.results_size());
}
{
switch_core_session_t* session = switch_core_session_locate(cb->sessionId);
if (session) {
grpc::Status status = streamer->finish();
if (11 == status.error_code()) { // grpc::StatusCode::OUT_OF_RANGE
if (std::string::npos != status.error_message().find("Exceeded maximum allowed stream duration")) {
cb->responseHandler(session, "max_duration_exceeded", cb->bugname);
}
else {
cb->responseHandler(session, "no_audio", cb->bugname);
}
}
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: finish() status %s (%d)\n", status.error_message().c_str(), status.error_code()) ;
switch_core_session_rwunlock(session);
}
}
return nullptr;
}
extern "C" {
switch_status_t google_speech_init() {
const char* gcsServiceKeyFile = std::getenv("GOOGLE_APPLICATION_CREDENTIALS");
if (gcsServiceKeyFile) {
try {
auto creds = grpc::GoogleDefaultCredentials();
} catch (const std::exception& e) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT,
"Error initializing google api with provided credentials in %s: %s\n", gcsServiceKeyFile, e.what());
return SWITCH_STATUS_FALSE;
}
}
return SWITCH_STATUS_SUCCESS;
}
switch_status_t google_speech_cleanup() {
return SWITCH_STATUS_SUCCESS;
}
switch_status_t google_speech_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
uint32_t to_rate, uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char *bugname,
int single_utterance, int separate_recognition, int max_alternatives, int profanity_filter, int word_time_offset,
int punctuation, const char* model, int enhanced, const char* hints, char* play_file, void **ppUserData) {
switch_channel_t *channel = switch_core_session_get_channel(session);
auto read_codec = switch_core_session_get_read_codec(session);
uint32_t sampleRate = read_codec->implementation->actual_samples_per_second;
struct cap_cb *cb;
int err;
cb =(struct cap_cb *) switch_core_session_alloc(session, sizeof(*cb));
strncpy(cb->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
strncpy(cb->bugname, bugname, MAX_BUG_LEN);
cb->got_end_of_utterance = 0;
cb->wants_single_utterance = single_utterance;
if (play_file != NULL){
cb->play_file = 1;
}
switch_mutex_init(&cb->mutex, SWITCH_MUTEX_NESTED, switch_core_session_get_pool(session));
if (sampleRate != to_rate) {
cb->resampler = speex_resampler_init(channels, sampleRate, to_rate, SWITCH_RESAMPLE_QUALITY, &err);
if (0 != err) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing resampler: %s.\n",
switch_channel_get_name(channel), speex_resampler_strerror(err));
return SWITCH_STATUS_FALSE;
}
} else {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: no resampling needed for this call\n", switch_channel_get_name(channel));
}
cb->responseHandler = responseHandler;
// allocate vad if we are delaying connecting to the recognizer until we detect speech
if (switch_channel_var_true(channel, "START_RECOGNIZING_ON_VAD")) {
cb->vad = switch_vad_init(sampleRate, channels);
if (cb->vad) {
const char* var;
int mode = 2;
int silence_ms = 150;
int voice_ms = 250;
int debug = 0;
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_MODE")) {
mode = atoi(var);
}
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_SILENCE_MS")) {
silence_ms = atoi(var);
}
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_VOICE_MS")) {
voice_ms = atoi(var);
}
if (var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_DEBUG")) {
debug = atoi(var);
}
switch_vad_set_mode(cb->vad, mode);
switch_vad_set_param(cb->vad, "silence_ms", silence_ms);
switch_vad_set_param(cb->vad, "voice_ms", voice_ms);
switch_vad_set_param(cb->vad, "debug", debug);
}
}
GStreamer *streamer = NULL;
try {
streamer = new GStreamer(session, channels, lang, interim, to_rate, sampleRate, single_utterance, separate_recognition, max_alternatives,
profanity_filter, word_time_offset, punctuation, model, enhanced, hints);
cb->streamer = streamer;
} catch (std::exception& e) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing gstreamer: %s.\n",
switch_channel_get_name(channel), e.what());
return SWITCH_STATUS_FALSE;
}
if (!cb->vad) streamer->connect();
// create the read thread
switch_threadattr_t *thd_attr = NULL;
switch_memory_pool_t *pool = switch_core_session_get_pool(session);
switch_threadattr_create(&thd_attr, pool);
switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE);
switch_thread_create(&cb->thread, thd_attr, grpc_read_thread, cb, pool);
*ppUserData = cb;
return SWITCH_STATUS_SUCCESS;
}
switch_status_t google_speech_session_cleanup(switch_core_session_t *session, int channelIsClosing, switch_media_bug_t *bug) {
switch_channel_t *channel = switch_core_session_get_channel(session);
if (bug) {
struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);
switch_mutex_lock(cb->mutex);
if (!switch_channel_get_private(channel, cb->bugname)) {
// race condition
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached (race).\n", switch_channel_get_name(channel));
switch_mutex_unlock(cb->mutex);
return SWITCH_STATUS_FALSE;
}
switch_channel_set_private(channel, cb->bugname, NULL);
// stop playback if available
if (cb->play_file == 1){
if (switch_channel_test_flag(channel, CF_BROADCAST)) {
switch_channel_stop_broadcast(channel);
} else {
switch_channel_set_flag_value(channel, CF_BREAK, 1);
}
}
// close connection and get final responses
GStreamer* streamer = (GStreamer *) cb->streamer;
if (streamer) {
streamer->writesDone();
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: GStreamer (%p) waiting for read thread to complete\n", (void*)streamer);
switch_status_t st;
switch_thread_join(&st, cb->thread);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: GStreamer (%p) read thread completed\n", (void*)streamer);
delete streamer;
cb->streamer = NULL;
}
if (cb->resampler) {
speex_resampler_destroy(cb->resampler);
}
if (cb->vad) {
switch_vad_destroy(&cb->vad);
cb->vad = nullptr;
}
if (!channelIsClosing) {
switch_core_media_bug_remove(session, &bug);
}
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: Closed stream\n");
switch_mutex_unlock(cb->mutex);
return SWITCH_STATUS_SUCCESS;
}
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
return SWITCH_STATUS_FALSE;
}
switch_bool_t google_speech_frame(switch_media_bug_t *bug, void* user_data) {
switch_core_session_t *session = switch_core_media_bug_get_session(bug);
struct cap_cb *cb = (struct cap_cb *) user_data;
if (cb->streamer && (!cb->wants_single_utterance || !cb->got_end_of_utterance)) {
GStreamer* streamer = (GStreamer *) cb->streamer;
uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
switch_frame_t frame = {};
frame.data = data;
frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;
if (switch_mutex_trylock(cb->mutex) == SWITCH_STATUS_SUCCESS) {
while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS && !switch_test_flag((&frame), SFF_CNG)) {
if (frame.datalen) {
if (cb->vad && !streamer->isConnected()) {
switch_vad_state_t state = switch_vad_process(cb->vad, (int16_t*) frame.data, frame.samples);
if (state == SWITCH_VAD_STATE_START_TALKING) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "detected speech, connect to google speech now\n");
streamer->connect();
cb->responseHandler(session, "vad_detected", cb->bugname);
}
}
if (cb->resampler) {
spx_int16_t out[SWITCH_RECOMMENDED_BUFFER_SIZE];
spx_uint32_t out_len = SWITCH_RECOMMENDED_BUFFER_SIZE;
spx_uint32_t in_len = frame.samples;
size_t written;
speex_resampler_process_interleaved_int(cb->resampler,
(const spx_int16_t *) frame.data,
(spx_uint32_t *) &in_len,
&out[0],
&out_len);
streamer->write( &out[0], sizeof(spx_int16_t) * out_len);
}
else {
streamer->write( frame.data, sizeof(spx_int16_t) * frame.samples);
}
}
}
switch_mutex_unlock(cb->mutex);
}
}
return SWITCH_TRUE;
}
}

View File mod_google_transcribe/google_glue.h

@@ -0,0 +1,13 @@
#ifndef __GOOGLE_GLUE_H__
#define __GOOGLE_GLUE_H__
switch_status_t google_speech_init();
switch_status_t google_speech_cleanup();
switch_status_t google_speech_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
uint32_t to_rate, uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char *bugname, int single_utterance,
int separate_recognition, int max_alternatives, int profanity_filter, int word_time_offset, int punctuation, const char* model, int enhanced,
const char* hints, char* play_file, void **ppUserData);
switch_status_t google_speech_session_cleanup(switch_core_session_t *session, int channelIsClosing, switch_media_bug_t *bug);
switch_bool_t google_speech_frame(switch_media_bug_t *bug, void* user_data);
#endif

View File mod_google_transcribe/mod_google_transcribe.c

@@ -0,0 +1,484 @@
/*
*
* mod_google_transcribe.c -- Freeswitch module for real-time transcription using google's gRPC interface
*
*/
#include "mod_google_transcribe.h"
#include "google_glue.h"
#include <stdlib.h>
#include <switch.h>
static const uint32_t DEFAULT_SAMPLE_RATE = 8000;
/* Prototypes */
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_transcribe_shutdown);
SWITCH_MODULE_RUNTIME_FUNCTION(mod_transcribe_runtime);
SWITCH_MODULE_LOAD_FUNCTION(mod_transcribe_load);
SWITCH_MODULE_DEFINITION(mod_google_transcribe, mod_transcribe_load, mod_transcribe_shutdown, NULL);
static switch_status_t do_stop(switch_core_session_t *session, char* bugname);
static void responseHandler(switch_core_session_t* session, const char * json, const char* bugname) {
switch_event_t *event;
switch_channel_t *channel = switch_core_session_get_channel(session);
if (0 == strcmp("vad_detected", json)) {
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_VAD_DETECTED);
switch_channel_event_set_data(channel, event);
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
}
else if (0 == strcmp("end_of_utterance", json)) {
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_END_OF_UTTERANCE);
switch_channel_event_set_data(channel, event);
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
}
else if (0 == strcmp("end_of_transcript", json)) {
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_END_OF_TRANSCRIPT);
switch_channel_event_set_data(channel, event);
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
}
else if (0 == strcmp("start_of_transcript", json)) {
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_START_OF_TRANSCRIPT);
switch_channel_event_set_data(channel, event);
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
}
else if (0 == strcmp("max_duration_exceeded", json)) {
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED);
switch_channel_event_set_data(channel, event);
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
}
else if (0 == strcmp("no_audio", json)) {
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_NO_AUDIO_DETECTED);
switch_channel_event_set_data(channel, event);
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
}
else if (0 == strcmp("play_interrupt", json)){
switch_event_t *qevent;
switch_status_t status;
if (switch_event_create(&qevent, SWITCH_EVENT_DETECTED_SPEECH) == SWITCH_STATUS_SUCCESS) {
if ((status = switch_core_session_queue_event(session, &qevent)) != SWITCH_STATUS_SUCCESS){
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "unable to queue play inturrupt event %d \n", status);
}
}else{
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "unable to create play inturrupt event \n");
}
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_PLAY_INTERRUPT);
switch_channel_event_set_data(channel, event);
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
}
else {
int error = 0;
cJSON* jMessage = cJSON_Parse(json);
if (jMessage) {
const char* type = cJSON_GetStringValue(cJSON_GetObjectItem(jMessage, "type"));
if (type && 0 == strcmp(type, "error")) {
error = 1;
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_ERROR);
}
cJSON_Delete(jMessage);
}
if (!error) {
switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_RESULTS);
}
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s json payload: %s.\n", bugname ? bugname : "google_transcribe", json);
switch_channel_event_set_data(channel, event);
switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
switch_event_add_body(event, "%s", json);
}
if (bugname) switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "media-bugname", bugname);
switch_event_fire(&event);
}
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
{
switch_core_session_t *session = switch_core_media_bug_get_session(bug);
struct cap_cb* cb = (struct cap_cb*) switch_core_media_bug_get_user_data(bug);
switch (type) {
case SWITCH_ABC_TYPE_INIT:
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Got SWITCH_ABC_TYPE_INIT.\n");
responseHandler(session, "start_of_transcript", cb->bugname);
break;
case SWITCH_ABC_TYPE_CLOSE:
{
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Got SWITCH_ABC_TYPE_CLOSE, calling google_speech_session_cleanup.\n");
responseHandler(session, "end_of_transcript", cb->bugname);
google_speech_session_cleanup(session, 1, bug);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Finished SWITCH_ABC_TYPE_CLOSE.\n");
}
break;
case SWITCH_ABC_TYPE_READ:
return google_speech_frame(bug, user_data);
break;
case SWITCH_ABC_TYPE_WRITE:
default:
break;
}
return SWITCH_TRUE;
}
static switch_status_t transcribe_input_callback(switch_core_session_t *session, void *input, switch_input_type_t input_type, void *data, unsigned int len){
if (input_type == SWITCH_INPUT_TYPE_EVENT) {
switch_event_t *event;
event = (switch_event_t *)input;
if (event->event_id == SWITCH_EVENT_DETECTED_SPEECH) {
return SWITCH_STATUS_BREAK;
}
}
return SWITCH_STATUS_SUCCESS;
}
static switch_status_t do_stop(switch_core_session_t *session, char *bugname)
{
switch_status_t status = SWITCH_STATUS_SUCCESS;
switch_channel_t *channel = switch_core_session_get_channel(session);
switch_media_bug_t *bug = switch_channel_get_private(channel, bugname);
if (bug) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Received user command command, calling google_speech_session_cleanup (possibly to stop prev transcribe)\n");
status = google_speech_session_cleanup(session, 0, bug);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "stopped transcription.\n");
}
return status;
}
static switch_status_t start_capture2(switch_core_session_t *session, switch_media_bug_flag_t flags,
uint32_t sample_rate, char* lang, int interim, int single_utterance, int separate_recognition, int max_alternatives,
int profanity_filter, int word_time_offset, int punctuation, const char* model, int enhanced, const char* hints, char* play_file)
{
switch_channel_t *channel = switch_core_session_get_channel(session);
switch_media_bug_t *bug;
switch_status_t status;
switch_codec_implementation_t read_impl = { 0 };
void *pUserData;
uint32_t samples_per_second;
switch_input_args_t args = { 0 };
if (switch_channel_get_private(channel, MY_BUG_NAME)) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing bug from previous transcribe\n");
do_stop(session, MY_BUG_NAME);
}
switch_core_session_get_read_impl(session, &read_impl);
if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
return SWITCH_STATUS_FALSE;
}
samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;
if (SWITCH_STATUS_FALSE == google_speech_session_init(session, responseHandler, sample_rate, samples_per_second, flags & SMBF_STEREO ? 2 : 1, lang, interim, MY_BUG_NAME, single_utterance,
separate_recognition, max_alternatives, profanity_filter, word_time_offset, punctuation, model, enhanced, hints, play_file, &pUserData)) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing google speech session.\n");
return SWITCH_STATUS_FALSE;
}
if ((status = switch_core_media_bug_add(session, "google_transcribe", NULL, capture_callback, pUserData, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
return status;
}
switch_channel_set_private(channel, MY_BUG_NAME, bug);
/* play the prompt, looking for detection result */
if (play_file != NULL){
args.input_callback = transcribe_input_callback;
switch_ivr_play_file(session, NULL, play_file, &args);
}
return SWITCH_STATUS_SUCCESS;
}
static switch_status_t start_capture(switch_core_session_t *session, switch_media_bug_flag_t flags,
char* lang, int interim, char* bugname)
{
switch_channel_t *channel = switch_core_session_get_channel(session);
switch_media_bug_t *bug;
switch_status_t status;
switch_codec_implementation_t read_impl = { 0 };
void *pUserData;
uint32_t samples_per_second;
int single_utterance = 0, separate_recognition = 0, max_alternatives = 0, profanity_filter = 0, word_time_offset = 0, punctuation = 0, enhanced = 0;
const char* hints = NULL;
const char* model = NULL;
const char* var;
if (switch_channel_get_private(channel, bugname)) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing bug from previous transcribe\n");
do_stop(session, bugname);
}
if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_SINGLE_UTTERANCE"))) {
single_utterance = 1;
}
// transcribe each separately?
if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL"))) {
separate_recognition = 1;
}
// max alternatives
if ((var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_MAX_ALTERNATIVES"))) {
max_alternatives = atoi(var);
}
// profanity filter
if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_PROFANITY_FILTER"))) {
profanity_filter = 1;
}
// enable word offsets
if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS"))) {
word_time_offset = 1;
}
// enable automatic punctuation
if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION"))) {
punctuation = 1;
}
// speech model
if ((var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_MODEL"))) {
model = var;
}
// use enhanced model
if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_USE_ENHANCED"))) {
enhanced = 1;
}
// hints
if ((var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_HINTS"))) {
hints = var;
}
switch_core_session_get_read_impl(session, &read_impl);
if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
return SWITCH_STATUS_FALSE;
}
samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;
if (SWITCH_STATUS_FALSE == google_speech_session_init(session, responseHandler, DEFAULT_SAMPLE_RATE, samples_per_second, flags & SMBF_STEREO ? 2 : 1, lang, interim, bugname, single_utterance,
separate_recognition, max_alternatives, profanity_filter, word_time_offset, punctuation, model, enhanced, hints, NULL, &pUserData)) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing google speech session.\n");
return SWITCH_STATUS_FALSE;
}
if ((status = switch_core_media_bug_add(session, bugname, NULL, capture_callback, pUserData, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
return status;
}
switch_channel_set_private(channel, bugname, bug);
return SWITCH_STATUS_SUCCESS;
}
// #define TRANSCRIBE_API_SYNTAX "<uuid> [start|stop] [lang-code] [interim] [single-utterance](bool) [separate-recognition](bool) [max-alternatives](int) [profanity-filter](bool) [word-time](bool) [punctuation](bool) [model](string) [enhanced](true) [hints](string without space) [play-file]"
#define TRANSCRIBE2_API_SYNTAX "<uuid> [start|stop] [lang-code] [interim] [single-utterance] [separate-recognition] [max-alternatives] [profanity-filter] [word-time] [punctuation] [sample-rate] [model] [enhanced] [hints] [play-file]"
SWITCH_STANDARD_API(transcribe2_function)
{
char *mycmd = NULL, *argv[20] = { 0 };
int argc = 0, enhanced = 0;
uint32_t sample_rate = DEFAULT_SAMPLE_RATE;
const char* hints = NULL;
const char* model = NULL;
char* play_file = NULL;
switch_status_t status = SWITCH_STATUS_FALSE;
switch_media_bug_flag_t flags = SMBF_READ_STREAM /* | SMBF_WRITE_STREAM | SMBF_READ_PING */;
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
}
if (zstr(cmd) ||
(argc < 2) ||
(!strcasecmp(argv[1], "start") && argc < 10) ||
zstr(argv[0])) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s.\n", cmd, argv[0]);
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE2_API_SYNTAX);
goto done;
} else {
switch_core_session_t *lsession = NULL;
if ((lsession = switch_core_session_locate(argv[0]))) {
if (!strcasecmp(argv[1], "stop")) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "stop transcribing\n");
status = do_stop(lsession, MY_BUG_NAME);
} else if (!strcasecmp(argv[1], "start")) {
char* lang = argv[2];
int interim = argc > 3 && !strcmp(argv[3], "true");
int single_utterance = !strcmp(argv[4], "true"); // single-utterance
int separate_recognition = !strcmp(argv[5], "true"); // separate-recognition
int max_alternatives = atoi(argv[6]); // max-alternatives
int profanity_filter = !strcmp(argv[7], "true"); // profanity-filter
int word_time_offset = !strcmp(argv[8], "true"); // word-time
int punctuation = !strcmp(argv[9], "true"); // punctuation
if (argc > 10) {
sample_rate = atol(argv[10]);
}
if (argc > 12){
model = argv[11]; // model
enhanced = !strcmp(argv[12], "true"); // enhanced
}
if (argc > 13){
hints = argv[13]; // hints
}
if (argc > 14){
play_file = argv[14];
}
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "start transcribing %s %s\n", lang, interim ? "interim": "complete");
status = start_capture2(lsession, flags, sample_rate, lang, interim, single_utterance, separate_recognition, max_alternatives,
profanity_filter, word_time_offset, punctuation, model, enhanced, hints, play_file);
}
switch_core_session_rwunlock(lsession);
}
}
if (status == SWITCH_STATUS_SUCCESS) {
stream->write_function(stream, "+OK Success\n");
} else {
stream->write_function(stream, "-ERR Operation Failed\n");
}
done:
switch_safe_free(mycmd);
return SWITCH_STATUS_SUCCESS;
}
#define TRANSCRIBE_API_SYNTAX "<uuid> [start|stop] [lang-code] [interim|full] [stereo|mono] [bug-name]"
SWITCH_STANDARD_API(transcribe_function)
{
char *mycmd = NULL, *argv[6] = { 0 };
int argc = 0;
switch_status_t status = SWITCH_STATUS_FALSE;
switch_media_bug_flag_t flags = SMBF_READ_STREAM /* | SMBF_WRITE_STREAM | SMBF_READ_PING */;
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
}
if (zstr(cmd) || argc < 2 || zstr(argv[0]) ||
(!strcasecmp(argv[1], "start") && argc < 3)) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_SYNTAX);
goto done;
} else {
switch_core_session_t *lsession = NULL;
if ((lsession = switch_core_session_locate(argv[0]))) {
if (!strcasecmp(argv[1], "stop")) {
char *bugname = argc > 2 ? argv[2] : MY_BUG_NAME;
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "stop transcribing\n");
status = do_stop(lsession, bugname);
} else if (!strcasecmp(argv[1], "start")) {
char* lang = argv[2];
int interim = argc > 3 && !strcmp(argv[3], "interim");
char *bugname = argc > 5 ? argv[5] : MY_BUG_NAME;
if (argc > 4 && !strcmp(argv[4], "stereo")) {
flags |= SMBF_WRITE_STREAM ;
flags |= SMBF_STEREO;
}
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s start transcribing %s %s\n", bugname, lang, interim ? "interim": "complete");
status = start_capture(lsession, flags, lang, interim, bugname);
}
switch_core_session_rwunlock(lsession);
}
}
if (status == SWITCH_STATUS_SUCCESS) {
stream->write_function(stream, "+OK Success\n");
} else {
stream->write_function(stream, "-ERR Operation Failed\n");
}
done:
switch_safe_free(mycmd);
return SWITCH_STATUS_SUCCESS;
}
SWITCH_MODULE_LOAD_FUNCTION(mod_transcribe_load)
{
switch_api_interface_t *api_interface;
/* create/register custom event message type */
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_RESULTS) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_RESULTS);
return SWITCH_STATUS_TERM;
}
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_END_OF_UTTERANCE) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_END_OF_UTTERANCE);
return SWITCH_STATUS_TERM;
}
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_START_OF_TRANSCRIPT) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_START_OF_TRANSCRIPT);
return SWITCH_STATUS_TERM;
}
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_END_OF_TRANSCRIPT) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_END_OF_TRANSCRIPT);
return SWITCH_STATUS_TERM;
}
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_NO_AUDIO_DETECTED) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_NO_AUDIO_DETECTED);
return SWITCH_STATUS_TERM;
}
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED);
return SWITCH_STATUS_TERM;
}
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_PLAY_INTERRUPT) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_PLAY_INTERRUPT);
return SWITCH_STATUS_TERM;
}
/* connect my internal structure to the blank pointer passed to me */
*module_interface = switch_loadable_module_create_module_interface(pool, modname);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Google Speech Transcription API loading..\n");
if (SWITCH_STATUS_FALSE == google_speech_init()) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Failed initializing google speech interface\n");
}
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Google Speech Transcription API successfully loaded\n");
SWITCH_ADD_API(api_interface, "uuid_google_transcribe", "Google Speech Transcription API", transcribe_function, TRANSCRIBE_API_SYNTAX);
SWITCH_ADD_API(api_interface, "uuid_google_transcribe2", "Google Speech Transcription API", transcribe2_function, TRANSCRIBE2_API_SYNTAX);
switch_console_set_complete("add uuid_google_transcribe start lang-code");
switch_console_set_complete("add uuid_google_transcribe stop ");
/* indicate that the module should continue to be loaded */
return SWITCH_STATUS_SUCCESS;
}
/*
Called when the system shuts down
Macro expands to: switch_status_t mod_google_transcribe_shutdown() */
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_transcribe_shutdown)
{
google_speech_cleanup();
switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS);
switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_UTTERANCE);
switch_event_free_subclass(TRANSCRIBE_EVENT_START_OF_TRANSCRIPT);
switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_TRANSCRIPT);
switch_event_free_subclass(TRANSCRIBE_EVENT_NO_AUDIO_DETECTED);
switch_event_free_subclass(TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED);
switch_event_free_subclass(TRANSCRIBE_EVENT_PLAY_INTERRUPT);
return SWITCH_STATUS_SUCCESS;
}

View File mod_google_transcribe/mod_google_transcribe.h

@@ -0,0 +1,58 @@
#ifndef __MOD_GOOGLE_TRANSCRIBE_H__
#define __MOD_GOOGLE_TRANSCRIBE_H__
#include <switch.h>
#include <speex/speex_resampler.h>
#include <unistd.h>
#define MAX_SESSION_ID (256)
#define MAX_BUG_LEN (64)
#define MY_BUG_NAME "google_transcribe"
#define TRANSCRIBE_EVENT_RESULTS "google_transcribe::transcription"
#define TRANSCRIBE_EVENT_END_OF_UTTERANCE "google_transcribe::end_of_utterance"
#define TRANSCRIBE_EVENT_START_OF_TRANSCRIPT "google_transcribe::start_of_transcript"
#define TRANSCRIBE_EVENT_END_OF_TRANSCRIPT "google_transcribe::end_of_transcript"
#define TRANSCRIBE_EVENT_NO_AUDIO_DETECTED "google_transcribe::no_audio_detected"
#define TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED "google_transcribe::max_duration_exceeded"
#define TRANSCRIBE_EVENT_PLAY_INTERRUPT "google_transcribe::play_interrupt"
#define TRANSCRIBE_EVENT_VAD_DETECTED "google_transcribe::vad_detected"
#define TRANSCRIBE_EVENT_ERROR "jambonz_transcribe::error"
// simply write a wave file
//#define DEBUG_TRANSCRIBE 0
#ifdef DEBUG_TRANSCRIBE
/* per-channel data */
struct cap_cb {
switch_buffer_t *buffer;
switch_mutex_t *mutex;
char *base;
SpeexResamplerState *resampler;
FILE* fp;
};
#else
/* per-channel data */
typedef void (*responseHandler_t)(switch_core_session_t* session, const char* json, const char* bugname);
struct cap_cb {
switch_mutex_t *mutex;
char bugname[MAX_BUG_LEN+1];
char sessionId[MAX_SESSION_ID+1];
char *base;
SpeexResamplerState *resampler;
void* streamer;
responseHandler_t responseHandler;
switch_thread_t* thread;
int wants_single_utterance;
int got_end_of_utterance;
int play_file;
switch_vad_t * vad;
uint32_t samples_per_second;
};
#endif
#endif

View File mod_google_transcribe/simple_buffer.h

@@ -0,0 +1,51 @@
#include <cstdint>
#include <cstring>

/**
* (very) simple and limited circular buffer,
* supporting only the use case of doing all of the adds
* and then subsequently doing all of the retrieves.
*
*/
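// A minimal usage sketch (illustrative values; send() stands in for any consumer):
//
//   SimpleBuffer buf(320, 15);            // holds up to 15 chunks of 320 bytes
//   buf.add(audio, 640);                  // buffers two complete chunks
//   while (char* p = buf.getNextChunk())  // drain in FIFO order
//     send(p, 320);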
class SimpleBuffer {
public:
SimpleBuffer(uint32_t chunkSize, uint32_t numChunks) : numItems(0),
m_chunkSize(chunkSize), m_numChunks(numChunks), m_pNextRead(nullptr) {
m_pData = new char[chunkSize * numChunks];
m_pNextWrite = m_pData;
}
~SimpleBuffer() {
delete [] m_pData;
}
void add(void *data, uint32_t datalen) {
if (datalen % m_chunkSize != 0) return;
int numChunks = datalen / m_chunkSize;
for (int i = 0; i < numChunks; i++) {
memcpy(m_pNextWrite, data, m_chunkSize);
data = static_cast<char*>(data) + m_chunkSize;
if (numItems < m_numChunks) numItems++;
uint32_t offset = (m_pNextWrite - m_pData) / m_chunkSize;
if (offset >= m_numChunks - 1) m_pNextWrite = m_pData;
else m_pNextWrite += m_chunkSize;
}
}
char* getNextChunk() {
if (numItems > 0) {
if (!m_pNextRead) {
// first retrieve: if the buffer never wrapped, the oldest chunk is at the start
// of the buffer; otherwise it is the chunk that the next write would overwrite
m_pNextRead = (numItems < m_numChunks) ? m_pData : m_pNextWrite;
}
numItems--;
char *p = m_pNextRead;
uint32_t offset = (m_pNextRead - m_pData) / m_chunkSize;
if (offset >= m_numChunks - 1) m_pNextRead = m_pData;
else m_pNextRead += m_chunkSize;
return p;
}
return nullptr;
}
uint32_t getNumItems() { return numItems;}
private:
char *m_pData;
uint32_t numItems;
uint32_t m_chunkSize;
uint32_t m_numChunks;
char* m_pNextWrite;
char* m_pNextRead;
};