mirror of
https://github.com/jambonz/freeswitch-modules.git
synced 2025-12-19 08:27:44 +00:00
eliminate support for multiple lws threads as part of fixing valgrind errors
Signed-off-by: Dave Horton <daveh@beachdognet.com>
This commit is contained in:
BIN
mod_google_transcribe/.DS_Store
vendored
Normal file
BIN
mod_google_transcribe/.DS_Store
vendored
Normal file
Binary file not shown.
8
mod_google_transcribe/LICENSE
Normal file
8
mod_google_transcribe/LICENSE
Normal file
@@ -0,0 +1,8 @@
|
||||
Copyright 2023, Drachtio Communications Services, LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
10
mod_google_transcribe/Makefile.am
Normal file
10
mod_google_transcribe/Makefile.am
Normal file
@@ -0,0 +1,10 @@
|
||||
include $(top_srcdir)/build/modmake.rulesam
|
||||
MODNAME=mod_google_transcribe
|
||||
|
||||
mod_LTLIBRARIES = mod_google_transcribe.la
|
||||
mod_google_transcribe_la_SOURCES = mod_google_transcribe.c google_glue.cpp
|
||||
mod_google_transcribe_la_CFLAGS = $(AM_CFLAGS)
|
||||
mod_google_transcribe_la_CXXFLAGS = -I $(top_srcdir)/libs/googleapis/gens $(AM_CXXFLAGS) -std=c++17
|
||||
|
||||
mod_google_transcribe_la_LIBADD = $(switch_builddir)/libfreeswitch.la
|
||||
mod_google_transcribe_la_LDFLAGS = -avoid-version -module -no-undefined -shared `pkg-config --libs grpc++ grpc`
|
||||
101
mod_google_transcribe/README.md
Normal file
101
mod_google_transcribe/README.md
Normal file
@@ -0,0 +1,101 @@
|
||||
# mod_google_transcribe
|
||||
|
||||
A Freeswitch module that generates real-time transcriptions on a Freeswitch channel by using Google's Speech-to-Text API.
|
||||
|
||||
Optionally, the connection to the google cloud recognizer can be delayed until voice activity has been detected. This can be useful in cases where it is desired to minimize the costs of streaming audio for transcription. This setting is governed by the channel variables starting with `RECOGNIZER_VAD`, as described below.
|
||||
|
||||
## API
|
||||
|
||||
### Commands
|
||||
The freeswitch module exposes two versions of an API command to transcribe speech:
|
||||
#### version 1
|
||||
```bash
|
||||
uuid_google_transcribe <uuid> start <lang-code> [interim]
|
||||
```
|
||||
When using this command, additional speech processing options can be provided through Freeswitch channel variables, described [below](#command-variables).
|
||||
|
||||
#### version 2
|
||||
```bash
|
||||
uuid_google_transcribe2 <uuid> start <lang-code> [interim] (bool) \
|
||||
[single-utterance](bool) [separate-recognition](bool) [max-alternatives](int) \
|
||||
[profanity-filter](bool) [word-time](bool) [punctuation](bool) \
|
||||
[model](string) [enhanced](bool) [hints](words separated by , and no spaces) \
|
||||
[play-file] (play file path)
|
||||
```
|
||||
This command allows speech processing options to be provided on the command line, and has the ability to optionally play an audio file as a prompt.
|
||||
|
||||
Example:
|
||||
```bash
|
||||
bgapi uuid_google_transcribe2 312033b6-4b2a-48d8-be0c-5f161aec2b3e start en-US \
|
||||
true true true 5 true true true command_and_search true \
|
||||
yes,no,hello https://www2.cs.uic.edu/~i101/SoundFiles/CantinaBand60.wav
|
||||
```
|
||||
Attaches media bug to channel and performs streaming recognize request.
|
||||
- `uuid` - unique identifier of Freeswitch channel
|
||||
- `lang-code` - a valid Google [language code](https://cloud.google.com/speech-to-text/docs/languages) to use for speech recognition
|
||||
- `interim` - If the 'interim' keyword is present then both interim and final transcription results will be returned; otherwise only final transcriptions will be returned
|
||||
|
||||
```
|
||||
uuid_google_transcribe <uuid> stop
|
||||
```
|
||||
Stop transcription on the channel.
|
||||
|
||||
### Command Variables
|
||||
Additional google speech options can be set through freeswitch channel variables for `uuid_google_transcribe` (some can alternatively be set in the command line for `uuid_google_transcribe2`).
|
||||
|
||||
| variable | Description |
|
||||
| --- | ----------- |
|
||||
| GOOGLE_SPEECH_SINGLE_UTTERANCE | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.StreamingRecognitionConfig.FIELDS.bool.google.cloud.speech.v1.StreamingRecognitionConfig.single_utterance) |
|
||||
| GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.enable_separate_recognition_per_channel) |
|
||||
| GOOGLE_SPEECH_MAX_ALTERNATIVES | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.int32.google.cloud.speech.v1.RecognitionConfig.max_alternatives) |
|
||||
| GOOGLE_SPEECH_PROFANITY_FILTER | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.profanity_filter) |
|
||||
| GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.enable_word_time_offsets) |
|
||||
| GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.enable_automatic_punctuation) |
|
||||
| GOOGLE_SPEECH_MODEL | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.string.google.cloud.speech.v1.RecognitionConfig.model) |
|
||||
| GOOGLE_SPEECH_USE_ENHANCED | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.RecognitionConfig.FIELDS.bool.google.cloud.speech.v1.RecognitionConfig.use_enhanced) |
|
||||
| GOOGLE_SPEECH_HINTS | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.PhraseSet) |
|
||||
| GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES | a comma-separated list of language codes, [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionConfig.FIELDS.repeated.string.google.cloud.speech.v1p1beta1.RecognitionConfig.alternative_language_codes) |
|
||||
| GOOGLE_SPEECH_SPEAKER_DIARIZATION | set to 1 to enable [speaker diarization](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.SpeakerDiarizationConfig) |
|
||||
| GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.SpeakerDiarizationConfig) |
|
||||
| GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.SpeakerDiarizationConfig) |
|
||||
| GOOGLE_SPEECH_METADATA_INTERACTION_TYPE | set to 'discussion', 'presentation', 'phone_call', 'voicemail', 'professionally_produced', 'voice_search', 'voice_command', or 'dictation' [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType) |
|
||||
| GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE | [read this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata) |
|
||||
| GOOGLE_SPEECH_METADATA_MICROPHONE_DISTANCE | set to 'nearfield', 'midfield', or 'farfield' [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance) |
|
||||
| GOOGLE_SPEECH_METADATA_ORIGINAL_MEDIA_TYPE | set to 'audio', or 'video' [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata.OriginalMediaType) |
|
||||
| GOOGLE_SPEECH_METADATA_RECORDING_DEVICE_TYPE | set to 'smartphone', 'pc', 'phone_line', 'vehicle', 'other_outdoor_device', or 'other_indoor_device' [per this](https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType)|
|
||||
| START_RECOGNIZING_ON_VAD | if set to 1 or true, do not begin streaming audio to google cloud until voice activity is detected.|
|
||||
| RECOGNIZER_VAD_MODE | An integer value 0-3 from less to more aggressive vad detection (default: 2).|
|
||||
| RECOGNIZER_VAD_VOICE_MS | The number of milliseconds of voice activity that is required to trigger the connection to google cloud, when START_RECOGNIZING_ON_VAD is set (default: 250).|
|
||||
| RECOGNIZER_VAD_DEBUG | if >0 vad debug logs will be generated (default: 0).|
|
||||
|
||||
|
||||
### Events
|
||||
**google_transcribe::transcription** - returns an interim or final transcription. The event contains a JSON body describing the transcription result:
|
||||
```js
|
||||
{
|
||||
"stability": 0,
|
||||
"is_final": true,
|
||||
"alternatives": [{
|
||||
"confidence": 0.96471,
|
||||
"transcript": "Donny was a good bowler, and a good man"
|
||||
}]
|
||||
}
|
||||
```
|
||||
|
||||
**google_transcribe::end_of_utterance** - returns an indication that an utterance has been detected. This may be returned prior to a final transcription. This event is only returned when GOOGLE_SPEECH_SINGLE_UTTERANCE is set to true.
|
||||
|
||||
**google_transcribe::end_of_transcript** - returned when a transcription operation has completed. If a final transcription has not been returned by now, it won't be. This event is only returned when GOOGLE_SPEECH_SINGLE_UTTERANCE is set to true.
|
||||
|
||||
**google_transcribe::no_audio_detected** - returned when google has returned an error indicating that no audio was received for a lengthy period of time.
|
||||
|
||||
**google_transcribe::max_duration_exceeded** - returned when google has returned an indication that a long-running transcription has been stopped due to a max duration limit (305 seconds) on their side. It is the application's responsibility to respond by starting a new transcription session, if desired.
|
||||
|
||||
**google_transcribe::no_audio_detected** - returned when google has not received any audio for some reason.
|
||||
|
||||
## Usage
|
||||
When using [drachtio-fsmrf](https://www.npmjs.com/package/drachtio-fsmrf), you can access this API command via the api method on the 'endpoint' object.
|
||||
```js
|
||||
ep.api('uuid_google_transcribe', `${ep.uuid} start en-US`);
|
||||
```
|
||||
## Examples
|
||||
[google_transcribe.js](../../examples/google_transcribe.js)
|
||||
727
mod_google_transcribe/google_glue.cpp
Normal file
727
mod_google_transcribe/google_glue.cpp
Normal file
@@ -0,0 +1,727 @@
|
||||
#include <cstdlib>
|
||||
#include <algorithm>
|
||||
#include <future>
|
||||
|
||||
#include <switch.h>
|
||||
#include <switch_json.h>
|
||||
#include <grpc++/grpc++.h>
|
||||
|
||||
#include "google/cloud/speech/v1p1beta1/cloud_speech.grpc.pb.h"
|
||||
|
||||
#include <switch_json.h>
|
||||
|
||||
#include "mod_google_transcribe.h"
|
||||
#include "simple_buffer.h"
|
||||
|
||||
using google::cloud::speech::v1p1beta1::RecognitionConfig;
|
||||
using google::cloud::speech::v1p1beta1::Speech;
|
||||
using google::cloud::speech::v1p1beta1::SpeechContext;
|
||||
using google::cloud::speech::v1p1beta1::StreamingRecognizeRequest;
|
||||
using google::cloud::speech::v1p1beta1::StreamingRecognizeResponse;
|
||||
using google::cloud::speech::v1p1beta1::SpeakerDiarizationConfig;
|
||||
using google::cloud::speech::v1p1beta1::SpeechAdaptation;
|
||||
using google::cloud::speech::v1p1beta1::PhraseSet;
|
||||
using google::cloud::speech::v1p1beta1::PhraseSet_Phrase;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_DISCUSSION;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_PRESENTATION;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_PHONE_CALL;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_VOICEMAIL;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_PROFESSIONALLY_PRODUCED;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_VOICE_SEARCH;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_VOICE_COMMAND;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_InteractionType_DICTATION;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_MicrophoneDistance_NEARFIELD;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_MicrophoneDistance_MIDFIELD;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_MicrophoneDistance_FARFIELD;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_OriginalMediaType_AUDIO;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_OriginalMediaType_VIDEO;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_SMARTPHONE;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_PC;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_PHONE_LINE;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_VEHICLE;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_OTHER_OUTDOOR_DEVICE;
|
||||
using google::cloud::speech::v1p1beta1::RecognitionMetadata_RecordingDeviceType_OTHER_INDOOR_DEVICE;
|
||||
using google::cloud::speech::v1p1beta1::StreamingRecognizeResponse_SpeechEventType_END_OF_SINGLE_UTTERANCE;
|
||||
using google::rpc::Status;
|
||||
|
||||
#define CHUNKSIZE (320)
|
||||
|
||||
namespace {
  // Case-insensitive string equality: returns 1 when s1 and s2 match ignoring
  // ASCII case, 0 otherwise. Used to compare channel-variable values against
  // the fixed keyword lists for the recognition metadata enums.
  int case_insensitive_match(std::string s1, std::string s2) {
    if (s1.size() != s2.size()) {
      return 0; // different lengths can never match
    }
    for (std::string::size_type i = 0; i < s1.size(); ++i) {
      if (::tolower(static_cast<unsigned char>(s1[i])) !=
          ::tolower(static_cast<unsigned char>(s2[i]))) {
        return 0; // not matched
      }
    }
    return 1; // the strings are the same
  }
}
|
||||
class GStreamer;
|
||||
|
||||
class GStreamer {
|
||||
public:
|
||||
GStreamer(
|
||||
switch_core_session_t *session,
|
||||
uint32_t channels,
|
||||
char* lang,
|
||||
int interim,
|
||||
uint32_t config_sample_rate,
|
||||
uint32_t samples_per_second,
|
||||
int single_utterance,
|
||||
int separate_recognition,
|
||||
int max_alternatives,
|
||||
int profanity_filter,
|
||||
int word_time_offset,
|
||||
int punctuation,
|
||||
const char* model,
|
||||
int enhanced,
|
||||
const char* hints) : m_session(session), m_writesDone(false), m_connected(false),
|
||||
m_audioBuffer(CHUNKSIZE, 15) {
|
||||
|
||||
const char* var;
|
||||
const char* google_uri;
|
||||
switch_channel_t *channel = switch_core_session_get_channel(session);
|
||||
|
||||
if (!(google_uri = switch_channel_get_variable(channel, "GOOGLE_SPEECH_TO_TEXT_URI"))) {
|
||||
google_uri = "speech.googleapis.com";
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_APPLICATION_CREDENTIALS")) {
|
||||
auto channelCreds = grpc::SslCredentials(grpc::SslCredentialsOptions());
|
||||
auto callCreds = grpc::ServiceAccountJWTAccessCredentials(var);
|
||||
auto creds = grpc::CompositeChannelCredentials(channelCreds, callCreds);
|
||||
m_channel = grpc::CreateChannel(google_uri, creds);
|
||||
}
|
||||
else {
|
||||
auto creds = grpc::GoogleDefaultCredentials();
|
||||
m_channel = grpc::CreateChannel(google_uri, creds);
|
||||
}
|
||||
|
||||
m_stub = Speech::NewStub(m_channel);
|
||||
|
||||
auto* streaming_config = m_request.mutable_streaming_config();
|
||||
RecognitionConfig* config = streaming_config->mutable_config();
|
||||
|
||||
streaming_config->set_interim_results(interim);
|
||||
if (single_utterance == 1) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_single_utterance\n");
|
||||
streaming_config->set_single_utterance(true);
|
||||
}
|
||||
else {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_single_utterance is FALSE\n");
|
||||
streaming_config->set_single_utterance(false);
|
||||
}
|
||||
|
||||
config->set_language_code(lang);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "transcribe language %s \n", lang);
|
||||
|
||||
config->set_sample_rate_hertz(config_sample_rate);
|
||||
|
||||
config->set_encoding(RecognitionConfig::LINEAR16);
|
||||
|
||||
// the rest of config comes from channel vars
|
||||
|
||||
// number of channels in the audio stream (default: 1)
|
||||
if (channels > 1) {
|
||||
config->set_audio_channel_count(channels);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "audio_channel_count %d\n", channels);
|
||||
|
||||
// transcribe each separately?
|
||||
if (separate_recognition == 1) {
|
||||
config->set_enable_separate_recognition_per_channel(true);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_separate_recognition_per_channel on\n");
|
||||
}
|
||||
}
|
||||
|
||||
// max alternatives
|
||||
if (max_alternatives > 1) {
|
||||
config->set_max_alternatives(max_alternatives);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "max_alternatives %d\n", max_alternatives);
|
||||
}
|
||||
|
||||
// profanity filter
|
||||
if (profanity_filter == 1) {
|
||||
config->set_profanity_filter(true);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "profanity_filter\n");
|
||||
}
|
||||
|
||||
// enable word offsets
|
||||
if (word_time_offset == 1) {
|
||||
config->set_enable_word_time_offsets(true);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_word_time_offsets\n");
|
||||
}
|
||||
|
||||
// enable automatic punctuation
|
||||
if (punctuation == 1) {
|
||||
config->set_enable_automatic_punctuation(true);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enable_automatic_punctuation\n");
|
||||
}
|
||||
else {
|
||||
config->set_enable_automatic_punctuation(false);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "disable_automatic_punctuation\n");
|
||||
}
|
||||
|
||||
// speech model
|
||||
if (model != NULL) {
|
||||
config->set_model(model);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "speech model %s\n", model);
|
||||
}
|
||||
|
||||
// use enhanced model
|
||||
if (enhanced == 1) {
|
||||
config->set_use_enhanced(true);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "use_enhanced\n");
|
||||
}
|
||||
|
||||
// hints
|
||||
if (hints != NULL) {
|
||||
auto* adaptation = config->mutable_adaptation();
|
||||
auto* phrase_set = adaptation->add_phrase_sets();
|
||||
auto *context = config->add_speech_contexts();
|
||||
float boost = -1;
|
||||
|
||||
// get boost setting for the phrase set in its entirety
|
||||
if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_HINTS_BOOST"))) {
|
||||
boost = (float) atof(switch_channel_get_variable(channel, "GOOGLE_SPEECH_HINTS_BOOST"));
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "boost value: %f\n", boost);
|
||||
phrase_set->set_boost(boost);
|
||||
}
|
||||
|
||||
// hints are either a simple comma-separated list of phrases, or a json array of objects
|
||||
// containing a phrase and a boost value
|
||||
auto *jHint = cJSON_Parse((char *) hints);
|
||||
if (jHint) {
|
||||
int i = 0;
|
||||
cJSON *jPhrase = NULL;
|
||||
cJSON_ArrayForEach(jPhrase, jHint) {
|
||||
auto* phrase = phrase_set->add_phrases();
|
||||
cJSON *jItem = cJSON_GetObjectItem(jPhrase, "phrase");
|
||||
if (jItem) {
|
||||
phrase->set_value(cJSON_GetStringValue(jItem));
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "phrase: %s\n", phrase->value().c_str());
|
||||
if (cJSON_GetObjectItem(jPhrase, "boost")) {
|
||||
phrase->set_boost((float) cJSON_GetObjectItem(jPhrase, "boost")->valuedouble);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "boost value: %f\n", phrase->boost());
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
cJSON_Delete(jHint);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "added %d hints\n", i);
|
||||
}
|
||||
else {
|
||||
char *phrases[500] = { 0 };
|
||||
int argc = switch_separate_string((char *) hints, ',', phrases, 500);
|
||||
for (int i = 0; i < argc; i++) {
|
||||
auto* phrase = phrase_set->add_phrases();
|
||||
phrase->set_value(phrases[i]);
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "added %d hints\n", argc);
|
||||
}
|
||||
}
|
||||
|
||||
// alternative language
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES")) {
|
||||
char *alt_langs[3] = { 0 };
|
||||
int argc = switch_separate_string((char *) var, ',', alt_langs, 3);
|
||||
for (int i = 0; i < argc; i++) {
|
||||
config->add_alternative_language_codes(alt_langs[i]);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "added alternative lang %s\n", alt_langs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// speaker diarization
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_SPEAKER_DIARIZATION")) {
|
||||
auto* diarization_config = config->mutable_diarization_config();
|
||||
diarization_config->set_enable_speaker_diarization(true);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "enabling speaker diarization\n", var);
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT")) {
|
||||
int count = std::max(atoi(var), 1);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "setting min speaker count to %d\n", count);
|
||||
diarization_config->set_min_speaker_count(count);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT")) {
|
||||
int count = std::max(atoi(var), 2);
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_DEBUG, "setting max speaker count to %d\n", count);
|
||||
diarization_config->set_max_speaker_count(count);
|
||||
}
|
||||
}
|
||||
|
||||
// recognition metadata
|
||||
auto* metadata = config->mutable_metadata();
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_INTERACTION_TYPE")) {
|
||||
if (case_insensitive_match("discussion", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_DISCUSSION);
|
||||
if (case_insensitive_match("presentation", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_PRESENTATION);
|
||||
if (case_insensitive_match("phone_call", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_PHONE_CALL);
|
||||
if (case_insensitive_match("voicemail", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_VOICEMAIL);
|
||||
if (case_insensitive_match("professionally_produced", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_PROFESSIONALLY_PRODUCED);
|
||||
if (case_insensitive_match("voice_search", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_VOICE_SEARCH);
|
||||
if (case_insensitive_match("voice_command", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_VOICE_COMMAND);
|
||||
if (case_insensitive_match("dictation", var)) metadata->set_interaction_type(RecognitionMetadata_InteractionType_DICTATION);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE")) {
|
||||
metadata->set_industry_naics_code_of_audio(atoi(var));
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_MICROPHONE_DISTANCE")) {
|
||||
if (case_insensitive_match("nearfield", var)) metadata->set_microphone_distance(RecognitionMetadata_MicrophoneDistance_NEARFIELD);
|
||||
if (case_insensitive_match("midfield", var)) metadata->set_microphone_distance(RecognitionMetadata_MicrophoneDistance_MIDFIELD);
|
||||
if (case_insensitive_match("farfield", var)) metadata->set_microphone_distance(RecognitionMetadata_MicrophoneDistance_FARFIELD);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_ORIGINAL_MEDIA_TYPE")) {
|
||||
if (case_insensitive_match("audio", var)) metadata->set_original_media_type(RecognitionMetadata_OriginalMediaType_AUDIO);
|
||||
if (case_insensitive_match("video", var)) metadata->set_original_media_type(RecognitionMetadata_OriginalMediaType_VIDEO);
|
||||
}
|
||||
if (var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_METADATA_RECORDING_DEVICE_TYPE")) {
|
||||
if (case_insensitive_match("smartphone", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_SMARTPHONE);
|
||||
if (case_insensitive_match("pc", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_PC);
|
||||
if (case_insensitive_match("phone_line", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_PHONE_LINE);
|
||||
if (case_insensitive_match("vehicle", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_VEHICLE);
|
||||
if (case_insensitive_match("other_outdoor_device", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_OTHER_OUTDOOR_DEVICE);
|
||||
if (case_insensitive_match("other_indoor_device", var)) metadata->set_recording_device_type(RecognitionMetadata_RecordingDeviceType_OTHER_INDOOR_DEVICE);
|
||||
}
|
||||
}
|
||||
|
||||
~GStreamer() {
|
||||
//switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(m_session), SWITCH_LOG_INFO, "GStreamer::~GStreamer - deleting channel and stub: %p\n", (void*)this);
|
||||
}
|
||||
|
||||
void connect() {
|
||||
assert(!m_connected);
|
||||
// Begin a stream.
|
||||
m_streamer = m_stub->StreamingRecognize(&m_context);
|
||||
m_connected = true;
|
||||
|
||||
// read thread is waiting on this
|
||||
m_promise.set_value();
|
||||
|
||||
// Write the first request, containing the config only.
|
||||
m_streamer->Write(m_request);
|
||||
|
||||
// send any buffered audio
|
||||
int nFrames = m_audioBuffer.getNumItems();
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "GStreamer %p got stream ready, %d buffered frames\n", this, nFrames);
|
||||
if (nFrames) {
|
||||
char *p;
|
||||
do {
|
||||
p = m_audioBuffer.getNextChunk();
|
||||
if (p) {
|
||||
write(p, CHUNKSIZE);
|
||||
}
|
||||
} while (p);
|
||||
}
|
||||
}
|
||||
|
||||
bool write(void* data, uint32_t datalen) {
|
||||
if (!m_connected) {
|
||||
if (datalen % CHUNKSIZE == 0) {
|
||||
m_audioBuffer.add(data, datalen);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
m_request.set_audio_content(data, datalen);
|
||||
bool ok = m_streamer->Write(m_request);
|
||||
return ok;
|
||||
}
|
||||
|
||||
uint32_t nextMessageSize(void) {
|
||||
uint32_t size = 0;
|
||||
m_streamer->NextMessageSize(&size);
|
||||
return size;
|
||||
}
|
||||
|
||||
bool read(StreamingRecognizeResponse* response) {
|
||||
return m_streamer->Read(response);
|
||||
}
|
||||
|
||||
grpc::Status finish() {
|
||||
return m_streamer->Finish();
|
||||
}
|
||||
|
||||
void writesDone() {
|
||||
// grpc crashes if we call this twice on a stream
|
||||
if (!m_connected) {
|
||||
cancelConnect();
|
||||
}
|
||||
else if (!m_writesDone) {
|
||||
m_streamer->WritesDone();
|
||||
m_writesDone = true;
|
||||
}
|
||||
}
|
||||
|
||||
bool waitForConnect() {
|
||||
std::shared_future<void> sf(m_promise.get_future());
|
||||
sf.wait();
|
||||
return m_connected;
|
||||
}
|
||||
|
||||
void cancelConnect() {
|
||||
assert(!m_connected);
|
||||
m_promise.set_value();
|
||||
}
|
||||
|
||||
bool isConnected() {
|
||||
return m_connected;
|
||||
}
|
||||
|
||||
private:
|
||||
switch_core_session_t* m_session;
|
||||
grpc::ClientContext m_context;
|
||||
std::shared_ptr<grpc::Channel> m_channel;
|
||||
std::unique_ptr<Speech::Stub> m_stub;
|
||||
std::unique_ptr< grpc::ClientReaderWriterInterface<StreamingRecognizeRequest, StreamingRecognizeResponse> > m_streamer;
|
||||
StreamingRecognizeRequest m_request;
|
||||
bool m_writesDone;
|
||||
bool m_connected;
|
||||
std::promise<void> m_promise;
|
||||
SimpleBuffer m_audioBuffer;
|
||||
};
|
||||
|
||||
// Per-session reader thread for the Google streaming-recognize gRPC call.
// Spawned from google_speech_session_init; it blocks on the bidirectional
// stream, converts each StreamingRecognizeResponse into a JSON payload,
// delivers it through the module's responseHandler callback, and finally
// drains the terminal gRPC status when the stream closes.
static void *SWITCH_THREAD_FUNC grpc_read_thread(switch_thread_t *thread, void *obj) {
  static int count;
  struct cap_cb *cb = (struct cap_cb *) obj;
  GStreamer* streamer = (GStreamer *) cb->streamer;

  // Block until the writer side has connected the stream (or given up).
  bool connected = streamer->waitForConnect();
  if (!connected) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "google transcribe grpc read thread exiting since we didnt connect\n") ;
    return nullptr;
  }

  // Read responses.
  StreamingRecognizeResponse response;
  while (streamer->read(&response)) {  // Returns false when no more to read.
    // Hold a read lock on the session for the duration of response handling;
    // exit the thread entirely if the session has already gone away.
    switch_core_session_t* session = switch_core_session_locate(cb->sessionId);
    if (!session) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "grpc_read_thread: session %s is gone!\n", cb->sessionId) ;
      return nullptr;
    }
    count++;
    auto speech_event_type = response.speech_event_type();

    // Surface stream-level errors to the application as a JSON "error" event.
    if (response.has_error()) {
      Status status = response.error();
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "grpc_read_thread: error %s (%d)\n", status.message().c_str(), status.code()) ;
      cJSON* json = cJSON_CreateObject();
      cJSON_AddStringToObject(json, "type", "error");
      cJSON_AddStringToObject(json, "error", status.message().c_str());
      char* jsonString = cJSON_PrintUnformatted(json);
      cb->responseHandler(session, jsonString, cb->bugname);
      free(jsonString);
      cJSON_Delete(json);
    }

    // If a prompt is being played, any recognizer response interrupts playback.
    if (cb->play_file == 1){
      cb->responseHandler(session, "play_interrupt", cb->bugname);
    }

    // Convert each recognition result (with alternatives and optional
    // word-level detail) into a JSON payload for the response handler.
    for (int r = 0; r < response.results_size(); ++r) {
      auto result = response.results(r);
      cJSON * jResult = cJSON_CreateObject();
      cJSON * jAlternatives = cJSON_CreateArray();
      cJSON * jStability = cJSON_CreateNumber(result.stability());
      cJSON * jIsFinal = cJSON_CreateBool(result.is_final());
      cJSON * jLanguageCode = cJSON_CreateString(result.language_code().c_str());
      cJSON * jChannelTag = cJSON_CreateNumber(result.channel_tag());

      // result_end_time is a protobuf Duration; flatten to integer milliseconds.
      auto duration = result.result_end_time();
      int32_t seconds = duration.seconds();
      int64_t nanos = duration.nanos();
      int span = (int) trunc(seconds * 1000. + ((float) nanos / 1000000.));
      cJSON * jResultEndTime = cJSON_CreateNumber(span);

      cJSON_AddItemToObject(jResult, "stability", jStability);
      cJSON_AddItemToObject(jResult, "is_final", jIsFinal);
      cJSON_AddItemToObject(jResult, "alternatives", jAlternatives);
      cJSON_AddItemToObject(jResult, "language_code", jLanguageCode);
      cJSON_AddItemToObject(jResult, "channel_tag", jChannelTag);
      cJSON_AddItemToObject(jResult, "result_end_time", jResultEndTime);

      for (int a = 0; a < result.alternatives_size(); ++a) {
        auto alternative = result.alternatives(a);
        cJSON* jAlt = cJSON_CreateObject();
        cJSON* jConfidence = cJSON_CreateNumber(alternative.confidence());
        cJSON* jTranscript = cJSON_CreateString(alternative.transcript().c_str());
        cJSON_AddItemToObject(jAlt, "confidence", jConfidence);
        cJSON_AddItemToObject(jAlt, "transcript", jTranscript);

        // Word-level entries are only present when word time offsets were requested.
        if (alternative.words_size() > 0) {
          cJSON * jWords = cJSON_CreateArray();
          switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: %d words\n", alternative.words_size()) ;
          for (int b = 0; b < alternative.words_size(); b++) {
            auto words = alternative.words(b);
            cJSON* jWord = cJSON_CreateObject();
            cJSON_AddItemToObject(jWord, "word", cJSON_CreateString(words.word().c_str()));
            // NOTE(review): start/end times emit whole seconds only; nanos are dropped.
            if (words.has_start_time()) {
              cJSON_AddItemToObject(jWord, "start_time", cJSON_CreateNumber(words.start_time().seconds()));
            }
            if (words.has_end_time()) {
              cJSON_AddItemToObject(jWord, "end_time", cJSON_CreateNumber(words.end_time().seconds()));
            }
            // Optional fields: only emit when the recognizer provided them.
            int speaker_tag = words.speaker_tag();
            if (speaker_tag > 0) {
              cJSON_AddItemToObject(jWord, "speaker_tag", cJSON_CreateNumber(speaker_tag));
            }
            float confidence = words.confidence();
            if (confidence > 0.0) {
              cJSON_AddItemToObject(jWord, "confidence", cJSON_CreateNumber(confidence));
            }

            cJSON_AddItemToArray(jWords, jWord);
          }
          cJSON_AddItemToObject(jAlt, "words", jWords);
        }
        cJSON_AddItemToArray(jAlternatives, jAlt);
      }

      char* json = cJSON_PrintUnformatted(jResult);
      cb->responseHandler(session, (const char *) json, cb->bugname);
      free(json);

      // Deleting the root frees all attached child objects as well.
      cJSON_Delete(jResult);
    }

    if (speech_event_type == StreamingRecognizeResponse_SpeechEventType_END_OF_SINGLE_UTTERANCE) {
      // we only get this when we have requested it, and recognition stops after we get this
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: got end_of_utterance\n") ;
      cb->got_end_of_utterance = 1;
      cb->responseHandler(session, "end_of_utterance", cb->bugname);
      if (cb->wants_single_utterance) {
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: sending writesDone because we want only a single utterance\n") ;
        streamer->writesDone();
      }
    }
    switch_core_session_rwunlock(session);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: got %d responses\n", response.results_size());
  }

  // Stream closed: collect the terminal gRPC status and translate the common
  // error code 11 (OUT_OF_RANGE) into application-level events.
  {
    switch_core_session_t* session = switch_core_session_locate(cb->sessionId);
    if (session) {
      grpc::Status status = streamer->finish();
      if (11 == status.error_code()) {
        if (std::string::npos != status.error_message().find("Exceeded maximum allowed stream duration")) {
          cb->responseHandler(session, "max_duration_exceeded", cb->bugname);
        }
        else {
          cb->responseHandler(session, "no_audio", cb->bugname);
        }
      }
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "grpc_read_thread: finish() status %s (%d)\n", status.error_message().c_str(), status.error_code()) ;
      switch_core_session_rwunlock(session);
    }
  }
  return nullptr;
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
// One-time module initialization: if GOOGLE_APPLICATION_CREDENTIALS is set,
// validate that gRPC can actually build default credentials from it.
// Returns SWITCH_STATUS_FALSE when the credentials are unusable.
switch_status_t google_speech_init() {
  const char* gcsServiceKeyFile = std::getenv("GOOGLE_APPLICATION_CREDENTIALS");
  if (gcsServiceKeyFile) {
    try {
      auto creds = grpc::GoogleDefaultCredentials();
      // GoogleDefaultCredentials reports most failures by returning a null
      // credentials object rather than throwing, so check the result too.
      if (!creds) {
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT,
          "Error initializing google api with provided credentials in %s: null credentials returned\n", gcsServiceKeyFile);
        return SWITCH_STATUS_FALSE;
      }
    } catch (const std::exception& e) {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT,
        "Error initializing google api with provided credentials in %s: %s\n", gcsServiceKeyFile, e.what());
      return SWITCH_STATUS_FALSE;
    }
  }
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// Module-level teardown counterpart to google_speech_init.
// Nothing is held at module scope, so there is nothing to release.
switch_status_t google_speech_cleanup() {
  return SWITCH_STATUS_SUCCESS;
}
|
||||
// Allocate per-session transcription state (cap_cb) from the session pool,
// optionally set up resampling and VAD gating, construct the GStreamer gRPC
// wrapper, and spawn the reader thread.  On success the opaque cap_cb is
// returned through ppUserData for use as the media-bug user data.
switch_status_t google_speech_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
  uint32_t to_rate, uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char *bugname,
  int single_utterance, int separate_recognition, int max_alternatives, int profanity_filter, int word_time_offset,
  int punctuation, const char* model, int enhanced, const char* hints, char* play_file, void **ppUserData) {

  switch_channel_t *channel = switch_core_session_get_channel(session);
  auto read_codec = switch_core_session_get_read_codec(session);
  uint32_t sampleRate = read_codec->implementation->actual_samples_per_second;
  struct cap_cb *cb;
  int err;

  // cap_cb lives in the session memory pool and is freed with the session.
  cb = (struct cap_cb *) switch_core_session_alloc(session, sizeof(*cb));
  // NOTE(review): strncpy does not NUL-terminate when the source is >= the
  // bound; assumes sessionId/bugname buffers are sized MAX_SESSION_ID+1 /
  // MAX_BUG_LEN+1 -- confirm against the struct declaration.
  strncpy(cb->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
  strncpy(cb->bugname, bugname, MAX_BUG_LEN);
  cb->got_end_of_utterance = 0;
  cb->wants_single_utterance = single_utterance;
  if (play_file != NULL){
    cb->play_file = 1;
  }

  switch_mutex_init(&cb->mutex, SWITCH_MUTEX_NESTED, switch_core_session_get_pool(session));

  // Only resample when the codec rate differs from the rate requested of google.
  if (sampleRate != to_rate) {
    cb->resampler = speex_resampler_init(channels, sampleRate, to_rate, SWITCH_RESAMPLE_QUALITY, &err);
    if (0 != err) {
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing resampler: %s.\n",
        switch_channel_get_name(channel), speex_resampler_strerror(err));
      return SWITCH_STATUS_FALSE;
    }
  } else {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s: no resampling needed for this call\n", switch_channel_get_name(channel));
  }
  cb->responseHandler = responseHandler;

  // allocate vad if we are delaying connecting to the recognizer until we detect speech
  if (switch_channel_var_true(channel, "START_RECOGNIZING_ON_VAD")) {
    cb->vad = switch_vad_init(sampleRate, channels);
    if (cb->vad) {
      const char* var;
      int mode = 2;
      int silence_ms = 150;
      int voice_ms = 250;
      int debug = 0;

      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_MODE"))) {
        mode = atoi(var);
      }
      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_SILENCE_MS"))) {
        silence_ms = atoi(var);
      }
      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_VOICE_MS"))) {
        voice_ms = atoi(var);
      }
      // Fixed copy-paste bug: this block previously re-read
      // RECOGNIZER_VAD_VOICE_MS a second time, leaving the local `debug`
      // (passed to switch_vad_set_param below) never settable from a
      // channel variable.
      if ((var = switch_channel_get_variable(channel, "RECOGNIZER_VAD_DEBUG"))) {
        debug = atoi(var);
      }
      switch_vad_set_mode(cb->vad, mode);
      switch_vad_set_param(cb->vad, "silence_ms", silence_ms);
      switch_vad_set_param(cb->vad, "voice_ms", voice_ms);
      switch_vad_set_param(cb->vad, "debug", debug);
    }
  }

  GStreamer *streamer = NULL;
  try {
    streamer = new GStreamer(session, channels, lang, interim, to_rate, sampleRate, single_utterance, separate_recognition, max_alternatives,
      profanity_filter, word_time_offset, punctuation, model, enhanced, hints);
    cb->streamer = streamer;
  } catch (std::exception& e) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error initializing gstreamer: %s.\n",
      switch_channel_get_name(channel), e.what());
    return SWITCH_STATUS_FALSE;
  }

  // Without VAD gating we connect immediately; otherwise the connect happens
  // in google_speech_frame when speech is first detected.
  if (!cb->vad) streamer->connect();

  // create the read thread
  switch_threadattr_t *thd_attr = NULL;
  switch_memory_pool_t *pool = switch_core_session_get_pool(session);

  switch_threadattr_create(&thd_attr, pool);
  switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE);
  switch_thread_create(&cb->thread, thd_attr, grpc_read_thread, cb, pool);

  *ppUserData = cb;
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// Tear down a transcription session: detach the bug private data, stop any
// prompt playback, half-close the gRPC stream, join the reader thread, and
// free the streamer/resampler/VAD.  Holds cb->mutex throughout to exclude
// the media-bug audio callback (google_speech_frame uses trylock).
switch_status_t google_speech_session_cleanup(switch_core_session_t *session, int channelIsClosing, switch_media_bug_t *bug) {
  switch_channel_t *channel = switch_core_session_get_channel(session);

  if (bug) {
    struct cap_cb *cb = (struct cap_cb *) switch_core_media_bug_get_user_data(bug);
    switch_mutex_lock(cb->mutex);

    // The channel private is cleared below as a "cleanup already started"
    // marker; if it is already gone another path beat us here.
    if (!switch_channel_get_private(channel, cb->bugname)) {
      // race condition
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached (race).\n", switch_channel_get_name(channel));
      switch_mutex_unlock(cb->mutex);
      return SWITCH_STATUS_FALSE;
    }
    switch_channel_set_private(channel, cb->bugname, NULL);

    // stop playback if available
    if (cb->play_file == 1){
      if (switch_channel_test_flag(channel, CF_BROADCAST)) {
        switch_channel_stop_broadcast(channel);
      } else {
        switch_channel_set_flag_value(channel, CF_BREAK, 1);
      }
    }

    // close connection and get final responses
    GStreamer* streamer = (GStreamer *) cb->streamer;

    if (streamer) {
      // Half-close the stream so the reader thread's read() loop drains and ends.
      streamer->writesDone();

      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: GStreamer (%p) waiting for read thread to complete\n", (void*)streamer);
      switch_status_t st;
      switch_thread_join(&st, cb->thread);
      switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: GStreamer (%p) read thread completed\n", (void*)streamer);

      // Safe to delete only after the reader thread has joined.
      delete streamer;
      cb->streamer = NULL;
    }

    if (cb->resampler) {
      speex_resampler_destroy(cb->resampler);
    }
    if (cb->vad) {
      switch_vad_destroy(&cb->vad);
      cb->vad = nullptr;
    }
    // When the channel itself is closing, FreeSWITCH removes the bug; only
    // remove it explicitly for a mid-call stop.
    if (!channelIsClosing) {
      switch_core_media_bug_remove(session, &bug);
    }

    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "google_speech_session_cleanup: Closed stream\n");

    switch_mutex_unlock(cb->mutex);

    return SWITCH_STATUS_SUCCESS;
  }

  switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s Bug is not attached.\n", switch_channel_get_name(channel));
  return SWITCH_STATUS_FALSE;
}
|
||||
|
||||
// Media-bug audio callback: drain all frames currently buffered in the bug,
// optionally gate the gRPC connect on VAD, resample when required, and write
// the PCM to the streamer.  Always returns SWITCH_TRUE to keep the bug alive.
switch_bool_t google_speech_frame(switch_media_bug_t *bug, void* user_data) {
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);
  struct cap_cb *cb = (struct cap_cb *) user_data;
  // Stop feeding audio once end-of-utterance was seen in single-utterance mode.
  if (cb->streamer && (!cb->wants_single_utterance || !cb->got_end_of_utterance)) {
    GStreamer* streamer = (GStreamer *) cb->streamer;
    uint8_t data[SWITCH_RECOMMENDED_BUFFER_SIZE];
    switch_frame_t frame = {};
    frame.data = data;
    frame.buflen = SWITCH_RECOMMENDED_BUFFER_SIZE;

    // trylock: never block the media thread.  If cleanup currently holds the
    // mutex we simply drop this round of audio.
    if (switch_mutex_trylock(cb->mutex) == SWITCH_STATUS_SUCCESS) {
      while (switch_core_media_bug_read(bug, &frame, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS && !switch_test_flag((&frame), SFF_CNG)) {
        if (frame.datalen) {
          // With VAD gating, delay connecting until speech actually starts.
          if (cb->vad && !streamer->isConnected()) {
            switch_vad_state_t state = switch_vad_process(cb->vad, (int16_t*) frame.data, frame.samples);
            if (state == SWITCH_VAD_STATE_START_TALKING) {
              switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "detected speech, connect to google speech now\n");
              streamer->connect();
              cb->responseHandler(session, "vad_detected", cb->bugname);
            }
          }

          if (cb->resampler) {
            spx_int16_t out[SWITCH_RECOMMENDED_BUFFER_SIZE];
            spx_uint32_t out_len = SWITCH_RECOMMENDED_BUFFER_SIZE;
            spx_uint32_t in_len = frame.samples;
            // (removed unused local `size_t written` from the original)

            speex_resampler_process_interleaved_int(cb->resampler,
              (const spx_int16_t *) frame.data,
              (spx_uint32_t *) &in_len,
              &out[0],
              &out_len);
            // out_len is updated in place to the samples actually produced.
            streamer->write( &out[0], sizeof(spx_int16_t) * out_len);
          }
          else {
            streamer->write( frame.data, sizeof(spx_int16_t) * frame.samples);
          }
        }
      }
      switch_mutex_unlock(cb->mutex);
    }
  }
  return SWITCH_TRUE;
}
|
||||
}
|
||||
13
mod_google_transcribe/google_glue.h
Normal file
13
mod_google_transcribe/google_glue.h
Normal file
@@ -0,0 +1,13 @@
|
||||
#ifndef __GOOGLE_GLUE_H__
|
||||
#define __GOOGLE_GLUE_H__
|
||||
|
||||
switch_status_t google_speech_init();
|
||||
switch_status_t google_speech_cleanup();
|
||||
switch_status_t google_speech_session_init(switch_core_session_t *session, responseHandler_t responseHandler,
|
||||
uint32_t to_rate, uint32_t samples_per_second, uint32_t channels, char* lang, int interim, char *bugname, int single_utterence,
|
||||
int separate_recognition, int max_alternatives, int profinity_filter, int word_time_offset, int punctuation, const char* model, int enhanced,
|
||||
const char* hints, char* play_file, void **ppUserData);
|
||||
switch_status_t google_speech_session_cleanup(switch_core_session_t *session, int channelIsClosing, switch_media_bug_t *bug);
|
||||
switch_bool_t google_speech_frame(switch_media_bug_t *bug, void* user_data);
|
||||
|
||||
#endif
|
||||
484
mod_google_transcribe/mod_google_transcribe.c
Normal file
484
mod_google_transcribe/mod_google_transcribe.c
Normal file
@@ -0,0 +1,484 @@
|
||||
/*
|
||||
*
|
||||
* mod_google_transcribe.c -- Freeswitch module for real-time transcription using google's gRPC interface
|
||||
*
|
||||
*/
|
||||
#include "mod_google_transcribe.h"
|
||||
#include "google_glue.h"
|
||||
#include <stdlib.h>
|
||||
#include <switch.h>
|
||||
|
||||
/* Sample rate requested of the recognizer when the API command does not
 * supply one explicitly. */
static const uint32_t DEFAULT_SAMPLE_RATE = 8000;

/* Prototypes */
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_transcribe_shutdown);
SWITCH_MODULE_RUNTIME_FUNCTION(mod_transcribe_runtime);
SWITCH_MODULE_LOAD_FUNCTION(mod_transcribe_load);

SWITCH_MODULE_DEFINITION(mod_google_transcribe, mod_transcribe_load, mod_transcribe_shutdown, NULL);

/* Stops an active transcription on the session's media bug named `bugname`. */
static switch_status_t do_stop(switch_core_session_t *session, char* bugname);
|
||||
|
||||
|
||||
/* Callback invoked by the glue layer (reader thread / frame callback) with
 * either a sentinel string ("vad_detected", "end_of_utterance", ...) or a
 * JSON transcription payload.  Translates each into the corresponding
 * custom FreeSWITCH event fired on the channel.
 *
 * Fix: corrected the "inturrupt" typo in the two play-interrupt log messages.
 */
static void responseHandler(switch_core_session_t* session, const char * json, const char* bugname) {
  switch_event_t *event;
  switch_channel_t *channel = switch_core_session_get_channel(session);

  /* NOTE(review): the switch_event_create_subclass results below are not
   * checked; if creation ever failed, `event` would be used uninitialized --
   * confirm whether the core guarantees success here. */
  if (0 == strcmp("vad_detected", json)) {
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_VAD_DETECTED);
    switch_channel_event_set_data(channel, event);
    switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
  }
  else if (0 == strcmp("end_of_utterance", json)) {
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_END_OF_UTTERANCE);
    switch_channel_event_set_data(channel, event);
    switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
  }
  else if (0 == strcmp("end_of_transcript", json)) {
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_END_OF_TRANSCRIPT);
    switch_channel_event_set_data(channel, event);
    switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
  }
  else if (0 == strcmp("start_of_transcript", json)) {
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_START_OF_TRANSCRIPT);
    switch_channel_event_set_data(channel, event);
    switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
  }
  else if (0 == strcmp("max_duration_exceeded", json)) {
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED);
    switch_channel_event_set_data(channel, event);
    switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
  }
  else if (0 == strcmp("no_audio", json)) {
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_NO_AUDIO_DETECTED);
    switch_channel_event_set_data(channel, event);
    switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
  }
  else if (0 == strcmp("play_interrupt", json)){
    /* Queue a DETECTED_SPEECH event so the in-progress switch_ivr_play_file
     * (see start_capture2) breaks out via transcribe_input_callback. */
    switch_event_t *qevent;
    switch_status_t status;
    if (switch_event_create(&qevent, SWITCH_EVENT_DETECTED_SPEECH) == SWITCH_STATUS_SUCCESS) {
      if ((status = switch_core_session_queue_event(session, &qevent)) != SWITCH_STATUS_SUCCESS){
        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "unable to queue play interrupt event %d \n", status);
      }
    }else{
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "unable to create play interrupt event \n");
    }
    switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_PLAY_INTERRUPT);
    switch_channel_event_set_data(channel, event);
    switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
  }
  else {
    /* JSON payload: inspect it to distinguish error reports from results. */
    int error = 0;
    cJSON* jMessage = cJSON_Parse(json);
    if (jMessage) {
      const char* type = cJSON_GetStringValue(cJSON_GetObjectItem(jMessage, "type"));
      if (type && 0 == strcmp(type, "error")) {
        error = 1;
        switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_ERROR);
      }
      cJSON_Delete(jMessage);
    }
    if (!error) {
      switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, TRANSCRIBE_EVENT_RESULTS);
    }
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s json payload: %s.\n", bugname ? bugname : "google_transcribe", json);
    switch_channel_event_set_data(channel, event);
    switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "transcription-vendor", "google");
    switch_event_add_body(event, "%s", json);
  }
  if (bugname) switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "media-bugname", bugname);
  switch_event_fire(&event);
}
|
||||
|
||||
/* Media-bug dispatcher: announces start/end of the transcript on bug
 * init/close and forwards read-side audio to google_speech_frame.
 * Returning SWITCH_TRUE keeps the bug attached. */
static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch_abc_type_t type)
{
  switch_core_session_t *session = switch_core_media_bug_get_session(bug);
  struct cap_cb* cb = (struct cap_cb*) switch_core_media_bug_get_user_data(bug);

  if (type == SWITCH_ABC_TYPE_INIT) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Got SWITCH_ABC_TYPE_INIT.\n");
    responseHandler(session, "start_of_transcript", cb->bugname);
  } else if (type == SWITCH_ABC_TYPE_CLOSE) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Got SWITCH_ABC_TYPE_CLOSE, calling google_speech_session_cleanup.\n");
    responseHandler(session, "end_of_transcript", cb->bugname);
    /* channelIsClosing = 1: the core is tearing the bug down for us. */
    google_speech_session_cleanup(session, 1, bug);
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Finished SWITCH_ABC_TYPE_CLOSE.\n");
  } else if (type == SWITCH_ABC_TYPE_READ) {
    return google_speech_frame(bug, user_data);
  }
  /* SWITCH_ABC_TYPE_WRITE and all other types: nothing to do. */

  return SWITCH_TRUE;
}
|
||||
|
||||
/* Input callback installed during prompt playback (start_capture2): breaks
 * the playback loop when the DETECTED_SPEECH event queued by responseHandler
 * arrives; everything else lets playback continue. */
static switch_status_t transcribe_input_callback(switch_core_session_t *session, void *input, switch_input_type_t input_type, void *data, unsigned int len){
  if (input_type != SWITCH_INPUT_TYPE_EVENT) {
    return SWITCH_STATUS_SUCCESS;
  }
  switch_event_t *evt = (switch_event_t *)input;
  return (evt->event_id == SWITCH_EVENT_DETECTED_SPEECH) ? SWITCH_STATUS_BREAK : SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/* Stop a running transcription identified by its media-bug name.
 * A missing bug is not an error: returns SWITCH_STATUS_SUCCESS unchanged. */
static switch_status_t do_stop(switch_core_session_t *session, char *bugname)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug = switch_channel_get_private(channel, bugname);

  if (!bug) {
    return SWITCH_STATUS_SUCCESS;
  }

  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Received user command command, calling google_speech_session_cleanup (possibly to stop prev transcribe)\n");
  switch_status_t rc = google_speech_session_cleanup(session, 0, bug);
  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "stopped transcription.\n");
  return rc;
}
|
||||
|
||||
/* Start a transcription for the "transcribe2" API: all recognizer options
 * arrive as explicit arguments rather than channel variables.  Optionally
 * plays a prompt that is interrupted when speech is detected (see
 * transcribe_input_callback / responseHandler "play_interrupt").
 * Always uses the fixed bug name MY_BUG_NAME. */
static switch_status_t start_capture2(switch_core_session_t *session, switch_media_bug_flag_t flags,
  uint32_t sample_rate, char* lang, int interim, int single_utterance, int separate_recognition, int max_alternatives,
  int profinity_filter, int word_time_offset, int punctuation, const char* model, int enhanced, const char* hints, char* play_file)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug;
  switch_status_t status;
  switch_codec_implementation_t read_impl = { 0 };
  void *pUserData;
  uint32_t samples_per_second;
  switch_input_args_t args = { 0 };

  /* A second start replaces any transcription already running on this bug name. */
  if (switch_channel_get_private(channel, MY_BUG_NAME)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing bug from previous transcribe\n");
    do_stop(session, MY_BUG_NAME);
  }

  switch_core_session_get_read_impl(session, &read_impl);

  if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
    return SWITCH_STATUS_FALSE;
  }

  /* g722 reports 16k actual vs 8k nominal; use the actual rate in that case. */
  samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;

  if (SWITCH_STATUS_FALSE == google_speech_session_init(session, responseHandler, sample_rate, samples_per_second, flags & SMBF_STEREO ? 2 : 1, lang, interim, MY_BUG_NAME, single_utterance,
    separate_recognition, max_alternatives, profinity_filter, word_time_offset, punctuation, model, enhanced, hints, play_file, &pUserData)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing google speech session.\n");
    return SWITCH_STATUS_FALSE;
  }
  /* NOTE(review): if bug_add fails here, the session state created by
   * google_speech_session_init (streamer + read thread) is not torn down. */
  if ((status = switch_core_media_bug_add(session, "google_transcribe", NULL, capture_callback, pUserData, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
    return status;
  }

  switch_channel_set_private(channel, MY_BUG_NAME, bug);

  /* play the prompt, looking for detection result */
  if (play_file != NULL){
    args.input_callback = transcribe_input_callback;
    switch_ivr_play_file(session, NULL, play_file, &args);
  }

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
/* Start a transcription for the original "transcribe" API: recognizer
 * options are read from GOOGLE_SPEECH_* channel variables, the sample rate
 * is fixed at DEFAULT_SAMPLE_RATE, and the media bug name is caller-chosen
 * (allowing multiple concurrent transcriptions per session). */
static switch_status_t start_capture(switch_core_session_t *session, switch_media_bug_flag_t flags,
  char* lang, int interim, char* bugname)
{
  switch_channel_t *channel = switch_core_session_get_channel(session);
  switch_media_bug_t *bug;
  switch_status_t status;
  switch_codec_implementation_t read_impl = { 0 };
  void *pUserData;
  uint32_t samples_per_second;
  int single_utterance = 0, separate_recognition = 0, max_alternatives = 0, profanity_filter = 0, word_time_offset = 0, punctuation = 0, enhanced = 0;
  const char* hints = NULL;
  const char* model = NULL;
  const char* var;

  /* A second start replaces any transcription already running on this bug name. */
  if (switch_channel_get_private(channel, bugname)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "removing bug from previous transcribe\n");
    do_stop(session, bugname);
  }

  if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_SINGLE_UTTERANCE"))) {
    single_utterance = 1;
  }

  // transcribe each separately?
  if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL"))) {
    separate_recognition = 1;
  }

  // max alternatives
  if ((var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_MAX_ALTERNATIVES"))) {
    max_alternatives = atoi(var);
  }

  // profanity filter
  if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_PROFANITY_FILTER"))) {
    profanity_filter = 1;
  }

  // enable word offsets
  if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS"))) {
    word_time_offset = 1;
  }

  // enable automatic punctuation
  if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION"))) {
    punctuation = 1;
  }

  // speech model
  if ((var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_MODEL"))) {
    model = var;
  }

  // use enhanced model
  if (switch_true(switch_channel_get_variable(channel, "GOOGLE_SPEECH_USE_ENHANCED"))) {
    enhanced = 1;
  }

  // hints
  if ((var = switch_channel_get_variable(channel, "GOOGLE_SPEECH_HINTS"))) {
    hints = var;
  }

  switch_core_session_get_read_impl(session, &read_impl);

  if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
    return SWITCH_STATUS_FALSE;
  }

  /* g722 reports 16k actual vs 8k nominal; use the actual rate in that case. */
  samples_per_second = !strcasecmp(read_impl.iananame, "g722") ? read_impl.actual_samples_per_second : read_impl.samples_per_second;

  if (SWITCH_STATUS_FALSE == google_speech_session_init(session, responseHandler, DEFAULT_SAMPLE_RATE, samples_per_second, flags & SMBF_STEREO ? 2 : 1, lang, interim, bugname, single_utterance,
    separate_recognition, max_alternatives, profanity_filter, word_time_offset, punctuation, model, enhanced, hints, NULL, &pUserData)) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error initializing google speech session.\n");
    return SWITCH_STATUS_FALSE;
  }

  if ((status = switch_core_media_bug_add(session, bugname, NULL, capture_callback, pUserData, 0, flags, &bug)) != SWITCH_STATUS_SUCCESS) {
    return status;
  }

  switch_channel_set_private(channel, bugname, bug);

  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
// #define TRANSCRIBE_API_SYNTAX "<uuid> [start|stop] [lang-code] [interim] [single-utterance](bool) [seperate-recognition](bool) [max-alternatives](int) [profinity-filter](bool) [word-time](bool) [punctuation](bool) [model](string) [enhanced](true) [hints](string without space) [play-file]"
#define TRANSCRIBE2_API_SYNTAX "<uuid> [start|stop] [lang-code] [interim] [single-utterance] [seperate-recognition] [max-alternatives] [profinity-filter] [word-time] [punctuation] [sample-rate] [model] [enhanced] [hints] [play-file]"
/* API command "transcribe2": start/stop transcription with all recognizer
 * options passed positionally on the command line.  "start" requires at
 * least 10 arguments (through [punctuation]); later arguments are optional. */
SWITCH_STANDARD_API(transcribe2_function)
{
  char *mycmd = NULL, *argv[20] = { 0 };
  int argc = 0, enhanced = 0;
  uint32_t sample_rate = DEFAULT_SAMPLE_RATE;
  const char* hints = NULL;
  const char* model = NULL;
  char* play_file = NULL;

  switch_status_t status = SWITCH_STATUS_FALSE;
  switch_media_bug_flag_t flags = SMBF_READ_STREAM /* | SMBF_WRITE_STREAM | SMBF_READ_PING */;

  if (!zstr(cmd) && (mycmd = strdup(cmd))) {
    argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
  }

  /* argc < 2 is checked before argv[1] is dereferenced (short-circuit). */
  if (zstr(cmd) ||
    (argc < 2) ||
    (!strcasecmp(argv[1], "start") && argc < 10) ||
    zstr(argv[0])) {
    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s.\n", cmd, argv[0]);
    stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE2_API_SYNTAX);
    goto done;
  } else {
    switch_core_session_t *lsession = NULL;

    if ((lsession = switch_core_session_locate(argv[0]))) {
      if (!strcasecmp(argv[1], "stop")) {
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "stop transcribing\n");
        status = do_stop(lsession, MY_BUG_NAME);
      } else if (!strcasecmp(argv[1], "start")) {
        char* lang = argv[2];
        int interim = argc > 3 && !strcmp(argv[3], "true");
        int single_utterance = !strcmp(argv[4], "true"); // single-utterance
        int separate_recognition = !strcmp(argv[5], "true"); // sepreate-recognition
        int max_alternatives = atoi(argv[6]); // max-alternatives
        int profinity_filter = !strcmp(argv[7], "true"); // profinity-filter
        int word_time_offset = !strcmp(argv[8], "true"); // word-time
        int punctuation = !strcmp(argv[9], "true"); //punctuation
        if (argc > 10) {
          sample_rate = atol(argv[10]);
        }
        /* NOTE(review): model is only honored when `enhanced` is also supplied
         * (argc > 12); passing 12 args sets neither. */
        if (argc > 12){
          model = argv[11]; // model
          enhanced = !strcmp(argv[12], "true"); // enhanced
        }
        if (argc > 13){
          hints = argv[13]; // hints
        }
        if (argc > 14){
          play_file = argv[14];
        }
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "start transcribing %s %s\n", lang, interim ? "interim": "complete");
        status = start_capture2(lsession, flags, sample_rate, lang, interim, single_utterance, separate_recognition,max_alternatives,
          profinity_filter, word_time_offset, punctuation, model, enhanced, hints, play_file);
      }
      switch_core_session_rwunlock(lsession);
    }
  }

  if (status == SWITCH_STATUS_SUCCESS) {
    stream->write_function(stream, "+OK Success\n");
  } else {
    stream->write_function(stream, "-ERR Operation Failed\n");
  }

done:

  switch_safe_free(mycmd);
  return SWITCH_STATUS_SUCCESS;
}
|
||||
|
||||
#define TRANSCRIBE_API_SYNTAX "<uuid> [start|stop] [lang-code] [interim|full] [stereo|mono] [bug-name]"
|
||||
SWITCH_STANDARD_API(transcribe_function)
|
||||
{
|
||||
char *mycmd = NULL, *argv[6] = { 0 };
|
||||
int argc = 0;
|
||||
switch_status_t status = SWITCH_STATUS_FALSE;
|
||||
switch_media_bug_flag_t flags = SMBF_READ_STREAM /* | SMBF_WRITE_STREAM | SMBF_READ_PING */;
|
||||
|
||||
if (!zstr(cmd) && (mycmd = strdup(cmd))) {
|
||||
argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
|
||||
}
|
||||
|
||||
if (zstr(cmd) ||
|
||||
(!strcasecmp(argv[1], "stop") && argc < 2) ||
|
||||
(!strcasecmp(argv[1], "start") && argc < 3) ||
|
||||
zstr(argv[0])) {
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
|
||||
stream->write_function(stream, "-USAGE: %s\n", TRANSCRIBE_API_SYNTAX);
|
||||
goto done;
|
||||
} else {
|
||||
switch_core_session_t *lsession = NULL;
|
||||
|
||||
if ((lsession = switch_core_session_locate(argv[0]))) {
|
||||
if (!strcasecmp(argv[1], "stop")) {
|
||||
char *bugname = argc > 2 ? argv[2] : MY_BUG_NAME;
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "stop transcribing\n");
|
||||
status = do_stop(lsession, bugname);
|
||||
} else if (!strcasecmp(argv[1], "start")) {
|
||||
char* lang = argv[2];
|
||||
int interim = argc > 3 && !strcmp(argv[3], "interim");
|
||||
char *bugname = argc > 5 ? argv[5] : MY_BUG_NAME;
|
||||
if (argc > 4 && !strcmp(argv[4], "stereo")) {
|
||||
flags |= SMBF_WRITE_STREAM ;
|
||||
flags |= SMBF_STEREO;
|
||||
}
|
||||
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "%s start transcribing %s %s\n", bugname, lang, interim ? "interim": "complete");
|
||||
status = start_capture(lsession, flags, lang, interim, bugname);
|
||||
}
|
||||
switch_core_session_rwunlock(lsession);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == SWITCH_STATUS_SUCCESS) {
|
||||
stream->write_function(stream, "+OK Success\n");
|
||||
} else {
|
||||
stream->write_function(stream, "-ERR Operation Failed\n");
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
switch_safe_free(mycmd);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
SWITCH_MODULE_LOAD_FUNCTION(mod_transcribe_load)
|
||||
{
|
||||
switch_api_interface_t *api_interface;
|
||||
|
||||
/* create/register custom event message type */
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_RESULTS) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_RESULTS);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_END_OF_UTTERANCE) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_END_OF_UTTERANCE);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_START_OF_TRANSCRIPT) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_START_OF_TRANSCRIPT);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_END_OF_TRANSCRIPT) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_END_OF_TRANSCRIPT);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_NO_AUDIO_DETECTED) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_NO_AUDIO_DETECTED);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
|
||||
if (switch_event_reserve_subclass(TRANSCRIBE_EVENT_PLAY_INTERRUPT) != SWITCH_STATUS_SUCCESS) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", TRANSCRIBE_EVENT_PLAY_INTERRUPT);
|
||||
return SWITCH_STATUS_TERM;
|
||||
}
|
||||
|
||||
/* connect my internal structure to the blank pointer passed to me */
|
||||
*module_interface = switch_loadable_module_create_module_interface(pool, modname);
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Google Speech Transcription API loading..\n");
|
||||
|
||||
if (SWITCH_STATUS_FALSE == google_speech_init()) {
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Failed initializing google speech interface\n");
|
||||
}
|
||||
|
||||
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Google Speech Transcription API successfully loaded\n");
|
||||
|
||||
SWITCH_ADD_API(api_interface, "uuid_google_transcribe", "Google Speech Transcription API", transcribe_function, TRANSCRIBE_API_SYNTAX);
|
||||
SWITCH_ADD_API(api_interface, "uuid_google_transcribe2", "Google Speech Transcription API", transcribe2_function, TRANSCRIBE2_API_SYNTAX);
|
||||
switch_console_set_complete("add uuid_google_transcribe start lang-code");
|
||||
switch_console_set_complete("add uuid_google_transcribe stop ");
|
||||
|
||||
/* indicate that the module should continue to be loaded */
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Called when the system shuts down
|
||||
Macro expands to: switch_status_t mod_google_transcribe_shutdown() */
|
||||
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_transcribe_shutdown)
|
||||
{
|
||||
google_speech_cleanup();
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_RESULTS);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_UTTERANCE);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_START_OF_TRANSCRIPT);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_TRANSCRIPT);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_NO_AUDIO_DETECTED);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_END_OF_UTTERANCE);
|
||||
switch_event_free_subclass(TRANSCRIBE_EVENT_PLAY_INTERRUPT);
|
||||
return SWITCH_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
58
mod_google_transcribe/mod_google_transcribe.h
Normal file
58
mod_google_transcribe/mod_google_transcribe.h
Normal file
@@ -0,0 +1,58 @@
|
||||
#ifndef __MOD_GOOGLE_TRANSCRIBE_H__
|
||||
#define __MOD_GOOGLE_TRANSCRIBE_H__
|
||||
|
||||
#include <switch.h>
|
||||
#include <speex/speex_resampler.h>
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
#define MAX_SESSION_ID (256)
|
||||
#define MAX_BUG_LEN (64)
|
||||
#define MY_BUG_NAME "google_transcribe"
|
||||
#define TRANSCRIBE_EVENT_RESULTS "google_transcribe::transcription"
|
||||
#define TRANSCRIBE_EVENT_END_OF_UTTERANCE "google_transcribe::end_of_utterance"
|
||||
#define TRANSCRIBE_EVENT_START_OF_TRANSCRIPT "google_transcribe::start_of_transcript"
|
||||
#define TRANSCRIBE_EVENT_END_OF_TRANSCRIPT "google_transcribe::end_of_transcript"
|
||||
#define TRANSCRIBE_EVENT_NO_AUDIO_DETECTED "google_transcribe::no_audio_detected"
|
||||
#define TRANSCRIBE_EVENT_MAX_DURATION_EXCEEDED "google_transcribe::max_duration_exceeded"
|
||||
#define TRANSCRIBE_EVENT_PLAY_INTERRUPT "google_transcribe::play_interrupt"
|
||||
#define TRANSCRIBE_EVENT_VAD_DETECTED "google_transcribe::vad_detected"
|
||||
#define TRANSCRIBE_EVENT_ERROR "jambonz_transcribe::error"
|
||||
|
||||
|
||||
// simply write a wave file
|
||||
//#define DEBUG_TRANSCRIBE 0
|
||||
|
||||
|
||||
#ifdef DEBUG_TRANSCRIBE

/* per-channel data (debug build: captured audio is written to a wave file
   instead of being streamed to Google) */
struct cap_cb {
  switch_buffer_t *buffer;        /* holds captured audio pending write */
  switch_mutex_t *mutex;          /* serializes access from the media bug callback */
  char *base;
  SpeexResamplerState *resampler; /* presumably converts channel rate for the file -- TODO confirm */
  FILE* fp;                       /* the debug wave file being written */
};
#else
/* per-channel data */
/* invoked with each transcription result; json is the serialized result
   payload, bugname identifies the media bug instance that produced it */
typedef void (*responseHandler_t)(switch_core_session_t* session, const char* json, const char* bugname);

/* per-channel state attached to the media bug for one transcribe session */
struct cap_cb {
  switch_mutex_t *mutex;              /* serializes access from the media bug callback */
  char bugname[MAX_BUG_LEN+1];        /* name of the media bug (default MY_BUG_NAME) */
  char sessionId[MAX_SESSION_ID+1];   /* owning session uuid */
  char *base;
  SpeexResamplerState *resampler;     /* presumably resamples audio to the recognizer's rate -- TODO confirm */
  void* streamer;                     /* opaque handle owned by the C++ glue layer (google_glue.cpp) -- TODO confirm */
  responseHandler_t responseHandler;  /* callback that delivers transcription results */
  switch_thread_t* thread;
  int wants_single_utterance;         /* nonzero if single_utterance mode was requested */
  int got_end_of_utterance;           /* set once an end-of-utterance has been observed */
  int play_file;
  switch_vad_t * vad;                 /* optional voice activity detector */
  uint32_t samples_per_second;        /* capture sample rate */
};
#endif
|
||||
|
||||
#endif
|
||||
51
mod_google_transcribe/simple_buffer.h
Normal file
51
mod_google_transcribe/simple_buffer.h
Normal file
@@ -0,0 +1,51 @@
|
||||
/**
 * (very) simple and limited circular buffer of fixed-size chunks,
 * supporting only the use case of doing all of the adds and then
 * subsequently retrieving them.
 *
 * NOTE(review): retrieval starts at the current write position, so reads are
 * only meaningful once at least m_numChunks chunks have been added (buffer
 * full/wrapped); with a partially filled buffer getNextChunk() can return
 * unwritten memory -- confirm callers always fill to capacity.
 *
 * Fixes vs the original:
 *  - getNextChunk() used `if (numItems--)`, which wrapped the unsigned
 *    counter to UINT32_MAX when the buffer was empty, corrupting
 *    getNumItems() and all subsequent retrievals.
 *  - member-initializer order now matches declaration order (-Wreorder).
 *  - chunkSize of 0 no longer divides by zero in add().
 *  - the class owns raw heap memory, so copying is disabled (double-delete).
 */
class SimpleBuffer {
public:
  SimpleBuffer(uint32_t chunkSize, uint32_t numChunks) :
    m_pData(new char[chunkSize * numChunks]),
    numItems(0), m_chunkSize(chunkSize), m_numChunks(numChunks) {
    m_pNextWrite = m_pData;
  }
  ~SimpleBuffer() {
    delete [] m_pData;
  }

  /* non-copyable: owns raw heap memory */
  SimpleBuffer(const SimpleBuffer&) = delete;
  SimpleBuffer& operator=(const SimpleBuffer&) = delete;

  /**
   * Append whole chunks; datalen must be a multiple of the chunk size or the
   * call is silently ignored (original behavior preserved). Once the buffer
   * is full the oldest chunks are overwritten.
   */
  void add(void *data, uint32_t datalen) {
    if (m_chunkSize == 0 || datalen % m_chunkSize != 0) return;
    uint32_t chunks = datalen / m_chunkSize;
    for (uint32_t i = 0; i < chunks; i++) {
      memcpy(m_pNextWrite, data, m_chunkSize);
      data = static_cast<char*>(data) + m_chunkSize;
      if (numItems < m_numChunks) numItems++;
      advance();
    }
  }

  /**
   * Retrieve the next chunk, or nullptr when the buffer is empty.
   * The returned pointer references internal storage and is valid until the
   * slot is overwritten by a later add().
   */
  char* getNextChunk() {
    if (numItems == 0) return nullptr;  /* was `if (numItems--)`: underflowed when empty */
    numItems--;
    char *p = m_pNextWrite;
    advance();
    return p;
  }

  /** Number of chunks currently available for retrieval. */
  uint32_t getNumItems() const { return numItems; }

private:
  /* step the cursor one chunk forward, wrapping after the last slot */
  void advance() {
    uint32_t offset = static_cast<uint32_t>((m_pNextWrite - m_pData) / m_chunkSize);
    if (offset >= m_numChunks - 1) m_pNextWrite = m_pData;
    else m_pNextWrite += m_chunkSize;
  }

  char *m_pData;          /* owned storage: m_chunkSize * m_numChunks bytes */
  uint32_t numItems;      /* chunks currently held (saturates at m_numChunks) */
  uint32_t m_chunkSize;
  uint32_t m_numChunks;
  char* m_pNextWrite;     /* cursor shared by add() and getNextChunk() */
};
|
||||
Reference in New Issue
Block a user