support mod_vad_detect (#762)

* support mod_vad_detect

* wip

* update verb spec and drachtio fsmrf

* Update example-voicemail-greetings.json (#761)

Update voicemail english greetings

* wip

* stopvad if playdone

---------

Co-authored-by: Vinod Dharashive <vdharashive@gmail.com>
Author: Hoan Luu Huu
Date: 2024-05-29 18:31:59 +07:00
Committed by: GitHub
Parent: 24b6d2464b
Commit: 498dd64025
7 changed files with 73 additions and 30 deletions
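
The diff below threads a new vad option through the config and gather verbs and starts mod_vad_detect on the endpoint when a gather has minBargeinWordCount set to 0. Here is a minimal sketch of an application payload that would exercise it, assuming the schema added in @jambonz/verb-specifications ^0.0.71 accepts the same properties the tasks read in this commit (enable, voiceMs, silenceMs, strategy, mode); the prompt text and actionHook URL are illustrative only and not part of this commit:

[
  {
    "verb": "config",
    "vad": {
      "enable": true,
      "strategy": "one-shot",
      "voiceMs": 250,
      "silenceMs": 150,
      "mode": 2
    }
  },
  {
    "verb": "gather",
    "input": ["speech"],
    "minBargeinWordCount": 0,
    "say": {"text": "How can I help you today?"},
    "actionHook": "/gathered"
  }
]

With minBargeinWordCount of 0 the gather task enables VAD-based barge-in, and the config-level vad settings become the session defaults that each gather merges with its own vad property.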

@@ -338,6 +338,17 @@ class CallSession extends Emitter {
     this.application.fallback_speech_recognizer_language = language;
   }
 
+  /**
+   * Vad
+   */
+  get vad() {
+    return this._vad;
+  }
+
+  set vad(v) {
+    this._vad = v;
+  }
+
   /**
    * indicates whether the call currently in progress
    */

@@ -15,7 +15,8 @@ class TaskConfig extends Task {
       'transcribe',
       'fillerNoise',
       'actionHookDelayAction',
-      'boostAudioSignal'
+      'boostAudioSignal',
+      'vad'
     ].forEach((k) => this[k] = this.data[k] || {});
 
     if ('notifyEvents' in this.data) {
@@ -70,6 +71,7 @@ class TaskConfig extends Task {
   get hasListen() { return Object.keys(this.listen).length; }
   get hasTranscribe() { return Object.keys(this.transcribe).length; }
   get hasDub() { return Object.keys(this.dub).length; }
+  get hasVad() { return Object.keys(this.vad).length; }
   get hasFillerNoise() { return Object.keys(this.fillerNoise).length; }
 
   get summary() {
@@ -287,6 +289,16 @@ class TaskConfig extends Task {
         cs.enableFillerNoise(opts);
       }
     }
+    if (this.hasVad) {
+      cs.vad = {
+        enable: this.vad.enable || false,
+        voiceMs: this.vad.voiceMs || 250,
+        silenceMs: this.vad.silenceMs || 150,
+        strategy: this.vad.strategy || 'one-shot',
+        mode: this.vad.mode || 2
+      };
+    }
+
   }
 
   async kill(cs) {

@@ -10,7 +10,8 @@ const {
   IbmTranscriptionEvents,
   NvidiaTranscriptionEvents,
   JambonzTranscriptionEvents,
-  AssemblyAiTranscriptionEvents
+  AssemblyAiTranscriptionEvents,
+  VadDetection
 } = require('../utils/constants.json');
 const {
   JAMBONES_GATHER_EARLY_HINTS_MATCH,
@@ -27,7 +28,7 @@ class TaskGather extends SttTask {
     [
       'finishOnKey', 'input', 'numDigits', 'minDigits', 'maxDigits',
       'interDigitTimeout', 'partialResultHook', 'bargein', 'dtmfBargein',
-      'speechTimeout', 'timeout', 'say', 'play', 'actionHookDelayAction', 'fillerNoise'
+      'speechTimeout', 'timeout', 'say', 'play', 'actionHookDelayAction', 'fillerNoise', 'vad'
     ].forEach((k) => this[k] = this.data[k]);
 
     // gather default input is digits
@@ -41,7 +42,8 @@ class TaskGather extends SttTask {
     this.timeout = this.timeout === 0 ? 0 : (this.timeout || 15) * 1000;
     this.interim = !!this.partialResultHook || this.bargein || (this.timeout > 0);
     this.listenDuringPrompt = this.data.listenDuringPrompt === false ? false : true;
-    this.minBargeinWordCount = this.data.minBargeinWordCount || 1;
+    this.minBargeinWordCount = this.data.minBargeinWordCount !== undefined ? this.data.minBargeinWordCount : 1;
+    this._vadEnabled = this.minBargeinWordCount === 0;
     if (this.data.recognizer) {
       /* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
       this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ?
@@ -128,6 +130,11 @@
       ...(this.fillerNoise || {})
     };
 
+    this.vad = {
+      ...(cs.vad || {}),
+      ...(this.vad || {})
+    };
+
     if (cs.hasGlobalSttHints && !this.maskGlobalSttHints) {
       const {hints, hintsBoost} = cs.globalSttHints;
       const setOfHints = new Set((this.data.recognizer.hints || [])
@@ -178,6 +185,8 @@
         retries: this._hookDelayRetries
       };
 
+    this._startVad();
+
     const startListening = async(cs, ep) => {
       this._startTimer();
       if (this.isContinuousAsr && 0 === this.timeout) this._startAsrTimer();
@@ -201,6 +210,7 @@
       const {span, ctx} = this.startChildSpan(`nested:${this.sayTask.summary}`);
       const process = () => {
         this.logger.debug('Gather: nested say task completed');
+        this._stopVad();
         if (!this.killed) {
           startListening(cs, ep);
           if (this.input.includes('speech') && this.vendor === 'nuance' && this.listenDuringPrompt) {
@@ -227,6 +237,7 @@
       const {span, ctx} = this.startChildSpan(`nested:${this.playTask.summary}`);
       const process = () => {
         this.logger.debug('Gather: nested play task completed');
+        this._stopVad();
         if (!this.killed) {
           startListening(cs, ep);
           if (this.input.includes('speech') && this.vendor === 'nuance' && this.listenDuringPrompt) {
@@ -291,6 +302,7 @@
     this._clearAsrTimer();
     this.playTask?.span.end();
     this.sayTask?.span.end();
+    this._stopVad();
     this._resolve('killed');
   }
@@ -368,15 +380,12 @@
           ep, GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
         this.addCustomEventListener(
           ep, GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
-        this.addCustomEventListener(
-          ep, GoogleTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
         break;
       case 'aws':
       case 'polly':
         this.bugname = `${this.bugname_prefix}aws_transcribe`;
         this.addCustomEventListener(ep, AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
-        this.addCustomEventListener(ep, AwsTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
         break;
       case 'microsoft':
         this.bugname = `${this.bugname_prefix}azure_transcribe`;
@@ -384,7 +393,6 @@
           ep, AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
         //this.addCustomEventListener(ep, AzureTranscriptionEvents.NoSpeechDetected,
         //this._onNoSpeechDetected.bind(this, cs, ep));
-        this.addCustomEventListener(ep, AzureTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
         break;
       case 'nuance':
         this.bugname = `${this.bugname_prefix}nuance_transcribe`;
@@ -394,8 +402,6 @@
           this._onStartOfSpeech.bind(this, cs, ep));
         this.addCustomEventListener(ep, NuanceTranscriptionEvents.TranscriptionComplete,
           this._onTranscriptionComplete.bind(this, cs, ep));
-        this.addCustomEventListener(ep, NuanceTranscriptionEvents.VadDetected,
-          this._onVadDetected.bind(this, cs, ep));
 
         /* stall timers until prompt finishes playing */
         if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
@@ -465,8 +471,6 @@
           this._onStartOfSpeech.bind(this, cs, ep));
         this.addCustomEventListener(ep, NvidiaTranscriptionEvents.TranscriptionComplete,
           this._onTranscriptionComplete.bind(this, cs, ep));
-        this.addCustomEventListener(ep, NvidiaTranscriptionEvents.VadDetected,
-          this._onVadDetected.bind(this, cs, ep));
 
         /* I think nvidia has this (??) - stall timers until prompt finishes playing */
         if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
@@ -704,6 +708,25 @@
     this._finalAsrTimer = null;
   }
 
+  _startVad() {
+    if (!this._vadStarted && this._vadEnabled) {
+      this.logger.debug('_startVad');
+      this.addCustomEventListener(this.ep, VadDetection.Detection, this._onVadDetected.bind(this, this.cs, this.ep));
+      this.ep?.startVadDetection(this.vad);
+      this._vadStarted = true;
+    }
+  }
+
+  _stopVad() {
+    if (this._vadStarted) {
+      this.logger.debug('_stopVad');
+      this.ep?.stopVadDetection(this.vad);
+      this.ep?.removeCustomEventListener(VadDetection.Detection, this._onVadDetected);
+      this._vadStarted = false;
+    }
+  }
+
   _startFillerNoise() {
     this.logger.debug('Gather:_startFillerNoise - playing filler noise');
     this.ep?.play(this.fillerNoise.url);
@@ -1039,6 +1062,10 @@
       this._killAudio(cs);
       this.emit('vad');
     }
+    if (this.vad?.strategy === 'one-shot') {
+      this.ep?.removeCustomEventListener(VadDetection.Detection, this._onVadDetected);
+      this._vadStarted = false;
+    }
   }
 
   _onNoSpeechDetected(cs, ep, evt, fsEvent) {
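
Taken together, the gather changes above amount to the following start/stop lifecycle. This is a simplified, illustrative sketch rather than the committed code; it assumes the startVadDetection/stopVadDetection endpoint methods come from the drachtio-fsmrf update mentioned in the commit message, and it uses the event name added to constants.json below.

// Simplified sketch of the VAD lifecycle used by the gather task (illustrative only).
const VAD_EVENT = 'vad_detect:detection';   // VadDetection.Detection in constants.json

const startVad = (ep, vad, onDetected) => {
  // listen for the custom FreeSWITCH event emitted by mod_vad_detect, then start detection
  ep.addCustomEventListener(VAD_EVENT, onDetected);
  ep.startVadDetection(vad);                // e.g. {enable: true, voiceMs: 250, silenceMs: 150, strategy: 'one-shot', mode: 2}
};

const stopVad = (ep, vad, onDetected) => {
  // invoked when a nested say/play prompt completes ("stopvad if playdone") or the task is killed
  ep.stopVadDetection(vad);
  ep.removeCustomEventListener(VAD_EVENT, onDetected);
};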

@@ -134,6 +134,9 @@
     "ConnectFailure": "assemblyai_transcribe::connect_failed",
     "Connect": "assemblyai_transcribe::connect"
   },
+  "VadDetection": {
+    "Detection": "vad_detect:detection"
+  },
   "ListenEvents": {
     "Connect": "mod_audio_fork::connect",
     "ConnectFailure": "mod_audio_fork::connect_failed",

@@ -474,18 +474,8 @@ module.exports = (logger) => {
 const setChannelVarsForStt = (task, sttCredentials, language, rOpts = {}) => {
   let opts = {};
-  const {enable, voiceMs = 0, mode = -1} = rOpts.vad || {};
-  const vad = {enable, voiceMs, mode};
   const vendor = rOpts.vendor;
-
-  /* voice activity detection works across vendors */
-  opts = {
-    ...opts,
-    ...(vad.enable && {START_RECOGNIZING_ON_VAD: 1}),
-    ...(vad.enable && vad.voiceMs && {RECOGNIZER_VAD_VOICE_MS: vad.voiceMs}),
-    ...(vad.enable && typeof vad.mode === 'number' && {RECOGNIZER_VAD_MODE: vad.mode}),
-  };
-
   if ('google' === vendor) {
     const useV2 = rOpts.googleOptions?.serviceVersion === 'v2';
     const model = task.name === TaskName.Gather ?

package-lock.json (generated)

@@ -18,7 +18,7 @@
         "@jambonz/speech-utils": "^0.1.3",
         "@jambonz/stats-collector": "^0.1.10",
         "@jambonz/time-series": "^0.2.8",
-        "@jambonz/verb-specifications": "^0.0.69",
+        "@jambonz/verb-specifications": "^0.0.71",
         "@opentelemetry/api": "^1.8.0",
         "@opentelemetry/exporter-jaeger": "^1.23.0",
         "@opentelemetry/exporter-trace-otlp-http": "^0.50.0",
@@ -2360,9 +2360,9 @@
       }
     },
     "node_modules/@jambonz/verb-specifications": {
-      "version": "0.0.69",
-      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.69.tgz",
-      "integrity": "sha512-DWnz7XRkCzpzyCVJH7NtScv+wSlUC414/EO8j/gPZs3RT4WBW1OBXwXpfjURHcSrDG7lycz+tfA+2WoUdW/W+g==",
+      "version": "0.0.71",
+      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.71.tgz",
+      "integrity": "sha512-e4f7zbSncuh4cVtEg0DlGBp60B6d9SMxa0sI+bgIWLq9oRfvziL2Afb0od/a8AiPgDmIxBp6a3IoXcOy9gNCxw==",
       "dependencies": {
         "debug": "^4.3.4",
         "pino": "^8.8.0"
@@ -11992,9 +11992,9 @@
       }
     },
     "@jambonz/verb-specifications": {
-      "version": "0.0.69",
-      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.69.tgz",
-      "integrity": "sha512-DWnz7XRkCzpzyCVJH7NtScv+wSlUC414/EO8j/gPZs3RT4WBW1OBXwXpfjURHcSrDG7lycz+tfA+2WoUdW/W+g==",
+      "version": "0.0.71",
+      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.71.tgz",
+      "integrity": "sha512-e4f7zbSncuh4cVtEg0DlGBp60B6d9SMxa0sI+bgIWLq9oRfvziL2Afb0od/a8AiPgDmIxBp6a3IoXcOy9gNCxw==",
       "requires": {
         "debug": "^4.3.4",
         "pino": "^8.8.0"

package.json

@@ -34,7 +34,7 @@
     "@jambonz/speech-utils": "^0.1.3",
     "@jambonz/stats-collector": "^0.1.10",
     "@jambonz/time-series": "^0.2.8",
-    "@jambonz/verb-specifications": "^0.0.69",
+    "@jambonz/verb-specifications": "^0.0.71",
     "@opentelemetry/api": "^1.8.0",
     "@opentelemetry/exporter-jaeger": "^1.23.0",
     "@opentelemetry/exporter-trace-otlp-http": "^0.50.0",