mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 16:50:39 +00:00
support mod_vad_detect (#762)
* support mod_vad_detect * wip * update verb spec and drachtio fsmrf * Update example-voicemail-greetings.json (#761) Update voicemail english greetings * wip * stopvad if playdone --------- Co-authored-by: Vinod Dharashive <vdharashive@gmail.com>
This commit is contained in:
@@ -338,6 +338,17 @@ class CallSession extends Emitter {
|
|||||||
this.application.fallback_speech_recognizer_language = language;
|
this.application.fallback_speech_recognizer_language = language;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Vad
|
||||||
|
*/
|
||||||
|
get vad() {
|
||||||
|
return this._vad;
|
||||||
|
}
|
||||||
|
|
||||||
|
set vad(v) {
|
||||||
|
this._vad = v;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* indicates whether the call currently in progress
|
* indicates whether the call currently in progress
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -15,7 +15,8 @@ class TaskConfig extends Task {
|
|||||||
'transcribe',
|
'transcribe',
|
||||||
'fillerNoise',
|
'fillerNoise',
|
||||||
'actionHookDelayAction',
|
'actionHookDelayAction',
|
||||||
'boostAudioSignal'
|
'boostAudioSignal',
|
||||||
|
'vad'
|
||||||
].forEach((k) => this[k] = this.data[k] || {});
|
].forEach((k) => this[k] = this.data[k] || {});
|
||||||
|
|
||||||
if ('notifyEvents' in this.data) {
|
if ('notifyEvents' in this.data) {
|
||||||
@@ -70,6 +71,7 @@ class TaskConfig extends Task {
|
|||||||
get hasListen() { return Object.keys(this.listen).length; }
|
get hasListen() { return Object.keys(this.listen).length; }
|
||||||
get hasTranscribe() { return Object.keys(this.transcribe).length; }
|
get hasTranscribe() { return Object.keys(this.transcribe).length; }
|
||||||
get hasDub() { return Object.keys(this.dub).length; }
|
get hasDub() { return Object.keys(this.dub).length; }
|
||||||
|
get hasVad() { return Object.keys(this.vad).length; }
|
||||||
get hasFillerNoise() { return Object.keys(this.fillerNoise).length; }
|
get hasFillerNoise() { return Object.keys(this.fillerNoise).length; }
|
||||||
|
|
||||||
get summary() {
|
get summary() {
|
||||||
@@ -287,6 +289,16 @@ class TaskConfig extends Task {
|
|||||||
cs.enableFillerNoise(opts);
|
cs.enableFillerNoise(opts);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (this.hasVad) {
|
||||||
|
cs.vad = {
|
||||||
|
enable: this.vad.enable || false,
|
||||||
|
voiceMs: this.vad.voiceMs || 250,
|
||||||
|
silenceMs: this.vad.silenceMs || 150,
|
||||||
|
strategy: this.vad.strategy || 'one-shot',
|
||||||
|
mode: this.vad.mode ?? 2 // option is named 'mode' (was misspelled 'mod', so it was never read); ?? keeps an explicit 0
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async kill(cs) {
|
async kill(cs) {
|
||||||
|
|||||||
@@ -10,7 +10,8 @@ const {
|
|||||||
IbmTranscriptionEvents,
|
IbmTranscriptionEvents,
|
||||||
NvidiaTranscriptionEvents,
|
NvidiaTranscriptionEvents,
|
||||||
JambonzTranscriptionEvents,
|
JambonzTranscriptionEvents,
|
||||||
AssemblyAiTranscriptionEvents
|
AssemblyAiTranscriptionEvents,
|
||||||
|
VadDetection
|
||||||
} = require('../utils/constants.json');
|
} = require('../utils/constants.json');
|
||||||
const {
|
const {
|
||||||
JAMBONES_GATHER_EARLY_HINTS_MATCH,
|
JAMBONES_GATHER_EARLY_HINTS_MATCH,
|
||||||
@@ -27,7 +28,7 @@ class TaskGather extends SttTask {
|
|||||||
[
|
[
|
||||||
'finishOnKey', 'input', 'numDigits', 'minDigits', 'maxDigits',
|
'finishOnKey', 'input', 'numDigits', 'minDigits', 'maxDigits',
|
||||||
'interDigitTimeout', 'partialResultHook', 'bargein', 'dtmfBargein',
|
'interDigitTimeout', 'partialResultHook', 'bargein', 'dtmfBargein',
|
||||||
'speechTimeout', 'timeout', 'say', 'play', 'actionHookDelayAction', 'fillerNoise'
|
'speechTimeout', 'timeout', 'say', 'play', 'actionHookDelayAction', 'fillerNoise', 'vad'
|
||||||
].forEach((k) => this[k] = this.data[k]);
|
].forEach((k) => this[k] = this.data[k]);
|
||||||
|
|
||||||
// gather default input is digits
|
// gather default input is digits
|
||||||
@@ -41,7 +42,8 @@ class TaskGather extends SttTask {
|
|||||||
this.timeout = this.timeout === 0 ? 0 : (this.timeout || 15) * 1000;
|
this.timeout = this.timeout === 0 ? 0 : (this.timeout || 15) * 1000;
|
||||||
this.interim = !!this.partialResultHook || this.bargein || (this.timeout > 0);
|
this.interim = !!this.partialResultHook || this.bargein || (this.timeout > 0);
|
||||||
this.listenDuringPrompt = this.data.listenDuringPrompt === false ? false : true;
|
this.listenDuringPrompt = this.data.listenDuringPrompt === false ? false : true;
|
||||||
this.minBargeinWordCount = this.data.minBargeinWordCount || 1;
|
this.minBargeinWordCount = this.data.minBargeinWordCount !== undefined ? this.data.minBargeinWordCount : 1;
|
||||||
|
this._vadEnabled = this.minBargeinWordCount === 0;
|
||||||
if (this.data.recognizer) {
|
if (this.data.recognizer) {
|
||||||
/* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
|
/* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
|
||||||
this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ?
|
this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ?
|
||||||
@@ -128,6 +130,11 @@ class TaskGather extends SttTask {
|
|||||||
...(this.fillerNoise || {})
|
...(this.fillerNoise || {})
|
||||||
};
|
};
|
||||||
|
|
||||||
|
this.vad = {
|
||||||
|
...(cs.vad || {}),
|
||||||
|
...(this.vad || {})
|
||||||
|
};
|
||||||
|
|
||||||
if (cs.hasGlobalSttHints && !this.maskGlobalSttHints) {
|
if (cs.hasGlobalSttHints && !this.maskGlobalSttHints) {
|
||||||
const {hints, hintsBoost} = cs.globalSttHints;
|
const {hints, hintsBoost} = cs.globalSttHints;
|
||||||
const setOfHints = new Set((this.data.recognizer.hints || [])
|
const setOfHints = new Set((this.data.recognizer.hints || [])
|
||||||
@@ -178,6 +185,8 @@ class TaskGather extends SttTask {
|
|||||||
retries: this._hookDelayRetries
|
retries: this._hookDelayRetries
|
||||||
};
|
};
|
||||||
|
|
||||||
|
this._startVad();
|
||||||
|
|
||||||
const startListening = async(cs, ep) => {
|
const startListening = async(cs, ep) => {
|
||||||
this._startTimer();
|
this._startTimer();
|
||||||
if (this.isContinuousAsr && 0 === this.timeout) this._startAsrTimer();
|
if (this.isContinuousAsr && 0 === this.timeout) this._startAsrTimer();
|
||||||
@@ -201,6 +210,7 @@ class TaskGather extends SttTask {
|
|||||||
const {span, ctx} = this.startChildSpan(`nested:${this.sayTask.summary}`);
|
const {span, ctx} = this.startChildSpan(`nested:${this.sayTask.summary}`);
|
||||||
const process = () => {
|
const process = () => {
|
||||||
this.logger.debug('Gather: nested say task completed');
|
this.logger.debug('Gather: nested say task completed');
|
||||||
|
this._stopVad();
|
||||||
if (!this.killed) {
|
if (!this.killed) {
|
||||||
startListening(cs, ep);
|
startListening(cs, ep);
|
||||||
if (this.input.includes('speech') && this.vendor === 'nuance' && this.listenDuringPrompt) {
|
if (this.input.includes('speech') && this.vendor === 'nuance' && this.listenDuringPrompt) {
|
||||||
@@ -227,6 +237,7 @@ class TaskGather extends SttTask {
|
|||||||
const {span, ctx} = this.startChildSpan(`nested:${this.playTask.summary}`);
|
const {span, ctx} = this.startChildSpan(`nested:${this.playTask.summary}`);
|
||||||
const process = () => {
|
const process = () => {
|
||||||
this.logger.debug('Gather: nested play task completed');
|
this.logger.debug('Gather: nested play task completed');
|
||||||
|
this._stopVad();
|
||||||
if (!this.killed) {
|
if (!this.killed) {
|
||||||
startListening(cs, ep);
|
startListening(cs, ep);
|
||||||
if (this.input.includes('speech') && this.vendor === 'nuance' && this.listenDuringPrompt) {
|
if (this.input.includes('speech') && this.vendor === 'nuance' && this.listenDuringPrompt) {
|
||||||
@@ -291,6 +302,7 @@ class TaskGather extends SttTask {
|
|||||||
this._clearAsrTimer();
|
this._clearAsrTimer();
|
||||||
this.playTask?.span.end();
|
this.playTask?.span.end();
|
||||||
this.sayTask?.span.end();
|
this.sayTask?.span.end();
|
||||||
|
this._stopVad();
|
||||||
this._resolve('killed');
|
this._resolve('killed');
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -368,15 +380,12 @@ class TaskGather extends SttTask {
|
|||||||
ep, GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
ep, GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
||||||
this.addCustomEventListener(
|
this.addCustomEventListener(
|
||||||
ep, GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
|
ep, GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
|
||||||
this.addCustomEventListener(
|
|
||||||
ep, GoogleTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'aws':
|
case 'aws':
|
||||||
case 'polly':
|
case 'polly':
|
||||||
this.bugname = `${this.bugname_prefix}aws_transcribe`;
|
this.bugname = `${this.bugname_prefix}aws_transcribe`;
|
||||||
this.addCustomEventListener(ep, AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
this.addCustomEventListener(ep, AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
||||||
this.addCustomEventListener(ep, AwsTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
|
|
||||||
break;
|
break;
|
||||||
case 'microsoft':
|
case 'microsoft':
|
||||||
this.bugname = `${this.bugname_prefix}azure_transcribe`;
|
this.bugname = `${this.bugname_prefix}azure_transcribe`;
|
||||||
@@ -384,7 +393,6 @@ class TaskGather extends SttTask {
|
|||||||
ep, AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
ep, AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
||||||
//this.addCustomEventListener(ep, AzureTranscriptionEvents.NoSpeechDetected,
|
//this.addCustomEventListener(ep, AzureTranscriptionEvents.NoSpeechDetected,
|
||||||
//this._onNoSpeechDetected.bind(this, cs, ep));
|
//this._onNoSpeechDetected.bind(this, cs, ep));
|
||||||
this.addCustomEventListener(ep, AzureTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
|
|
||||||
break;
|
break;
|
||||||
case 'nuance':
|
case 'nuance':
|
||||||
this.bugname = `${this.bugname_prefix}nuance_transcribe`;
|
this.bugname = `${this.bugname_prefix}nuance_transcribe`;
|
||||||
@@ -394,8 +402,6 @@ class TaskGather extends SttTask {
|
|||||||
this._onStartOfSpeech.bind(this, cs, ep));
|
this._onStartOfSpeech.bind(this, cs, ep));
|
||||||
this.addCustomEventListener(ep, NuanceTranscriptionEvents.TranscriptionComplete,
|
this.addCustomEventListener(ep, NuanceTranscriptionEvents.TranscriptionComplete,
|
||||||
this._onTranscriptionComplete.bind(this, cs, ep));
|
this._onTranscriptionComplete.bind(this, cs, ep));
|
||||||
this.addCustomEventListener(ep, NuanceTranscriptionEvents.VadDetected,
|
|
||||||
this._onVadDetected.bind(this, cs, ep));
|
|
||||||
|
|
||||||
/* stall timers until prompt finishes playing */
|
/* stall timers until prompt finishes playing */
|
||||||
if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
|
if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
|
||||||
@@ -465,8 +471,6 @@ class TaskGather extends SttTask {
|
|||||||
this._onStartOfSpeech.bind(this, cs, ep));
|
this._onStartOfSpeech.bind(this, cs, ep));
|
||||||
this.addCustomEventListener(ep, NvidiaTranscriptionEvents.TranscriptionComplete,
|
this.addCustomEventListener(ep, NvidiaTranscriptionEvents.TranscriptionComplete,
|
||||||
this._onTranscriptionComplete.bind(this, cs, ep));
|
this._onTranscriptionComplete.bind(this, cs, ep));
|
||||||
this.addCustomEventListener(ep, NvidiaTranscriptionEvents.VadDetected,
|
|
||||||
this._onVadDetected.bind(this, cs, ep));
|
|
||||||
|
|
||||||
/* I think nvidia has this (??) - stall timers until prompt finishes playing */
|
/* I think nvidia has this (??) - stall timers until prompt finishes playing */
|
||||||
if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
|
if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
|
||||||
@@ -704,6 +708,25 @@ class TaskGather extends SttTask {
|
|||||||
this._finalAsrTimer = null;
|
this._finalAsrTimer = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
_startVad() {
|
||||||
|
if (!this._vadStarted && this._vadEnabled) {
|
||||||
|
this.logger.debug('_startVad');
|
||||||
|
this.addCustomEventListener(this.ep, VadDetection.Detection, this._onVadDetected.bind(this, this.cs, this.ep));
|
||||||
|
this.ep?.startVadDetection(this.vad);
|
||||||
|
this._vadStarted = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
_stopVad() {
|
||||||
|
if (this._vadStarted) {
|
||||||
|
this.logger.debug('_stopVad');
|
||||||
|
this.ep?.stopVadDetection(this.vad);
|
||||||
|
this.ep?.removeCustomEventListener(VadDetection.Detection, this._onVadDetected);
|
||||||
|
this._vadStarted = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
_startFillerNoise() {
|
_startFillerNoise() {
|
||||||
this.logger.debug('Gather:_startFillerNoise - playing filler noise');
|
this.logger.debug('Gather:_startFillerNoise - playing filler noise');
|
||||||
this.ep?.play(this.fillerNoise.url);
|
this.ep?.play(this.fillerNoise.url);
|
||||||
@@ -1039,6 +1062,10 @@ class TaskGather extends SttTask {
|
|||||||
this._killAudio(cs);
|
this._killAudio(cs);
|
||||||
this.emit('vad');
|
this.emit('vad');
|
||||||
}
|
}
|
||||||
|
if (this.vad?.strategy === 'one-shot') {
|
||||||
|
this.ep?.removeCustomEventListener(VadDetection.Detection, this._onVadDetected);
|
||||||
|
this._vadStarted = false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
_onNoSpeechDetected(cs, ep, evt, fsEvent) {
|
_onNoSpeechDetected(cs, ep, evt, fsEvent) {
|
||||||
|
|||||||
@@ -134,6 +134,9 @@
|
|||||||
"ConnectFailure": "assemblyai_transcribe::connect_failed",
|
"ConnectFailure": "assemblyai_transcribe::connect_failed",
|
||||||
"Connect": "assemblyai_transcribe::connect"
|
"Connect": "assemblyai_transcribe::connect"
|
||||||
},
|
},
|
||||||
|
"VadDetection": {
|
||||||
|
"Detection": "vad_detect:detection"
|
||||||
|
},
|
||||||
"ListenEvents": {
|
"ListenEvents": {
|
||||||
"Connect": "mod_audio_fork::connect",
|
"Connect": "mod_audio_fork::connect",
|
||||||
"ConnectFailure": "mod_audio_fork::connect_failed",
|
"ConnectFailure": "mod_audio_fork::connect_failed",
|
||||||
|
|||||||
@@ -474,18 +474,8 @@ module.exports = (logger) => {
|
|||||||
|
|
||||||
const setChannelVarsForStt = (task, sttCredentials, language, rOpts = {}) => {
|
const setChannelVarsForStt = (task, sttCredentials, language, rOpts = {}) => {
|
||||||
let opts = {};
|
let opts = {};
|
||||||
const {enable, voiceMs = 0, mode = -1} = rOpts.vad || {};
|
|
||||||
const vad = {enable, voiceMs, mode};
|
|
||||||
const vendor = rOpts.vendor;
|
const vendor = rOpts.vendor;
|
||||||
|
|
||||||
/* voice activity detection works across vendors */
|
|
||||||
opts = {
|
|
||||||
...opts,
|
|
||||||
...(vad.enable && {START_RECOGNIZING_ON_VAD: 1}),
|
|
||||||
...(vad.enable && vad.voiceMs && {RECOGNIZER_VAD_VOICE_MS: vad.voiceMs}),
|
|
||||||
...(vad.enable && typeof vad.mode === 'number' && {RECOGNIZER_VAD_MODE: vad.mode}),
|
|
||||||
};
|
|
||||||
|
|
||||||
if ('google' === vendor) {
|
if ('google' === vendor) {
|
||||||
const useV2 = rOpts.googleOptions?.serviceVersion === 'v2';
|
const useV2 = rOpts.googleOptions?.serviceVersion === 'v2';
|
||||||
const model = task.name === TaskName.Gather ?
|
const model = task.name === TaskName.Gather ?
|
||||||
|
|||||||
14
package-lock.json
generated
14
package-lock.json
generated
@@ -18,7 +18,7 @@
|
|||||||
"@jambonz/speech-utils": "^0.1.3",
|
"@jambonz/speech-utils": "^0.1.3",
|
||||||
"@jambonz/stats-collector": "^0.1.10",
|
"@jambonz/stats-collector": "^0.1.10",
|
||||||
"@jambonz/time-series": "^0.2.8",
|
"@jambonz/time-series": "^0.2.8",
|
||||||
"@jambonz/verb-specifications": "^0.0.69",
|
"@jambonz/verb-specifications": "^0.0.71",
|
||||||
"@opentelemetry/api": "^1.8.0",
|
"@opentelemetry/api": "^1.8.0",
|
||||||
"@opentelemetry/exporter-jaeger": "^1.23.0",
|
"@opentelemetry/exporter-jaeger": "^1.23.0",
|
||||||
"@opentelemetry/exporter-trace-otlp-http": "^0.50.0",
|
"@opentelemetry/exporter-trace-otlp-http": "^0.50.0",
|
||||||
@@ -2360,9 +2360,9 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@jambonz/verb-specifications": {
|
"node_modules/@jambonz/verb-specifications": {
|
||||||
"version": "0.0.69",
|
"version": "0.0.71",
|
||||||
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.69.tgz",
|
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.71.tgz",
|
||||||
"integrity": "sha512-DWnz7XRkCzpzyCVJH7NtScv+wSlUC414/EO8j/gPZs3RT4WBW1OBXwXpfjURHcSrDG7lycz+tfA+2WoUdW/W+g==",
|
"integrity": "sha512-e4f7zbSncuh4cVtEg0DlGBp60B6d9SMxa0sI+bgIWLq9oRfvziL2Afb0od/a8AiPgDmIxBp6a3IoXcOy9gNCxw==",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"debug": "^4.3.4",
|
"debug": "^4.3.4",
|
||||||
"pino": "^8.8.0"
|
"pino": "^8.8.0"
|
||||||
@@ -11992,9 +11992,9 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"@jambonz/verb-specifications": {
|
"@jambonz/verb-specifications": {
|
||||||
"version": "0.0.69",
|
"version": "0.0.71",
|
||||||
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.69.tgz",
|
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.71.tgz",
|
||||||
"integrity": "sha512-DWnz7XRkCzpzyCVJH7NtScv+wSlUC414/EO8j/gPZs3RT4WBW1OBXwXpfjURHcSrDG7lycz+tfA+2WoUdW/W+g==",
|
"integrity": "sha512-e4f7zbSncuh4cVtEg0DlGBp60B6d9SMxa0sI+bgIWLq9oRfvziL2Afb0od/a8AiPgDmIxBp6a3IoXcOy9gNCxw==",
|
||||||
"requires": {
|
"requires": {
|
||||||
"debug": "^4.3.4",
|
"debug": "^4.3.4",
|
||||||
"pino": "^8.8.0"
|
"pino": "^8.8.0"
|
||||||
|
|||||||
@@ -34,7 +34,7 @@
|
|||||||
"@jambonz/speech-utils": "^0.1.3",
|
"@jambonz/speech-utils": "^0.1.3",
|
||||||
"@jambonz/stats-collector": "^0.1.10",
|
"@jambonz/stats-collector": "^0.1.10",
|
||||||
"@jambonz/time-series": "^0.2.8",
|
"@jambonz/time-series": "^0.2.8",
|
||||||
"@jambonz/verb-specifications": "^0.0.69",
|
"@jambonz/verb-specifications": "^0.0.71",
|
||||||
"@opentelemetry/api": "^1.8.0",
|
"@opentelemetry/api": "^1.8.0",
|
||||||
"@opentelemetry/exporter-jaeger": "^1.23.0",
|
"@opentelemetry/exporter-jaeger": "^1.23.0",
|
||||||
"@opentelemetry/exporter-trace-otlp-http": "^0.50.0",
|
"@opentelemetry/exporter-trace-otlp-http": "^0.50.0",
|
||||||
|
|||||||
Reference in New Issue
Block a user