mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2026-07-04 19:32:01 +00:00
wip
This commit is contained in:
+58
-8
@@ -593,6 +593,8 @@ class TaskGather extends SttTask {
|
||||
this._onVendorConnectFailure.bind(this, cs, ep));
|
||||
this.addCustomEventListener(ep, HoundifyTranscriptionEvents.Connect,
|
||||
this._onVendorConnect.bind(this, cs, ep));
|
||||
this.addCustomEventListener(ep, HoundifyTranscriptionEvents.EoqVadTriggered,
|
||||
this._onHoundifyEoqVadTriggered.bind(this, cs, ep));
|
||||
break;
|
||||
|
||||
case 'voxist':
|
||||
@@ -760,14 +762,23 @@ class TaskGather extends SttTask {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Handler for the Houndify EoqVadTriggered event.
 * Starts the ASR timer via _startAsrTimer() unless this._asrTimer is already set.
 * None of the parameters are read by this handler.
 *
 * @param {object} _cs - unused
 * @param {object} _ep - unused
 * @param {object} _evt - unused
 * @param {object} _fsEvent - unused
 */
_onHoundifyEoqVadTriggered(_cs, _ep, _evt, _fsEvent) {
|
||||
// a timer is already pending: leave it running instead of re-arming it
if (this._asrTimer) return;
|
||||
this._startAsrTimer();
|
||||
}
|
||||
|
||||
_startAsrTimer() {
|
||||
// Deepgram has a case that UtteranceEnd is not sent to cover the last word end time.
|
||||
// So we need to wait for the asrTimeout to be sure that the last word is sent.
|
||||
// if (this.vendor === 'deepgram') return; // no need
|
||||
assert(this.isContinuousAsr);
|
||||
assert(this.isContinuousAsr || this.vendor === 'houndify');
|
||||
this._clearAsrTimer();
|
||||
const timeoutMs = this.asrTimeout || 500;
|
||||
this._asrTimer = setTimeout(() => {
|
||||
this.logger.info('_startAsrTimer - asr timer went off');
|
||||
|
||||
if (this.vendor === 'houndify') return this._sendHoundifyDone(this.ep);
|
||||
|
||||
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language, this.vendor);
|
||||
|
||||
/* special case for speechmatics - keep listening if we dont have any transcripts */
|
||||
@@ -777,8 +788,8 @@ class TaskGather extends SttTask {
|
||||
return;
|
||||
}
|
||||
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
|
||||
}, this.asrTimeout);
|
||||
this.logger.info(`_startAsrTimer: set for ${this.asrTimeout}ms`);
|
||||
}, timeoutMs);
|
||||
this.logger.info(`_startAsrTimer: set for ${timeoutMs}ms`);
|
||||
}
|
||||
|
||||
_clearAsrTimer() {
|
||||
@@ -982,6 +993,10 @@ class TaskGather extends SttTask {
|
||||
|
||||
let emptyTranscript = false;
|
||||
if (evt.is_final) {
|
||||
if (this.vendor === 'houndify') {
|
||||
this._clearAsrTimer();
|
||||
this._houndifyDoneSent = false;
|
||||
}
|
||||
if (evt.alternatives[0].transcript === '' && !this.callSession.callGone && !this.killed) {
|
||||
emptyTranscript = true;
|
||||
if (finished === 'true' &&
|
||||
@@ -990,6 +1005,19 @@ class TaskGather extends SttTask {
|
||||
this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
|
||||
return;
|
||||
}
|
||||
else if (this.vendor === 'houndify') {
|
||||
const segMode = this.data.recognizer?.houndifyOptions?.requestInfo?.segmentation?.mode !== 'none';
|
||||
if (!segMode && this._houndifyLastPartial) {
|
||||
this.logger.info({evt},
|
||||
'TaskGather:_onTranscription - empty FT from houndify (mode OFF), using lastPartial');
|
||||
} else if (segMode && this._bufferedTranscripts.length > 0) {
|
||||
this.logger.info({evt},
|
||||
'TaskGather:_onTranscription - empty FT from houndify (mode ON), using buffered segments');
|
||||
} else {
|
||||
this.logger.info({evt}, 'TaskGather:_onTranscription - empty FT from houndify, nothing buffered');
|
||||
return;
|
||||
}
|
||||
}
|
||||
else if (this.vendor !== 'deepgram') {
|
||||
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
|
||||
return;
|
||||
@@ -1031,10 +1059,7 @@ class TaskGather extends SttTask {
|
||||
}
|
||||
this._startAsrTimer();
|
||||
|
||||
/* some STT engines will keep listening after a final response, so no need to restart */
|
||||
if (!['soniox', 'aws', 'microsoft', 'deepgram', 'speechmatics', 'houndify', 'google', 'openai']
|
||||
.includes(this.vendor) &&
|
||||
!this.vendor.startsWith('custom')) {
|
||||
if (!this.doesVendorContinueListeningAfterFinalTranscript(this.vendor)) {
|
||||
this._startTranscribing(ep);
|
||||
}
|
||||
}
|
||||
@@ -1065,6 +1090,19 @@ class TaskGather extends SttTask {
|
||||
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language, this.vendor);
|
||||
this._bufferedTranscripts = [];
|
||||
}
|
||||
else if (this.vendor === 'houndify') {
|
||||
const segMode = this.data.recognizer?.houndifyOptions?.requestInfo?.segmentation?.mode !== 'none';
|
||||
if (segMode) {
|
||||
if (!emptyTranscript) this._bufferedTranscripts.push(evt);
|
||||
if (this._bufferedTranscripts.length === 0) return;
|
||||
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language, this.vendor);
|
||||
this._bufferedTranscripts = [];
|
||||
}
|
||||
else if (emptyTranscript && this._houndifyLastPartial) {
|
||||
evt = this._houndifyLastPartial;
|
||||
this._houndifyLastPartial = null;
|
||||
}
|
||||
}
|
||||
|
||||
/* here is where we return a final transcript */
|
||||
this._resolve('speech', evt);
|
||||
@@ -1074,7 +1112,19 @@ class TaskGather extends SttTask {
|
||||
else {
|
||||
/* deepgram can send a non-final transcript but with words that are final, so we need to buffer */
|
||||
let emptyTranscript = false;
|
||||
if (this.vendor === 'deepgram') {
|
||||
if (this.vendor === 'houndify') {
|
||||
const transcript = evt.alternatives[0]?.transcript;
|
||||
if (evt.is_partial_final && transcript) {
|
||||
/* mode segmentation ON: buffer FinalSegment + arm asrTimer (sends Done on expiry) */
|
||||
this._bufferedTranscripts.push(evt);
|
||||
this._startAsrTimer();
|
||||
} else if (transcript) {
|
||||
/* mode segmentation OFF: track last non-empty partial */
|
||||
this._houndifyLastPartial = evt;
|
||||
}
|
||||
if (!transcript) emptyTranscript = true;
|
||||
}
|
||||
else if (this.vendor === 'deepgram') {
|
||||
const originalEvent = evt.vendor.evt;
|
||||
if (originalEvent.is_final && evt.alternatives[0].transcript !== '') {
|
||||
this.logger.debug({evt}, 'Gather:_onTranscription - buffering a completed (partial) deepgram transcript');
|
||||
|
||||
+16
-2
@@ -417,7 +417,7 @@ class SttTask extends Task {
|
||||
return finalPrompt?.trimStart();
|
||||
}
|
||||
|
||||
/* some STT engines will keep listening after a final response, so no need to restart */
|
||||
/* some STT engines will keep listening after a final response, so no need to restart*/
|
||||
doesVendorContinueListeningAfterFinalTranscript(vendor) {
|
||||
return (vendor.startsWith('custom:') || [
|
||||
'soniox',
|
||||
@@ -427,10 +427,24 @@ class SttTask extends Task {
|
||||
'google',
|
||||
'speechmatics',
|
||||
'openai',
|
||||
'houndify',
|
||||
].includes(vendor));
|
||||
}
|
||||
|
||||
/**
 * Send Done to Houndify via FS API; FinalTranscript arrives via normal flow.
 *
 * The call is guarded by a "done already sent" flag so that it is issued at most
 * once while the flag stays set:
 *  - when `channel` is given, the flag is this._houndifyDoneSentByCh[channel - 1]
 *    (the array is created on first use as [false, false]);
 *  - when `channel` is undefined, the flag is this._houndifyDoneSent.
 * This method only ever sets these flags; it never clears them.
 *
 * @param {object} ep - endpoint; its `uuid` and `api()` members are used
 * @param {number} [channel] - channel number used as a 1-based index into the per-channel flags
 */
|
||||
_sendHoundifyDone(ep, channel) {
|
||||
if (channel !== undefined) {
|
||||
this._houndifyDoneSentByCh = this._houndifyDoneSentByCh || [false, false];
|
||||
if (this._houndifyDoneSentByCh[channel - 1]) return;
|
||||
this._houndifyDoneSentByCh[channel - 1] = true;
|
||||
} else {
|
||||
if (this._houndifyDoneSent) return;
|
||||
this._houndifyDoneSent = true;
|
||||
}
|
||||
this.logger.info(`houndify: sending Done via uuid_houndify_send_done ${ep.uuid}`);
|
||||
// not awaited: a rejected api() call is only logged, and the flag set above stays true
ep.api('uuid_houndify_send_done', `${ep.uuid} ${this.bugname}`)
|
||||
.catch((err) => this.logger.error({err}, 'uuid_houndify_send_done failed'));
|
||||
}
|
||||
|
||||
_onCompileContext(ep, key, evt) {
|
||||
const {addKey} = this.cs.srf.locals.dbHelpers;
|
||||
this.logger.debug({evt}, `received cobalt compile context event, will cache under ${key}`);
|
||||
|
||||
+77
-6
@@ -165,6 +165,8 @@ class TaskTranscribe extends SttTask {
|
||||
|
||||
async kill(cs) {
|
||||
super.kill(cs);
|
||||
this._clearAsrTimer(1);
|
||||
this._clearAsrTimer(2);
|
||||
const stopTranscription = this._stopTranscription();
|
||||
cs.stopSttLatencyVad();
|
||||
// hangup after 1 sec if we don't get a final transcription
|
||||
@@ -351,6 +353,8 @@ class TaskTranscribe extends SttTask {
|
||||
this._onVendorConnectFailure.bind(this, cs, ep, channel));
|
||||
this.addCustomEventListener(ep, HoundifyTranscriptionEvents.Connect,
|
||||
this._onVendorConnect.bind(this, cs, ep));
|
||||
this.addCustomEventListener(ep, HoundifyTranscriptionEvents.EoqVadTriggered,
|
||||
this._onHoundifyEoqVadTriggered.bind(this, cs, ep, channel));
|
||||
break;
|
||||
|
||||
case 'voxist':
|
||||
@@ -541,6 +545,19 @@ class TaskTranscribe extends SttTask {
|
||||
this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
|
||||
return;
|
||||
}
|
||||
else if (this.vendor === 'houndify') {
|
||||
const segMode = this.data.recognizer?.houndifyOptions?.requestInfo?.segmentation?.mode !== 'none';
|
||||
if (!segMode && this._houndifyLastPartial?.[channel - 1]) {
|
||||
this.logger.info({evt},
|
||||
'TaskTranscribe: empty FT from houndify (mode OFF), using lastPartial');
|
||||
} else if (segMode && bufferedTranscripts.length > 0) {
|
||||
this.logger.info({evt},
|
||||
'TaskTranscribe: empty FT from houndify (mode ON), using buffered segments');
|
||||
} else {
|
||||
this.logger.info({evt}, 'TaskTranscribe: empty FT from houndify, nothing buffered');
|
||||
return;
|
||||
}
|
||||
}
|
||||
else if (this.vendor !== 'deepgram') {
|
||||
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
|
||||
return;
|
||||
@@ -590,6 +607,22 @@ class TaskTranscribe extends SttTask {
|
||||
evt = this.consolidateTranscripts(bufferedTranscripts, channel, this.language);
|
||||
this._bufferedTranscripts[channel - 1] = [];
|
||||
}
|
||||
else if (this.vendor === 'houndify') {
|
||||
this._clearAsrTimer(channel);
|
||||
if (this._houndifyDoneSentByCh) this._houndifyDoneSentByCh[channel - 1] = false;
|
||||
this._houndifyDoneSent = false;
|
||||
const segMode = this.data.recognizer?.houndifyOptions?.requestInfo?.segmentation?.mode !== 'none';
|
||||
if (segMode) {
|
||||
if (!emptyTranscript) bufferedTranscripts.push(evt);
|
||||
if (bufferedTranscripts.length === 0) return;
|
||||
evt = this.consolidateTranscripts(bufferedTranscripts, channel, this.language);
|
||||
this._bufferedTranscripts[channel - 1] = [];
|
||||
}
|
||||
else if (emptyTranscript && this._houndifyLastPartial?.[channel - 1]) {
|
||||
evt = this._houndifyLastPartial[channel - 1];
|
||||
this._houndifyLastPartial[channel - 1] = null;
|
||||
}
|
||||
}
|
||||
|
||||
/* here is where we return a final transcript */
|
||||
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending final transcript');
|
||||
@@ -612,6 +645,18 @@ class TaskTranscribe extends SttTask {
|
||||
bufferedTranscripts.push(evt);
|
||||
}
|
||||
}
|
||||
else if (this.vendor === 'houndify') {
|
||||
const transcript = evt.alternatives[0]?.transcript;
|
||||
if (evt.is_partial_final && transcript) {
|
||||
/* mode segmentation ON: buffer FinalSegment + arm asrTimer */
|
||||
bufferedTranscripts.push(evt);
|
||||
this._startAsrTimer(channel);
|
||||
} else if (transcript) {
|
||||
/* mode segmentation OFF: track last non-empty partial per channel */
|
||||
this._houndifyLastPartial = this._houndifyLastPartial || [null, null];
|
||||
this._houndifyLastPartial[channel - 1] = evt;
|
||||
}
|
||||
}
|
||||
|
||||
if (this.interim) {
|
||||
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending interim transcript');
|
||||
@@ -889,23 +934,49 @@ class TaskTranscribe extends SttTask {
|
||||
this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
|
||||
}
|
||||
|
||||
/**
 * houndify mode=none: FS detected eoq/vad, arm asrTimer for this channel.
 *
 * Does nothing when a timer is already stored for the channel in this._asrTimers;
 * otherwise calls _startAsrTimer(channel). Only `channel` is read.
 *
 * @param {object} _cs - unused
 * @param {object} _ep - unused
 * @param {number} channel - channel number, used as a 1-based index into this._asrTimers
 * @param {object} _evt - unused
 * @param {object} _fsEvent - unused
 */
|
||||
_onHoundifyEoqVadTriggered(_cs, _ep, channel, _evt, _fsEvent) {
|
||||
if (this._asrTimers?.[channel - 1]) return; /* idempotent per channel */
|
||||
this._startAsrTimer(channel);
|
||||
}
|
||||
|
||||
/**
 * Arm the ASR timer for `channel`.
 *
 * Returns immediately for vendor 'deepgram'. Otherwise it asserts that continuous
 * ASR is enabled (or that the vendor is 'houndify'), clears any timer pending for
 * the channel, and schedules a new one for `this.asrTimeout || 500` ms.
 *
 * When the timer fires:
 *  - the stored timer reference is reset to null;
 *  - for 'houndify', _sendHoundifyDone() is called with this.ep2 for channel 2 and
 *    this.ep otherwise, and nothing is resolved here;
 *  - for other vendors, the channel's buffered transcripts are consolidated, the
 *    buffer is emptied, and _resolve(channel, evt) is called.
 *
 * Houndify timers are stored per channel in this._asrTimers; other vendors use the
 * single this._asrTimer.
 *
 * NOTE(review): this span appears to show the removed and the added lines of the
 * change together (two assert() calls, two setTimeout() openers and closers, two
 * "set for" log lines) - confirm against the repository file before relying on it.
 *
 * @param {number} channel - channel number, used as a 1-based index
 */
_startAsrTimer(channel) {
|
||||
if (this.vendor === 'deepgram') return; // no need
|
||||
assert(this.isContinuousAsr);
|
||||
assert(this.isContinuousAsr || this.vendor === 'houndify');
|
||||
this._clearAsrTimer(channel);
|
||||
this._asrTimer = setTimeout(() => {
|
||||
const timeoutMs = this.asrTimeout || 500;
|
||||
const timer = setTimeout(() => {
|
||||
this.logger.debug(`TaskTranscribe:_startAsrTimer - asr timer went off for channel: ${channel}`);
|
||||
if (this._asrTimers) this._asrTimers[channel - 1] = null;
|
||||
else this._asrTimer = null;
|
||||
|
||||
/* houndify: timer cue to send Done; wait for real FinalTranscript */
|
||||
if (this.vendor === 'houndify') return this._sendHoundifyDone(channel === 2 ? this.ep2 : this.ep, channel);
|
||||
|
||||
const evt = this.consolidateTranscripts(
|
||||
this._bufferedTranscripts[channel - 1], channel, this.language, this.vendor);
|
||||
this._bufferedTranscripts[channel - 1] = [];
|
||||
this._resolve(channel, evt);
|
||||
}, this.asrTimeout);
|
||||
this.logger.debug(`TaskTranscribe:_startAsrTimer: set for ${this.asrTimeout}ms for channel ${channel}`);
|
||||
}, timeoutMs);
|
||||
/* houndify: per-channel timers (stereo). Other vendors retain legacy single timer. */
|
||||
if (this.vendor === 'houndify') {
|
||||
this._asrTimers = this._asrTimers || [null, null];
|
||||
this._asrTimers[channel - 1] = timer;
|
||||
} else {
|
||||
this._asrTimer = timer;
|
||||
}
|
||||
this.logger.debug(`TaskTranscribe:_startAsrTimer: set for ${timeoutMs}ms for channel ${channel}`);
|
||||
}
|
||||
|
||||
/**
 * Cancel any pending ASR timer: the entry for `channel` in this._asrTimers (if
 * that array exists and the entry is set) and the single this._asrTimer.
 * Each cleared reference is reset to null.
 *
 * NOTE(review): this._asrTimer is cleared twice, by the first two statements and
 * again by the final if-block. This looks like the removed and the added lines of
 * the change shown together - confirm against the repository file.
 *
 * @param {number} channel - channel number, used as a 1-based index into this._asrTimers
 */
_clearAsrTimer(channel) {
|
||||
if (this._asrTimer) clearTimeout(this._asrTimer);
|
||||
this._asrTimer = null;
|
||||
if (this._asrTimers?.[channel - 1]) {
|
||||
clearTimeout(this._asrTimers[channel - 1]);
|
||||
this._asrTimers[channel - 1] = null;
|
||||
}
|
||||
if (this._asrTimer) {
|
||||
clearTimeout(this._asrTimer);
|
||||
this._asrTimer = null;
|
||||
}
|
||||
}
|
||||
|
||||
// We need to keep track of whether the fallback has happened for each endpoint
|
||||
|
||||
@@ -179,7 +179,8 @@
|
||||
"Transcription": "houndify_transcribe::transcription",
|
||||
"Error": "houndify_transcribe::error",
|
||||
"ConnectFailure": "houndify_transcribe::connect_failed",
|
||||
"Connect": "houndify_transcribe::connect"
|
||||
"Connect": "houndify_transcribe::connect",
|
||||
"EoqVadTriggered": "houndify_transcribe::eoq_vad_triggered"
|
||||
},
|
||||
"VoxistTranscriptionEvents": {
|
||||
"Transcription": "voxist_transcribe::transcription",
|
||||
|
||||
@@ -619,31 +619,32 @@ const normalizeHoundify = (evt, channel, language) => {
|
||||
const copy = JSON.parse(JSON.stringify(evt));
|
||||
const alternatives = [];
|
||||
let is_final = false;
|
||||
let is_partial_final = false;
|
||||
|
||||
if (evt.type) {
|
||||
/* WS API format: has "type" field (PartialTranscript, FinalSegmentTranscript, FinalTranscript, VadMessage) */
|
||||
/* WS API: PartialTranscript / FinalSegmentTranscript / FinalTranscript / VadMessage */
|
||||
if (evt.type === 'VadMessage') {
|
||||
/* VadMessage is not a transcription result - return empty alternatives so callers skip it */
|
||||
return {
|
||||
language_code: language,
|
||||
channel_tag: channel,
|
||||
is_final: false,
|
||||
is_partial_final: false,
|
||||
alternatives: [],
|
||||
vendor: {name: 'houndify', evt: copy}
|
||||
};
|
||||
}
|
||||
|
||||
is_final = evt.type === 'FinalSegmentTranscript' || evt.type === 'FinalTranscript';
|
||||
/* Only FinalTranscript ends the session; FinalSegmentTranscript is buffered. */
|
||||
is_final = evt.type === 'FinalTranscript';
|
||||
is_partial_final = evt.type === 'FinalSegmentTranscript';
|
||||
|
||||
if (evt.hypotheses && evt.hypotheses.length > 0) {
|
||||
/* FinalSegmentTranscript / FinalTranscript with hypotheses */
|
||||
const best = evt.hypotheses[0];
|
||||
alternatives.push({
|
||||
confidence: best.confidence || 0.0,
|
||||
transcript: best.text || evt.text || '',
|
||||
});
|
||||
} else if (evt.text !== undefined) {
|
||||
/* PartialTranscript */
|
||||
alternatives.push({
|
||||
confidence: evt.confidence || (evt.eoq !== undefined ? 1.0 - evt.eoq : 0.8),
|
||||
transcript: evt.text || '',
|
||||
@@ -666,6 +667,7 @@ const normalizeHoundify = (evt, channel, language) => {
|
||||
language_code: language,
|
||||
channel_tag: channel,
|
||||
is_final,
|
||||
is_partial_final,
|
||||
alternatives,
|
||||
vendor: {
|
||||
name: 'houndify',
|
||||
|
||||
Generated
+15
@@ -8133,6 +8133,21 @@
|
||||
"node": ">= 6.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/microsoft-cognitiveservices-speech-sdk/node_modules/utf-8-validate": {
|
||||
"version": "5.0.10",
|
||||
"resolved": "https://registry.npmjs.org/utf-8-validate/-/utf-8-validate-5.0.10.tgz",
|
||||
"integrity": "sha512-Z6czzLq4u8fPOyx7TU6X3dvUZVvoJmxSQ+IcrlmagKhilxlhZgxPK6C5Jqbkw1IDUmFTM+cz9QDnnLTwDz/2gQ==",
|
||||
"hasInstallScript": true,
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"node-gyp-build": "^4.3.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.14.2"
|
||||
}
|
||||
},
|
||||
"node_modules/microsoft-cognitiveservices-speech-sdk/node_modules/uuid": {
|
||||
"version": "9.0.1",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
|
||||
|
||||
Reference in New Issue
Block a user