This commit is contained in:
xquanluu
2026-05-21 09:33:48 +07:00
parent ace41a39f7
commit f1f857ea4a
6 changed files with 175 additions and 22 deletions
+58 -8
View File
@@ -593,6 +593,8 @@ class TaskGather extends SttTask {
this._onVendorConnectFailure.bind(this, cs, ep));
this.addCustomEventListener(ep, HoundifyTranscriptionEvents.Connect,
this._onVendorConnect.bind(this, cs, ep));
this.addCustomEventListener(ep, HoundifyTranscriptionEvents.EoqVadTriggered,
this._onHoundifyEoqVadTriggered.bind(this, cs, ep));
break;
case 'voxist':
@@ -760,14 +762,23 @@ class TaskGather extends SttTask {
return false;
}
_onHoundifyEoqVadTriggered(_cs, _ep, _evt, _fsEvent) {
if (this._asrTimer) return;
this._startAsrTimer();
}
_startAsrTimer() {
// Deepgram has a case that UtteranceEnd is not sent to cover the last word end time.
// So we need to wait for the asrTimeout to be sure that the last word is sent.
// if (this.vendor === 'deepgram') return; // no need
assert(this.isContinuousAsr);
assert(this.isContinuousAsr || this.vendor === 'houndify');
this._clearAsrTimer();
const timeoutMs = this.asrTimeout || 500;
this._asrTimer = setTimeout(() => {
this.logger.info('_startAsrTimer - asr timer went off');
if (this.vendor === 'houndify') return this._sendHoundifyDone(this.ep);
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language, this.vendor);
/* special case for speechmatics - keep listening if we dont have any transcripts */
@@ -777,8 +788,8 @@ class TaskGather extends SttTask {
return;
}
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
}, this.asrTimeout);
this.logger.info(`_startAsrTimer: set for ${this.asrTimeout}ms`);
}, timeoutMs);
this.logger.info(`_startAsrTimer: set for ${timeoutMs}ms`);
}
_clearAsrTimer() {
@@ -982,6 +993,10 @@ class TaskGather extends SttTask {
let emptyTranscript = false;
if (evt.is_final) {
if (this.vendor === 'houndify') {
this._clearAsrTimer();
this._houndifyDoneSent = false;
}
if (evt.alternatives[0].transcript === '' && !this.callSession.callGone && !this.killed) {
emptyTranscript = true;
if (finished === 'true' &&
@@ -990,6 +1005,19 @@ class TaskGather extends SttTask {
this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
return;
}
else if (this.vendor === 'houndify') {
const segMode = this.data.recognizer?.houndifyOptions?.requestInfo?.segmentation?.mode !== 'none';
if (!segMode && this._houndifyLastPartial) {
this.logger.info({evt},
'TaskGather:_onTranscription - empty FT from houndify (mode OFF), using lastPartial');
} else if (segMode && this._bufferedTranscripts.length > 0) {
this.logger.info({evt},
'TaskGather:_onTranscription - empty FT from houndify (mode ON), using buffered segments');
} else {
this.logger.info({evt}, 'TaskGather:_onTranscription - empty FT from houndify, nothing buffered');
return;
}
}
else if (this.vendor !== 'deepgram') {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
return;
@@ -1031,10 +1059,7 @@ class TaskGather extends SttTask {
}
this._startAsrTimer();
/* some STT engines will keep listening after a final response, so no need to restart */
if (!['soniox', 'aws', 'microsoft', 'deepgram', 'speechmatics', 'houndify', 'google', 'openai']
.includes(this.vendor) &&
!this.vendor.startsWith('custom')) {
if (!this.doesVendorContinueListeningAfterFinalTranscript(this.vendor)) {
this._startTranscribing(ep);
}
}
@@ -1065,6 +1090,19 @@ class TaskGather extends SttTask {
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language, this.vendor);
this._bufferedTranscripts = [];
}
else if (this.vendor === 'houndify') {
const segMode = this.data.recognizer?.houndifyOptions?.requestInfo?.segmentation?.mode !== 'none';
if (segMode) {
if (!emptyTranscript) this._bufferedTranscripts.push(evt);
if (this._bufferedTranscripts.length === 0) return;
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language, this.vendor);
this._bufferedTranscripts = [];
}
else if (emptyTranscript && this._houndifyLastPartial) {
evt = this._houndifyLastPartial;
this._houndifyLastPartial = null;
}
}
/* here is where we return a final transcript */
this._resolve('speech', evt);
@@ -1074,7 +1112,19 @@ class TaskGather extends SttTask {
else {
/* deepgram can send a non-final transcript but with words that are final, so we need to buffer */
let emptyTranscript = false;
if (this.vendor === 'deepgram') {
if (this.vendor === 'houndify') {
const transcript = evt.alternatives[0]?.transcript;
if (evt.is_partial_final && transcript) {
/* mode segmentation ON: buffer FinalSegment + arm asrTimer (sends Done on expiry) */
this._bufferedTranscripts.push(evt);
this._startAsrTimer();
} else if (transcript) {
/* mode segmentation OFF: track last non-empty partial */
this._houndifyLastPartial = evt;
}
if (!transcript) emptyTranscript = true;
}
else if (this.vendor === 'deepgram') {
const originalEvent = evt.vendor.evt;
if (originalEvent.is_final && evt.alternatives[0].transcript !== '') {
this.logger.debug({evt}, 'Gather:_onTranscription - buffering a completed (partial) deepgram transcript');
+16 -2
View File
@@ -417,7 +417,7 @@ class SttTask extends Task {
return finalPrompt?.trimStart();
}
/* some STT engines will keep listening after a final response, so no need to restart */
/* some STT engines will keep listening after a final response, so no need to restart*/
doesVendorContinueListeningAfterFinalTranscript(vendor) {
return (vendor.startsWith('custom:') || [
'soniox',
@@ -427,10 +427,24 @@ class SttTask extends Task {
'google',
'speechmatics',
'openai',
'houndify',
].includes(vendor));
}
/* Send Done to Houndify via FS API; FinalTranscript arrives via normal flow.*/
_sendHoundifyDone(ep, channel) {
if (channel !== undefined) {
this._houndifyDoneSentByCh = this._houndifyDoneSentByCh || [false, false];
if (this._houndifyDoneSentByCh[channel - 1]) return;
this._houndifyDoneSentByCh[channel - 1] = true;
} else {
if (this._houndifyDoneSent) return;
this._houndifyDoneSent = true;
}
this.logger.info(`houndify: sending Done via uuid_houndify_send_done ${ep.uuid}`);
ep.api('uuid_houndify_send_done', `${ep.uuid} ${this.bugname}`)
.catch((err) => this.logger.error({err}, 'uuid_houndify_send_done failed'));
}
_onCompileContext(ep, key, evt) {
const {addKey} = this.cs.srf.locals.dbHelpers;
this.logger.debug({evt}, `received cobalt compile context event, will cache under ${key}`);
+77 -6
View File
@@ -165,6 +165,8 @@ class TaskTranscribe extends SttTask {
async kill(cs) {
super.kill(cs);
this._clearAsrTimer(1);
this._clearAsrTimer(2);
const stopTranscription = this._stopTranscription();
cs.stopSttLatencyVad();
// hangup after 1 sec if we don't get a final transcription
@@ -351,6 +353,8 @@ class TaskTranscribe extends SttTask {
this._onVendorConnectFailure.bind(this, cs, ep, channel));
this.addCustomEventListener(ep, HoundifyTranscriptionEvents.Connect,
this._onVendorConnect.bind(this, cs, ep));
this.addCustomEventListener(ep, HoundifyTranscriptionEvents.EoqVadTriggered,
this._onHoundifyEoqVadTriggered.bind(this, cs, ep, channel));
break;
case 'voxist':
@@ -541,6 +545,19 @@ class TaskTranscribe extends SttTask {
this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
return;
}
else if (this.vendor === 'houndify') {
const segMode = this.data.recognizer?.houndifyOptions?.requestInfo?.segmentation?.mode !== 'none';
if (!segMode && this._houndifyLastPartial?.[channel - 1]) {
this.logger.info({evt},
'TaskTranscribe: empty FT from houndify (mode OFF), using lastPartial');
} else if (segMode && bufferedTranscripts.length > 0) {
this.logger.info({evt},
'TaskTranscribe: empty FT from houndify (mode ON), using buffered segments');
} else {
this.logger.info({evt}, 'TaskTranscribe: empty FT from houndify, nothing buffered');
return;
}
}
else if (this.vendor !== 'deepgram') {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
return;
@@ -590,6 +607,22 @@ class TaskTranscribe extends SttTask {
evt = this.consolidateTranscripts(bufferedTranscripts, channel, this.language);
this._bufferedTranscripts[channel - 1] = [];
}
else if (this.vendor === 'houndify') {
this._clearAsrTimer(channel);
if (this._houndifyDoneSentByCh) this._houndifyDoneSentByCh[channel - 1] = false;
this._houndifyDoneSent = false;
const segMode = this.data.recognizer?.houndifyOptions?.requestInfo?.segmentation?.mode !== 'none';
if (segMode) {
if (!emptyTranscript) bufferedTranscripts.push(evt);
if (bufferedTranscripts.length === 0) return;
evt = this.consolidateTranscripts(bufferedTranscripts, channel, this.language);
this._bufferedTranscripts[channel - 1] = [];
}
else if (emptyTranscript && this._houndifyLastPartial?.[channel - 1]) {
evt = this._houndifyLastPartial[channel - 1];
this._houndifyLastPartial[channel - 1] = null;
}
}
/* here is where we return a final transcript */
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending final transcript');
@@ -612,6 +645,18 @@ class TaskTranscribe extends SttTask {
bufferedTranscripts.push(evt);
}
}
else if (this.vendor === 'houndify') {
const transcript = evt.alternatives[0]?.transcript;
if (evt.is_partial_final && transcript) {
/* mode segmentation ON: buffer FinalSegment + arm asrTimer */
bufferedTranscripts.push(evt);
this._startAsrTimer(channel);
} else if (transcript) {
/* mode segmentation OFF: track last non-empty partial per channel */
this._houndifyLastPartial = this._houndifyLastPartial || [null, null];
this._houndifyLastPartial[channel - 1] = evt;
}
}
if (this.interim) {
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending interim transcript');
@@ -889,23 +934,49 @@ class TaskTranscribe extends SttTask {
this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
}
/* houndify mode=none: FS detected eoq/vad, arm asrTimer for this channel */
_onHoundifyEoqVadTriggered(_cs, _ep, channel, _evt, _fsEvent) {
if (this._asrTimers?.[channel - 1]) return; /* idempotent per channel */
this._startAsrTimer(channel);
}
_startAsrTimer(channel) {
if (this.vendor === 'deepgram') return; // no need
assert(this.isContinuousAsr);
assert(this.isContinuousAsr || this.vendor === 'houndify');
this._clearAsrTimer(channel);
this._asrTimer = setTimeout(() => {
const timeoutMs = this.asrTimeout || 500;
const timer = setTimeout(() => {
this.logger.debug(`TaskTranscribe:_startAsrTimer - asr timer went off for channel: ${channel}`);
if (this._asrTimers) this._asrTimers[channel - 1] = null;
else this._asrTimer = null;
/* houndify: timer cue to send Done; wait for real FinalTranscript */
if (this.vendor === 'houndify') return this._sendHoundifyDone(channel === 2 ? this.ep2 : this.ep, channel);
const evt = this.consolidateTranscripts(
this._bufferedTranscripts[channel - 1], channel, this.language, this.vendor);
this._bufferedTranscripts[channel - 1] = [];
this._resolve(channel, evt);
}, this.asrTimeout);
this.logger.debug(`TaskTranscribe:_startAsrTimer: set for ${this.asrTimeout}ms for channel ${channel}`);
}, timeoutMs);
/* houndify: per-channel timers (stereo). Other vendors retain legacy single timer. */
if (this.vendor === 'houndify') {
this._asrTimers = this._asrTimers || [null, null];
this._asrTimers[channel - 1] = timer;
} else {
this._asrTimer = timer;
}
this.logger.debug(`TaskTranscribe:_startAsrTimer: set for ${timeoutMs}ms for channel ${channel}`);
}
_clearAsrTimer(channel) {
if (this._asrTimer) clearTimeout(this._asrTimer);
this._asrTimer = null;
if (this._asrTimers?.[channel - 1]) {
clearTimeout(this._asrTimers[channel - 1]);
this._asrTimers[channel - 1] = null;
}
if (this._asrTimer) {
clearTimeout(this._asrTimer);
this._asrTimer = null;
}
}
// We need to keep track of whether the fallback has happened for each endpoint
+2 -1
View File
@@ -179,7 +179,8 @@
"Transcription": "houndify_transcribe::transcription",
"Error": "houndify_transcribe::error",
"ConnectFailure": "houndify_transcribe::connect_failed",
"Connect": "houndify_transcribe::connect"
"Connect": "houndify_transcribe::connect",
"EoqVadTriggered": "houndify_transcribe::eoq_vad_triggered"
},
"VoxistTranscriptionEvents": {
"Transcription": "voxist_transcribe::transcription",
+7 -5
View File
@@ -619,31 +619,32 @@ const normalizeHoundify = (evt, channel, language) => {
const copy = JSON.parse(JSON.stringify(evt));
const alternatives = [];
let is_final = false;
let is_partial_final = false;
if (evt.type) {
/* WS API format: has "type" field (PartialTranscript, FinalSegmentTranscript, FinalTranscript, VadMessage) */
/* WS API: PartialTranscript / FinalSegmentTranscript / FinalTranscript / VadMessage */
if (evt.type === 'VadMessage') {
/* VadMessage is not a transcription result - return empty alternatives so callers skip it */
return {
language_code: language,
channel_tag: channel,
is_final: false,
is_partial_final: false,
alternatives: [],
vendor: {name: 'houndify', evt: copy}
};
}
is_final = evt.type === 'FinalSegmentTranscript' || evt.type === 'FinalTranscript';
/* Only FinalTranscript ends the session; FinalSegmentTranscript is buffered. */
is_final = evt.type === 'FinalTranscript';
is_partial_final = evt.type === 'FinalSegmentTranscript';
if (evt.hypotheses && evt.hypotheses.length > 0) {
/* FinalSegmentTranscript / FinalTranscript with hypotheses */
const best = evt.hypotheses[0];
alternatives.push({
confidence: best.confidence || 0.0,
transcript: best.text || evt.text || '',
});
} else if (evt.text !== undefined) {
/* PartialTranscript */
alternatives.push({
confidence: evt.confidence || (evt.eoq !== undefined ? 1.0 - evt.eoq : 0.8),
transcript: evt.text || '',
@@ -666,6 +667,7 @@ const normalizeHoundify = (evt, channel, language) => {
language_code: language,
channel_tag: channel,
is_final,
is_partial_final,
alternatives,
vendor: {
name: 'houndify',
+15
View File
@@ -8133,6 +8133,21 @@
"node": ">= 6.0.0"
}
},
"node_modules/microsoft-cognitiveservices-speech-sdk/node_modules/utf-8-validate": {
"version": "5.0.10",
"resolved": "https://registry.npmjs.org/utf-8-validate/-/utf-8-validate-5.0.10.tgz",
"integrity": "sha512-Z6czzLq4u8fPOyx7TU6X3dvUZVvoJmxSQ+IcrlmagKhilxlhZgxPK6C5Jqbkw1IDUmFTM+cz9QDnnLTwDz/2gQ==",
"hasInstallScript": true,
"license": "MIT",
"optional": true,
"peer": true,
"dependencies": {
"node-gyp-build": "^4.3.0"
},
"engines": {
"node": ">=6.14.2"
}
},
"node_modules/microsoft-cognitiveservices-speech-sdk/node_modules/uuid": {
"version": "9.0.1",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",