diff --git a/lib/tasks/stt-task.js b/lib/tasks/stt-task.js index 7833710f..b51c3e12 100644 --- a/lib/tasks/stt-task.js +++ b/lib/tasks/stt-task.js @@ -427,6 +427,7 @@ class SttTask extends Task { 'google', 'speechmatics', 'openai', + 'houndify', ].includes(vendor)); } diff --git a/lib/utils/transcription-utils.js b/lib/utils/transcription-utils.js index 0a8ed93c..91fc1701 100644 --- a/lib/utils/transcription-utils.js +++ b/lib/utils/transcription-utils.js @@ -138,35 +138,7 @@ const stickyVars = { 'HOUNDIFY_MAX_SILENCE_SECONDS', 'HOUNDIFY_MAX_SILENCE_AFTER_FULL_QUERY_SECONDS', 'HOUNDIFY_MAX_SILENCE_AFTER_PARTIAL_QUERY_SECONDS', - 'HOUNDIFY_VAD_SENSITIVITY', - 'HOUNDIFY_VAD_TIMEOUT', - 'HOUNDIFY_VAD_MODE', - 'HOUNDIFY_VAD_VOICE_MS', - 'HOUNDIFY_VAD_SILENCE_MS', - 'HOUNDIFY_VAD_DEBUG', - 'HOUNDIFY_AUDIO_FORMAT', - 'HOUNDIFY_ENABLE_NOISE_REDUCTION', - 'HOUNDIFY_AUDIO_ENDPOINT', - 'HOUNDIFY_ENABLE_PROFANITY_FILTER', - 'HOUNDIFY_ENABLE_PUNCTUATION', - 'HOUNDIFY_ENABLE_CAPITALIZATION', - 'HOUNDIFY_CONFIDENCE_THRESHOLD', - 'HOUNDIFY_ENABLE_DISFLUENCY_FILTER', - 'HOUNDIFY_MAX_RESULTS', - 'HOUNDIFY_ENABLE_WORD_TIMESTAMPS', - 'HOUNDIFY_MAX_ALTERNATIVES', - 'HOUNDIFY_PARTIAL_TRANSCRIPT_INTERVAL', - 'HOUNDIFY_SESSION_TIMEOUT', - 'HOUNDIFY_CONNECTION_TIMEOUT', - 'HOUNDIFY_LATITUDE', - 'HOUNDIFY_LONGITUDE', - 'HOUNDIFY_CITY', - 'HOUNDIFY_STATE', - 'HOUNDIFY_COUNTRY', - 'HOUNDIFY_TIMEZONE', - 'HOUNDIFY_DOMAIN', - 'HOUNDIFY_CUSTOM_VOCABULARY', - 'HOUNDIFY_LANGUAGE_MODEL' + 'HOUNDIFY_MODEL' ], }; @@ -646,15 +618,50 @@ const normalizeAssemblyAi = (evt, channel, language) => { const normalizeHoundify = (evt, channel, language) => { const copy = JSON.parse(JSON.stringify(evt)); const alternatives = []; - const is_final = evt.ResultsAreFinal && evt.ResultsAreFinal[0] === true; - if (evt.Disambiguation && evt.Disambiguation.ChoiceData && evt.Disambiguation.ChoiceData.length > 0) { - // Handle Houndify Voice Search Result format - const choiceData = evt.Disambiguation.ChoiceData[0]; - alternatives.push({ - confidence: choiceData.ConfidenceScore || choiceData.ASRConfidence || 0.0, - transcript: choiceData.FormattedTranscription || choiceData.Transcription || '', - }); + let is_final = false; + + if (evt.type) { + /* WS API format: has "type" field (PartialTranscript, FinalSegmentTranscript, FinalTranscript, VadMessage) */ + if (evt.type === 'VadMessage') { + /* VadMessage is not a transcription result - return empty alternatives so callers skip it */ + return { + language_code: language, + channel_tag: channel, + is_final: false, + alternatives: [], + vendor: {name: 'houndify', evt: copy} + }; + } + + is_final = evt.type === 'FinalSegmentTranscript' || evt.type === 'FinalTranscript'; + + if (evt.hypotheses && evt.hypotheses.length > 0) { + /* FinalSegmentTranscript / FinalTranscript with hypotheses */ + const best = evt.hypotheses[0]; + alternatives.push({ + confidence: best.confidence || 0.0, + transcript: best.text || evt.text || '', + }); + } else if (evt.text !== undefined) { + /* PartialTranscript */ + alternatives.push({ + confidence: evt.confidence || (evt.eoq !== undefined ? 1.0 - evt.eoq : 0.8), + transcript: evt.text || '', + }); + } + } else { + /* SDK format: has Disambiguation/ResultsAreFinal fields */ + is_final = evt.ResultsAreFinal && evt.ResultsAreFinal[0] === true; + if (evt.Disambiguation && evt.Disambiguation.ChoiceData && evt.Disambiguation.ChoiceData.length > 0) { + // Handle Houndify Voice Search Result format + const choiceData = evt.Disambiguation.ChoiceData[0]; + alternatives.push({ + confidence: choiceData.ConfidenceScore || choiceData.ASRConfidence || 0.0, + transcript: choiceData.FormattedTranscription || choiceData.Transcription || '', + }); + } } + return { language_code: language, channel_tag: channel, @@ -1220,8 +1227,10 @@ module.exports = (logger) => { enableCapitalization, confidenceThreshold, enableDisfluencyFilter, maxResults, enableWordTimestamps, maxAlternatives, partialTranscriptInterval, sessionTimeout, connectionTimeout, customVocabulary, languageModel, - requestInfo, sampleRate + requestInfo, sampleRate, audioQueryAbsoluteTimeout, + eoqThreshold, vadStopThreshold } = rOpts.houndifyOptions || {}; + const {model} = rOpts; const audioEndpointUri = audioEndpoint || sttCredentials.houndify_server_uri; opts = { @@ -1232,6 +1241,7 @@ module.exports = (logger) => { HOUNDIFY_MAX_SILENCE_SECONDS: maxSilenceSeconds || 5, HOUNDIFY_MAX_SILENCE_AFTER_FULL_QUERY_SECONDS: maxSilenceAfterFullQuerySeconds || 1, HOUNDIFY_MAX_SILENCE_AFTER_PARTIAL_QUERY_SECONDS: maxSilenceAfterPartialQuerySeconds || 1.5, + HOUNDIFY_MODEL: model || 'base_8k', ...(vadSensitivity && {HOUNDIFY_VAD_SENSITIVITY: vadSensitivity}), ...(vadTimeout && {HOUNDIFY_VAD_TIMEOUT: vadTimeout}), ...(vadMode && {HOUNDIFY_VAD_MODE: vadMode}), @@ -1264,6 +1274,9 @@ module.exports = (logger) => { ...(languageModel && {HOUNDIFY_LANGUAGE_MODEL: languageModel}), ...(requestInfo && {HOUNDIFY_REQUEST_INFO: JSON.stringify(requestInfo)}), ...(sampleRate && {HOUNDIFY_SAMPLING_RATE: sampleRate}), + ...(audioQueryAbsoluteTimeout && {HOUNDIFY_AUDIO_QUERY_ABSOLUTE_TIMEOUT: audioQueryAbsoluteTimeout}), + ...(eoqThreshold !== undefined && {HOUNDIFY_EOQ_THRESHOLD: eoqThreshold}), + ...(vadStopThreshold !== undefined && {HOUNDIFY_VAD_STOP_THRESHOLD: vadStopThreshold}), }; } else if ('voxist' === vendor) { diff --git a/package-lock.json b/package-lock.json index c71e8268..0d686012 100644 --- a/package-lock.json +++ b/package-lock.json @@ -18,7 +18,7 @@ "@jambonz/speech-utils": "^0.2.30", "@jambonz/stats-collector": "^0.1.10", "@jambonz/time-series": "^0.2.17", - "@jambonz/verb-specifications": "^0.0.125", + "@jambonz/verb-specifications": "^0.1.11", "@modelcontextprotocol/sdk": "^1.9.0", "@opentelemetry/api": "^1.8.0", "@opentelemetry/exporter-jaeger": "^1.23.0", @@ -1647,9 +1647,9 @@ } }, "node_modules/@jambonz/verb-specifications": { - "version": "0.0.125", - "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.125.tgz", - "integrity": "sha512-lU1fyyYyjXOdIfQ2gmOFmssZASYNu6LD066iXjqFrBJpiI7shkprcZ1qeWGibuEk9nR2k+em3/YL31Wc8L4wvA==", + "version": "0.1.11", + "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.1.11.tgz", + "integrity": "sha512-2YWqBXhLUD+WLipMdysE155eOSgFVW3id0l4Ci37OYELJIG+GeJYMubReSlMXthU/WoVk91ATu+XkSsjuVO1dg==", "license": "MIT", "dependencies": { "debug": "^4.3.4", diff --git a/package.json b/package.json index 975759e4..54b0b971 100644 --- a/package.json +++ b/package.json @@ -34,7 +34,7 @@ "@jambonz/speech-utils": "^0.2.30", "@jambonz/stats-collector": "^0.1.10", "@jambonz/time-series": "^0.2.17", - "@jambonz/verb-specifications": "^0.0.125", + "@jambonz/verb-specifications": "^0.1.11", "@modelcontextprotocol/sdk": "^1.9.0", "@opentelemetry/api": "^1.8.0", "@opentelemetry/exporter-jaeger": "^1.23.0",