mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 16:50:39 +00:00
deepgram: rework continuous asr, and resolve on speech_final not is_f… (#501)
* deepgram: rework continuous asr, and resolve on speech_final not is_final (wip) * wip * deepgram: empty final transcript should trigger resolve with speech if we have buffered transcripts * wip * fixes for deepgram compiling multiple transcripts * test deepgram utteranceEndMs * more handling of utteranceEndMs * wip * better handling of digit strings collected over multiple deepgram responses * wip * add support for deepgramOptions.shortUtterance which triggers off of is_final instead of speech_final * apply deepgram fixes to transcribe * cleanup continuous asr * more continuous asr fixes for deepgram * update to verb-specifications for handling SttTask properties * set log level for tests back to error
This commit is contained in:
@@ -52,6 +52,7 @@ const stickyVars = {
|
||||
'DEEPGRAM_SPEECH_SEARCH',
|
||||
'DEEPGRAM_SPEECH_REPLACE',
|
||||
'DEEPGRAM_SPEECH_ENDPOINTING',
|
||||
'DEEPGRAM_SPEECH_UTTERANCE_END_MS',
|
||||
'DEEPGRAM_SPEECH_VAD_TURNOFF',
|
||||
'DEEPGRAM_SPEECH_TAG'
|
||||
],
|
||||
@@ -106,6 +107,53 @@ const stickyVars = {
|
||||
]
|
||||
};
|
||||
|
||||
const consolidateTranscripts = (bufferedTranscripts, channel, language) => {
|
||||
if (bufferedTranscripts.length === 1) return bufferedTranscripts[0];
|
||||
let totalConfidence = 0;
|
||||
const finalTranscript = bufferedTranscripts.reduce((acc, evt) => {
|
||||
totalConfidence += evt.alternatives[0].confidence;
|
||||
|
||||
let newTranscript = evt.alternatives[0].transcript;
|
||||
|
||||
// If new transcript consists only of digits, spaces, and a trailing comma or period
|
||||
if (newTranscript.match(/^[\d\s]+[,.]?$/)) {
|
||||
newTranscript = newTranscript.replace(/\s/g, ''); // Remove all spaces
|
||||
if (newTranscript.endsWith(',')) {
|
||||
newTranscript = newTranscript.slice(0, -1); // Remove the trailing comma
|
||||
} else if (newTranscript.endsWith('.')) {
|
||||
newTranscript = newTranscript.slice(0, -1); // Remove the trailing period
|
||||
}
|
||||
}
|
||||
|
||||
const lastChar = acc.alternatives[0].transcript.slice(-1);
|
||||
const firstChar = newTranscript.charAt(0);
|
||||
|
||||
if (lastChar.match(/\d/) && firstChar.match(/\d/)) {
|
||||
acc.alternatives[0].transcript += newTranscript;
|
||||
} else {
|
||||
acc.alternatives[0].transcript += ` ${newTranscript}`;
|
||||
}
|
||||
|
||||
return acc;
|
||||
}, {
|
||||
language_code: language,
|
||||
channel_tag: channel,
|
||||
is_final: true,
|
||||
alternatives: [{
|
||||
transcript: ''
|
||||
}]
|
||||
});
|
||||
finalTranscript.alternatives[0].confidence = bufferedTranscripts.length === 1 ?
|
||||
bufferedTranscripts[0].alternatives[0].confidence :
|
||||
totalConfidence / bufferedTranscripts.length;
|
||||
finalTranscript.alternatives[0].transcript = finalTranscript.alternatives[0].transcript.trim();
|
||||
finalTranscript.vendor = {
|
||||
name: 'deepgram',
|
||||
evt: bufferedTranscripts
|
||||
};
|
||||
return finalTranscript;
|
||||
};
|
||||
|
||||
const compileSonioxTranscripts = (finalWordChunks, channel, language) => {
|
||||
const words = finalWordChunks.flat();
|
||||
const transcript = words.reduce((acc, word) => {
|
||||
@@ -163,7 +211,7 @@ const normalizeSoniox = (evt, channel, language) => {
|
||||
};
|
||||
};
|
||||
|
||||
const normalizeDeepgram = (evt, channel, language) => {
|
||||
const normalizeDeepgram = (evt, channel, language, shortUtterance) => {
|
||||
const copy = JSON.parse(JSON.stringify(evt));
|
||||
const alternatives = (evt.channel?.alternatives || [])
|
||||
.map((alt) => ({
|
||||
@@ -171,10 +219,14 @@ const normalizeDeepgram = (evt, channel, language) => {
|
||||
transcript: alt.transcript,
|
||||
}));
|
||||
|
||||
/**
|
||||
* note difference between is_final and speech_final in Deepgram:
|
||||
* https://developers.deepgram.com/docs/understand-endpointing-interim-results
|
||||
*/
|
||||
return {
|
||||
language_code: language,
|
||||
channel_tag: channel,
|
||||
is_final: evt.is_final,
|
||||
is_final: shortUtterance ? evt.is_final : evt.speech_final,
|
||||
alternatives: [alternatives[0]],
|
||||
vendor: {
|
||||
name: 'deepgram',
|
||||
@@ -325,12 +377,12 @@ const normalizeAws = (evt, channel, language) => {
|
||||
|
||||
|
||||
module.exports = (logger) => {
|
||||
const normalizeTranscription = (evt, vendor, channel, language) => {
|
||||
const normalizeTranscription = (evt, vendor, channel, language, shortUtterance) => {
|
||||
|
||||
//logger.debug({ evt, vendor, channel, language }, 'normalizeTranscription');
|
||||
switch (vendor) {
|
||||
case 'deepgram':
|
||||
return normalizeDeepgram(evt, channel, language);
|
||||
return normalizeDeepgram(evt, channel, language, shortUtterance);
|
||||
case 'microsoft':
|
||||
return normalizeMicrosoft(evt, channel, language);
|
||||
case 'google':
|
||||
@@ -536,6 +588,8 @@ module.exports = (logger) => {
|
||||
{DEEPGRAM_SPEECH_KEYWORDS: deepgramOptions.keywords.join(',')},
|
||||
...('endpointing' in deepgramOptions) &&
|
||||
{DEEPGRAM_SPEECH_ENDPOINTING: deepgramOptions.endpointing},
|
||||
...(deepgramOptions.utteranceEndMs) &&
|
||||
{DEEPGRAM_SPEECH_UTTERANCE_END_MS: deepgramOptions.utteranceEndMs},
|
||||
...(deepgramOptions.vadTurnoff) &&
|
||||
{DEEPGRAM_SPEECH_VAD_TURNOFF: deepgramOptions.vadTurnoff},
|
||||
...(deepgramOptions.tag) &&
|
||||
@@ -743,6 +797,7 @@ module.exports = (logger) => {
|
||||
setChannelVarsForStt,
|
||||
removeSpeechListeners,
|
||||
setSpeechCredentialsAtRuntime,
|
||||
compileSonioxTranscripts
|
||||
compileSonioxTranscripts,
|
||||
consolidateTranscripts
|
||||
};
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user