const Task = require('./task');
const assert = require('assert');
const crypto = require('crypto');
const { TaskPreconditions, CobaltTranscriptionEvents } = require('../utils/constants');
const { SpeechCredentialError } = require('../utils/error');
const {JAMBONES_AWS_TRANSCRIBE_USE_GRPC} = require('../config');
const {TaskName} = require('../utils/constants.json');

/**
 * processTurnString("Please insert turns here: {{turns:4}}");
 * // -> { processed: 'Please insert turns here: {{turns}}', turns: 4 }
 *
 * processTurnString("Please insert turns here: {{turns}}");
 * // -> { processed: 'Please insert turns here: {{turns}}', turns: null }
 */
const processTurnString = (input) => {
  const regex = /\{\{turns(?::(\d+))?\}\}/;
  const match = input.match(regex);

  if (!match) {
    return {
      processed: input,
      turns: null
    };
  }

  const turns = match[1] ? parseInt(match[1], 10) : null;
  const processed = input.replace(regex, '{{turns}}');

  return { processed, turns };
};

class SttTask extends Task {

  constructor(logger, data, parentTask) {
    super(logger, data);
    this.parentTask = parentTask;

    this.preconditions = TaskPreconditions.Endpoint;

    const {
      setChannelVarsForStt,
      normalizeTranscription,
      setSpeechCredentialsAtRuntime,
      compileSonioxTranscripts,
      consolidateTranscripts,
      updateSpeechmaticsPayload
    } = require('../utils/transcription-utils')(logger);
    this.setChannelVarsForStt = setChannelVarsForStt;
    this.normalizeTranscription = normalizeTranscription;
    this.compileSonioxTranscripts = compileSonioxTranscripts;
    this.consolidateTranscripts = consolidateTranscripts;
    this.updateSpeechmaticsPayload = updateSpeechmaticsPayload;
    this.eventHandlers = [];
    this.isHandledByPrimaryProvider = true;
    /**
     * The task uses taskIncludeRecognizer to decide which recognizer label applies:
     * if taskIncludeRecognizer === true, use the label from verb.recognizer, even if it is empty
     * if taskIncludeRecognizer === false, use the label from application.recognizer
     */
    this.taskIncludeRecognizer = !!this.data.recognizer;
    if (this.data.recognizer) {
      const recognizer = this.data.recognizer;
      this.vendor = recognizer.vendor;
      this.language = recognizer.language;
      this.label = recognizer.label;

      // fallback
      this.fallbackVendor = recognizer.fallbackVendor || 'default';
      this.fallbackLanguage = recognizer.fallbackLanguage || 'default';
      this.fallbackLabel = recognizer.fallbackLabel;

      /* let credentials be supplied in the recognizer object at runtime */
      this.sttCredentials = setSpeechCredentialsAtRuntime(recognizer);

      if (!Array.isArray(this.data.recognizer.altLanguages)) {
        this.data.recognizer.altLanguages = [];
      }
    } else {
      this.data.recognizer = {hints: [], altLanguages: []};
    }

    /* buffer for soniox transcripts */
    this._sonioxTranscripts = [];
    /* bug name prefix */
    this.bugname_prefix = '';

    // stt latency calculator
    this.stt_latency_ms = '';
  }

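  /**
   * Resolve the effective recognizer for this task: verb-level recognizer settings take
   * precedence over session-level defaults, 'default' placeholders are replaced with the
   * application settings, and the fallback vendor/language/label are applied when the call
   * session has already failed over to a secondary ASR. Also fetches speech credentials
   * (falling back if the primary vendor cannot be initialized) and performs vendor-specific
   * setup such as creating a Gladia live session.
   */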
  async exec(cs, {ep, ep2}) {
    super.exec(cs);
    this.ep = ep;
    this.ep2 = ep2;

    // start vad from stt latency calculator
    if (this.name !== TaskName.Gather ||
      (this.name === TaskName.Gather && this.needsStt)) {
      cs.startSttLatencyVad();
    }

    // use session preferences if we don't have specific verb-level settings.
    if (cs.recognizer) {
      for (const k in cs.recognizer) {
        const newValue = this.data.recognizer && this.data.recognizer[k] !== undefined ?
          this.data.recognizer[k] :
          cs.recognizer[k];

        if (Array.isArray(newValue)) {
          this.data.recognizer[k] = [...(this.data.recognizer[k] || []), ...cs.recognizer[k]];
        } else if (typeof newValue === 'object' && newValue !== null) {
          this.data.recognizer[k] = { ...(this.data.recognizer[k] || {}), ...cs.recognizer[k] };
        } else {
          this.data.recognizer[k] = newValue;
        }
      }
    }
    if ('default' === this.vendor || !this.vendor) {
      this.vendor = cs.speechRecognizerVendor;
      if (this.data.recognizer) this.data.recognizer.vendor = this.vendor;
    }
    if ('default' === this.language || !this.language) {
      this.language = cs.speechRecognizerLanguage;
      if (this.data.recognizer) this.data.recognizer.language = this.language;
    }
    if (!this.taskIncludeRecognizer) {
      this.label = cs.speechRecognizerLabel;
      if (this.data.recognizer) this.data.recognizer.label = this.label;
    }
    // Fallback options
    if ('default' === this.fallbackVendor || !this.fallbackVendor) {
      this.fallbackVendor = cs.fallbackSpeechRecognizerVendor;
      if (this.data.recognizer) this.data.recognizer.fallbackVendor = this.fallbackVendor;
    }
    if ('default' === this.fallbackLanguage || !this.fallbackLanguage) {
      this.fallbackLanguage = cs.fallbackSpeechRecognizerLanguage;
      if (this.data.recognizer) this.data.recognizer.fallbackLanguage = this.fallbackLanguage;
    }
    if (!this.taskIncludeRecognizer) {
      this.fallbackLabel = cs.fallbackSpeechRecognizerLabel;
      if (this.data.recognizer) this.data.recognizer.fallbackLabel = this.fallbackLabel;
    }

    if (cs.hasFallbackAsr) {
      if (this.taskIncludeRecognizer) {
        // reset fallback ASR from a previous run if this verb contains data.recognizer.
        cs.hasFallbackAsr = false;
      } else {
        this.logger.debug('Call session has fallen back to 2nd ASR, using 2nd recognizer configuration');
        this.vendor = this.fallbackVendor;
        this.language = this.fallbackLanguage;
        this.label = this.fallbackLabel;
      }
    }
    if (!this.data.recognizer.vendor) {
      this.data.recognizer.vendor = this.vendor;
    }
    if (this.vendor === 'cobalt' && !this.data.recognizer.model) {
      // By default, the application saves the cobalt model in language
      this.data.recognizer.model = cs.speechRecognizerLanguage;
    }

    if (
      // not a gather task, such as transcribe
      (!this.input ||
      // gather task with speech
      this.input.includes('speech')) &&
      !this.sttCredentials) {
      try {
        this.sttCredentials = await this._initSpeechCredentials(this.cs, this.vendor, this.label);
      } catch (error) {
        if (this.canFallback()) {
          this.notifyError(
            {
              msg: 'ASR error', details: `Invalid vendor ${this.vendor}, Error: ${error}`,
              failover: 'in progress'
            });
          await this._initFallback();
        } else {
          this.notifyError(
            {
              msg: 'ASR error', details: `Invalid vendor ${this.vendor}, Error: ${error}`,
              failover: 'not available'
            });
          throw error;
        }
      }
    }

    /* when using cobalt a model is required */
    if (this.vendor === 'cobalt' && !this.data.recognizer.model) {
      this.notifyError({ msg: 'ASR error', details: 'Cobalt requires a model to be specified'});
      throw new Error('Cobalt requires a model to be specified');
    }

    if (cs.hasAltLanguages) {
      this.data.recognizer.altLanguages = this.data.recognizer.altLanguages.concat(cs.altLanguages);
      this.logger.debug({altLanguages: this.data.recognizer.altLanguages},
        'STT:exec - applying altLanguages');
    }
    if (cs.hasGlobalSttPunctuation && !this.data.recognizer.punctuation) {
      this.data.recognizer.punctuation = cs.globalSttPunctuation;
    }
    if (this.vendor === 'gladia') {
      const { api_key, region } = this.sttCredentials;
      const {url} = await this.createGladiaLiveSession({
        api_key, region,
        model: this.data.recognizer.model || 'solaria-1',
        options: this.data.recognizer.gladiaOptions || {}
      });
      const {host, pathname, search} = new URL(url);
      this.sttCredentials.host = host;
      this.sttCredentials.path = `${pathname}${search}`;
    }
  }

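  /**
   * Create a Gladia live transcription session by POSTing the audio format, model and
   * message configuration to the Gladia v2 live endpoint. The response body is returned
   * as-is; the caller (see exec above) uses its `url` property to derive the websocket
   * host and path for the recognizer.
   * @returns {Promise<object>} the session object returned by Gladia
   */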
  async createGladiaLiveSession({
    api_key,
    region = 'us-west',
    model = 'solaria-1',
    options = {},
  }) {
    const url = `https://api.gladia.io/v2/live?region=${region}`;
    const response = await fetch(url, {
      method: 'POST',
      headers: {
        'x-gladia-key': api_key,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        encoding: 'wav/pcm',
        bit_depth: 16,
        sample_rate: 8000,
        channels: 1,
        model,
        ...options,
        messages_config: {
          receive_final_transcripts: true,
          receive_speech_events: true,
          receive_errors: true,
        }
      })
    });
    if (!response.ok) {
      const error = await response.text();
      this.logger.error({url, status: response.status, error}, 'Error creating Gladia live session');
      throw new Error(`Error creating Gladia live session: ${response.status} ${error}`);
    }

    const data = await response.json();
    this.logger.debug({url: data.url}, 'Gladia Call registered');
    return data;
  }

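  /**
   * Register a custom event listener on an endpoint and remember it in this.eventHandlers
   * so it can later be removed via removeCustomEventListeners.
   */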
  addCustomEventListener(ep, event, handler) {
    this.eventHandlers.push({ep, event, handler});
    ep.addCustomEventListener(event, handler);
  }

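  /**
   * Remove previously registered custom event listeners; if an endpoint is supplied only
   * that endpoint's listeners are removed, otherwise all tracked listeners are removed.
   */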
  removeCustomEventListeners(ep) {
    if (ep) {
      // for a specific endpoint
      this.eventHandlers.filter((h) => h.ep === ep).forEach((h) => {
        h.ep.removeCustomEventListener(h.event, h.handler);
      });
      this.eventHandlers = this.eventHandlers.filter((h) => h.ep !== ep);
      return;
    } else {
      // for all endpoints
      this.eventHandlers.forEach((h) => h.ep.removeCustomEventListener(h.event, h.handler));
      this.eventHandlers = [];
    }
  }

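  /**
   * Look up speech-to-text credentials for the given vendor/label and, where required,
   * exchange them for short-lived access tokens (nuance, ibm, verbio) or temporary AWS
   * credentials. Writes an STT_NOT_PROVISIONED alert and throws SpeechCredentialError when
   * no credentials are configured, leaving any fallback handling to the caller.
   */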
  async _initSpeechCredentials(cs, vendor, label) {
    const {getNuanceAccessToken, getIbmAccessToken, getAwsAuthToken, getVerbioAccessToken} = cs.srf.locals.dbHelpers;
    let credentials = cs.getSpeechCredentials(vendor, 'stt', label);

    if (!credentials) {
      const {writeAlerts, AlertType} = cs.srf.locals;
      this.logger.info(`ERROR stt using ${vendor} requested but creds not supplied`);
      writeAlerts({
        account_sid: cs.accountSid,
        alert_type: AlertType.STT_NOT_PROVISIONED,
        vendor,
        label,
        target_sid: cs.callSid
      }).catch((err) => this.logger.info({err}, 'Error generating alert for no stt'));
      // the ASR might have a fallback configuration, so do not end the task here.
      throw new SpeechCredentialError(`No speech-to-text service credentials for ${vendor} have been configured`);
    }

    if (vendor === 'nuance' && credentials.client_id) {
      /* get nuance access token */
      const {client_id, secret} = credentials;
      const {access_token, servedFromCache} = await getNuanceAccessToken(client_id, secret, 'asr tts');
      this.logger.debug({client_id}, `got nuance access token ${servedFromCache ? 'from cache' : ''}`);
      credentials = {...credentials, access_token};
    }
    else if (vendor === 'ibm' && credentials.stt_api_key) {
      /* get ibm access token */
      const {stt_api_key, stt_region} = credentials;
      const {access_token, servedFromCache} = await getIbmAccessToken(stt_api_key);
      this.logger.debug({stt_api_key}, `got ibm access token ${servedFromCache ? 'from cache' : ''}`);
      credentials = {...credentials, access_token, stt_region};
    } else if (['aws', 'polly'].includes(vendor) && credentials.roleArn) {
      /* get aws access token */
      const {roleArn, region} = credentials;
      const {accessKeyId, secretAccessKey, sessionToken, servedFromCache} =
        await getAwsAuthToken({
          region,
          roleArn
        });
      this.logger.debug({roleArn}, `(roleArn) got aws access token ${servedFromCache ? 'from cache' : ''}`);
      // from the role ARN we get a SessionToken, but the feature server uses it as securityToken.
      credentials = {...credentials, accessKeyId, secretAccessKey, securityToken: sessionToken};
    }
    else if (vendor === 'verbio' && credentials.client_id && credentials.client_secret) {
      const {access_token, servedFromCache} = await getVerbioAccessToken(credentials);
      this.logger.debug({client_id: credentials.client_id},
        `got verbio access token ${servedFromCache ? 'from cache' : ''}`);
      credentials.access_token = access_token;
    }
    else if (vendor === 'aws' && !JAMBONES_AWS_TRANSCRIBE_USE_GRPC) {
      /* get AWS security token */
      const {speech_credential_sid, accessKeyId, secretAccessKey, securityToken, region} = credentials;
      if (!securityToken) {
        const { servedFromCache, ...newCredentials} = await getAwsAuthToken({
          speech_credential_sid,
          accessKeyId,
          secretAccessKey,
          region});
        this.logger.debug({newCredentials}, `got aws security token ${servedFromCache ? 'from cache' : ''}`);
        credentials = {...newCredentials, region};
      }
    }

    return credentials;
  }

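  /**
   * A fallback is possible only when a fallback vendor is configured, the primary provider
   * is still handling this task, and the call session has not already failed over.
   */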
  canFallback() {
    return this.fallbackVendor && this.isHandledByPrimaryProvider && !this.cs.hasFallbackAsr;
  }

  // ep is optional for gather or any other verb that has a single endpoint,
  // but transcribe does need it since it may have 2 endpoints
  async _initFallback(ep) {
    assert(this.fallbackVendor, 'fallback failed without fallbackVendor configuration');
    this.logger.info(`Failed to use primary STT provider, falling back to ${this.fallbackVendor}`);
    this.isHandledByPrimaryProvider = false;
    this.cs.hasFallbackAsr = true;
    this.vendor = this.cs.fallbackSpeechRecognizerVendor = this.fallbackVendor;
    this.language = this.cs.fallbackSpeechRecognizerLanguage = this.fallbackLanguage;
    this.label = this.cs.fallbackSpeechRecognizerLabel = this.fallbackLabel;
    this.data.recognizer.vendor = this.vendor;
    this.data.recognizer.language = this.language;
    this.data.recognizer.label = this.label;
    this.sttCredentials = await this._initSpeechCredentials(this.cs, this.vendor, this.label);
    // clean up listeners registered for the previous vendor
    this.removeCustomEventListeners(ep);
  }

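  /**
   * Compile speech hints into a cobalt recognition context, caching the compiled context
   * under a sha1-derived key so later tasks with the same model and hints can reuse it
   * without recompiling.
   */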
  async compileHintsForCobalt(ep, hostport, model, token, hints) {
    const {retrieveKey} = this.cs.srf.locals.dbHelpers;
    const hash = crypto.createHash('sha1');
    hash.update(`${model}:${hints}`);
    const key = `cobalt:${hash.digest('hex')}`;
    this.context = await retrieveKey(key);
    if (this.context) {
      this.logger.debug({model, hints}, 'found cached cobalt context for supplied hints');
      return this.context;
    }

    this.logger.debug({model, hints}, 'compiling cobalt context for supplied hints');

    return new Promise((resolve, reject) => {
      this.cobaltCompileResolver = resolve;
      ep.addCustomEventListener(CobaltTranscriptionEvents.CompileContext, this._onCompileContext.bind(this, ep, key));
      ep.api('uuid_cobalt_compile_context', [ep.uuid, hostport, model, token, hints], (err, evt) => {
        if (err || 0 !== evt.getBody().indexOf('+OK')) {
          ep.removeCustomEventListener(CobaltTranscriptionEvents.CompileContext);
          return reject(err);
        }
      });
    });
  }

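  /**
   * Build the final OpenAI prompt from an optional base prompt, a hints template
   * (with a {{hints}} placeholder) and a conversation-history template (with a
   * {{turns}} or {{turns:N}} placeholder, see processTurnString above).
   *
   * Illustrative example (hypothetical values):
   *   formatOpenAIPrompt(cs, {
   *     prompt: 'Transcribe the caller.',
   *     hintsTemplate: 'Expect these words: {{hints}}',
   *     conversationHistoryTemplate: 'Recent turns: {{turns:2}}',
   *     hints: ['refill', 'pharmacy']
   *   });
   *   // -> the base prompt, then the hints line, then the last 2 conversation turns
   */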
  formatOpenAIPrompt(cs, {prompt, hintsTemplate, conversationHistoryTemplate, hints}) {
    let conversationHistoryPrompt, hintsPrompt;

    /* generate conversation history from template */
    if (conversationHistoryTemplate) {
      const {processed, turns} = processTurnString(conversationHistoryTemplate);
      this.logger.debug({processed, turns}, 'SttTask: processed conversation history template');
      conversationHistoryPrompt = cs.getFormattedConversation(turns || 4);
      //this.logger.debug({conversationHistoryPrompt}, 'SttTask: conversation history');
      if (conversationHistoryPrompt) {
        conversationHistoryPrompt = processed.replace('{{turns}}', `\n${conversationHistoryPrompt}\nuser: `);
      }
    }

    /* generate hints from template */
    if (hintsTemplate && Array.isArray(hints) && hints.length > 0) {
      hintsPrompt = hintsTemplate.replace('{{hints}}', hints);
    }

    /* combine into final prompt */
    let finalPrompt = prompt || '';
    if (hintsPrompt) {
      finalPrompt = `${finalPrompt}\n${hintsPrompt}`;
    }
    if (conversationHistoryPrompt) {
      finalPrompt = `${finalPrompt}\n${conversationHistoryPrompt}`;
    }

    this.logger.debug({
      finalPrompt,
      hints,
      hintsPrompt,
      conversationHistoryTemplate,
      conversationHistoryPrompt
    }, 'SttTask: formatted OpenAI prompt');
    return finalPrompt?.trimStart();
  }

  /* some STT engines will keep listening after a final response, so no need to restart */
  doesVendorContinueListeningAfterFinalTranscript(vendor) {
    return (vendor.startsWith('custom:') || [
      'soniox',
      'aws',
      'microsoft',
      'deepgram',
      'google',
      'speechmatics',
      'openai',
    ].includes(vendor));
  }

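  /**
   * Handle the cobalt CompileContext event: resolve the pending promise from
   * compileHintsForCobalt with the compiled context and cache it under the supplied key.
   */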
  _onCompileContext(ep, key, evt) {
    const {addKey} = this.cs.srf.locals.dbHelpers;
    this.logger.debug({evt}, `received cobalt compile context event, will cache under ${key}`);

    this.cobaltCompileResolver(evt.compiled_context);
    ep.removeCustomEventListener(CobaltTranscriptionEvents.CompileContext);
    this.cobaltCompileResolver = null;

    // cache the compiled context
    addKey(key, evt.compiled_context, 3600 * 12)
      .catch((err) => this.logger.info({err}, `Error caching cobalt context for ${key}`));
  }

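  /**
   * Implement continuous ASR for deepgram by mapping asrTimeout onto deepgram's
   * utterance_end_ms option, clamping the value to 1000-5000ms and notifying an
   * ASR error when the requested value had to be adjusted.
   */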
  _doContinuousAsrWithDeepgram(asrTimeout) {
    /* deepgram has an utterance_end_ms property that simplifies things */
    assert(this.vendor === 'deepgram');
    if (asrTimeout < 1000) {
      this.notifyError({
        msg: 'ASR error',
        details: `asrTimeout ${asrTimeout} is too short for deepgram; setting it to 1000ms`
      });
      asrTimeout = 1000;
    }
    else if (asrTimeout > 5000) {
      this.notifyError({
        msg: 'ASR error',
        details: `asrTimeout ${asrTimeout} is too long for deepgram; setting it to 5000ms`
      });
      asrTimeout = 5000;
    }
    this.logger.debug(`_doContinuousAsrWithDeepgram - setting utterance_end_ms to ${asrTimeout}`);
    const dgOptions = this.data.recognizer.deepgramOptions = this.data.recognizer.deepgramOptions || {};
    dgOptions.utteranceEndMs = dgOptions.utteranceEndMs || asrTimeout;
  }

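  /* default handler invoked when the connection to the speech vendor is established */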
  _onVendorConnect(cs, _ep) {
    this.logger.debug(`${this.name}:_on${this.vendor}Connect`);
  }

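  /* the vendor reported an error during recognition: log it and raise an STT_FAILURE alert */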
  _onVendorError(cs, _ep, evt) {
    this.logger.info({evt}, `${this.name}:_on${this.vendor}Error`);
    const {writeAlerts, AlertType} = cs.srf.locals;
    writeAlerts({
      account_sid: cs.accountSid,
      alert_type: AlertType.STT_FAILURE,
      message: 'STT failure reported by vendor',
      detail: evt.error,
      vendor: this.vendor,
      label: this.label,
      target_sid: cs.callSid
    }).catch((err) => this.logger.info({err}, `Error generating alert for ${this.vendor} connection failure`));
  }

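  /* failed to connect to the vendor's recognizer: log it and raise an STT_FAILURE alert */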
  _onVendorConnectFailure(cs, _ep, evt) {
    const {reason} = evt;
    const {writeAlerts, AlertType} = cs.srf.locals;
    this.logger.info({evt}, `${this.name}:_on${this.vendor}ConnectFailure`);
    writeAlerts({
      account_sid: cs.accountSid,
      alert_type: AlertType.STT_FAILURE,
      message: `Failed connecting to ${this.vendor} speech recognizer: ${reason}`,
      vendor: this.vendor,
      label: this.label,
      target_sid: cs.callSid
    }).catch((err) => this.logger.info({err}, `Error generating alert for ${this.vendor} connection failure`));
  }
}

module.exports = SttTask;