mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 08:40:38 +00:00
initial changes for openai stt (#1127)
* initial changes for openai stt
* make minBargeinWordCount work for openai
* openai stt: support for prompt templates
* support openai semantic_vad
* transcribe supports openai stt
* refactor list of stt vendors that dont need to be restarted after a final transcript
* remove credentials from log
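For orientation, here is a hypothetical gather verb payload exercising the new options. The openaiOptions.promptTemplates field names are taken from the diff below; the surrounding verb structure is a sketch, not part of this commit:

// Hypothetical gather verb using the new openai STT options (a sketch;
// only the openaiOptions field names below appear in this commit's diff).
const gatherVerb = {
  verb: 'gather',
  input: ['speech'],
  recognizer: {
    vendor: 'openai',
    hints: ['jambonz', 'SIP trunk'],
    openaiOptions: {
      prompt: 'Transcribe the caller audio.',
      promptTemplates: {
        hintsTemplate: 'Expect these terms: {{hints}}',
        // {{turns:4}} requests the last 4 conversation turns
        conversationHistoryTemplate: 'Recent conversation:{{turns:4}}'
      }
    }
  }
};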
@@ -12,6 +12,7 @@ const {
  JambonzTranscriptionEvents,
  AssemblyAiTranscriptionEvents,
  VoxistTranscriptionEvents,
  OpenAITranscriptionEvents,
  VadDetection,
  VerbioTranscriptionEvents,
  SpeechmaticsTranscriptionEvents
@@ -83,6 +84,7 @@ class TaskGather extends SttTask {
    this._bufferedTranscripts = [];
    this.partialTranscriptsCount = 0;
    this.bugname_prefix = 'gather_';

  }

  get name() { return TaskName.Gather; }
@@ -239,6 +241,7 @@ class TaskGather extends SttTask {
      const {span, ctx} = this.startChildSpan(`nested:${this.sayTask.summary}`);
      const process = () => {
        this.logger.debug('Gather: nested say task completed');
        this.playComplete = true;
        if (!this.listenDuringPrompt) {
          startDtmfListener();
        }
@@ -269,6 +272,7 @@ class TaskGather extends SttTask {
      const {span, ctx} = this.startChildSpan(`nested:${this.playTask.summary}`);
      const process = () => {
        this.logger.debug('Gather: nested play task completed');
        this.playComplete = true;
        if (!this.listenDuringPrompt) {
          startDtmfListener();
        }
@@ -559,6 +563,31 @@ class TaskGather extends SttTask {

        break;

      case 'openai':
        this.bugname = `${this.bugname_prefix}openai_transcribe`;
        this.addCustomEventListener(
          ep, OpenAITranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
        this.addCustomEventListener(
          ep, OpenAITranscriptionEvents.SpeechStarted, this._onOpenAISpeechStarted.bind(this, cs, ep));
        this.addCustomEventListener(
          ep, OpenAITranscriptionEvents.SpeechStopped, this._onOpenAISpeechStopped.bind(this, cs, ep));
        this.addCustomEventListener(ep, OpenAITranscriptionEvents.Connect,
          this._onVendorConnect.bind(this, cs, ep));
        this.addCustomEventListener(ep, OpenAITranscriptionEvents.ConnectFailure,
          this._onVendorConnectFailure.bind(this, cs, ep));
        this.addCustomEventListener(ep, OpenAITranscriptionEvents.Error,
          this._onOpenAIError.bind(this, cs, ep));

        /* openai delta transcripts are useful only for minBargeinWordCount eval */
        if (this.minBargeinWordCount > 1) {
          this.openaiPartials = [];
          opts.OPENAI_WANT_PARTIALS = 1;
          this.addCustomEventListener(
            ep, OpenAITranscriptionEvents.PartialTranscript, this._onOpenAIPartialTranscript.bind(this, cs, ep));
        }
        this.modelSupportsConversationTracking = opts.OPENAI_MODEL !== 'whisper-1';
        break;

      default:
        if (this.vendor.startsWith('custom:')) {
          this.bugname = `${this.bugname_prefix}${this.vendor}_transcribe`;
@@ -590,6 +619,25 @@ class TaskGather extends SttTask {
      bugname: this.bugname
    }, 'Gather:_startTranscribing');

    /* special feature for openai: we can provide a prompt that includes recent conversation history */
    let prompt;
    if (this.vendor === 'openai') {
      if (this.modelSupportsConversationTracking) {
        prompt = this.formatOpenAIPrompt(this.cs, {
          prompt: this.data.recognizer?.openaiOptions?.prompt,
          hintsTemplate: this.data.recognizer?.openaiOptions?.promptTemplates?.hintsTemplate,
          // eslint-disable-next-line max-len
          conversationHistoryTemplate: this.data.recognizer?.openaiOptions?.promptTemplates?.conversationHistoryTemplate,
          hints: this.data.recognizer?.hints,
        });
        this.logger.debug({prompt}, 'Gather:_startTranscribing - created an openai prompt');
      }
      else if (this.data.recognizer?.hints?.length > 0) {
        prompt = this.data.recognizer?.hints.join(', ');
      }
    }

    /**
     * Note: we don't need to ask deepgram for interim results, because they
     * already send us words as they are finalized (is_final=true) even before
@@ -601,6 +649,7 @@ class TaskGather extends SttTask {
      interim: this.interim,
      bugname: this.bugname,
      hostport: this.hostport,
      prompt
    }).catch((err) => {
      const {writeAlerts, AlertType} = this.cs.srf.locals;
      this.logger.error(err, 'TaskGather:_startTranscribing error');
@@ -781,7 +830,11 @@ class TaskGather extends SttTask {
     const bugname = fsEvent.getHeader('media-bugname');
     const finished = fsEvent.getHeader('transcription-session-finished');
     this.logger.debug({evt, bugname, finished, vendor: this.vendor}, 'Gather:_onTranscription raw transcript');
-    if (bugname && this.bugname !== bugname) return;
+    if (bugname && this.bugname !== bugname) {
+      this.logger.debug(
+        `Gather:_onTranscription - ignoring transcript from ${bugname} because our bug is ${this.bugname}`);
+      return;
+    }
     if (finished === 'true') return;

     if (this.vendor === 'ibm' && evt?.state === 'listening') return;
@@ -1084,6 +1137,33 @@ class TaskGather extends SttTask {
    this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
  }

  async _onOpenAIError(cs, _ep, evt) {
    // eslint-disable-next-line no-unused-vars
    const {message, ...e} = evt;
    this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
  }

  async _onOpenAISpeechStarted(cs, _ep, evt) {
    this.logger.debug({evt}, 'TaskGather:_onOpenAISpeechStarted');
  }

  async _onOpenAISpeechStopped(cs, _ep, evt) {
    this.logger.debug({evt}, 'TaskGather:_onOpenAISpeechStopped');
  }

  async _onOpenAIPartialTranscript(cs, _ep, evt) {
    if (!this.playComplete) {
      const words = evt.delta.split(' ').filter((w) => /[A-Za-z0-9]/.test(w));
      this.openaiPartials.push(...words);
      this.logger.debug({words, partials: this.openaiPartials, evt}, 'TaskGather:_onOpenAIPartialTranscript - words');
      if (this.openaiPartials.length >= this.minBargeinWordCount) {
        this.logger.debug({partials: this.openaiPartials}, 'killing audio due to speech (openai)');
        this._killAudio(cs);
        this.notifyStatus({event: 'speech-bargein-detected', words: this.openaiPartials});
      }
    }
  }

  async _onVendorError(cs, _ep, evt) {
    super._onVendorError(cs, _ep, evt);
    if (!(await this._startFallback(cs, _ep, evt))) {
@@ -1193,6 +1273,7 @@ class TaskGather extends SttTask {
      }
    }
    else if (reason.startsWith('speech')) {
      this.cs.emit('userSaid', evt.alternatives[0].transcript);
      if (this.parentTask) this.parentTask.emit('transcription', evt);
      else {
        this.emit('transcription', evt);
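A minimal standalone sketch of the barge-in counting performed by _onOpenAIPartialTranscript above, assuming delta events shaped like {delta: '...'}:

// Sketch: accumulate words from openai partial (delta) transcripts until
// minBargeinWordCount is reached, mirroring the handler above.
const minBargeinWordCount = 2;
const partials = [];
const onDelta = (evt) => {
  const words = evt.delta.split(' ').filter((w) => /[A-Za-z0-9]/.test(w));
  partials.push(...words);
  if (partials.length >= minBargeinWordCount) {
    console.log('speech-bargein-detected', partials);
  }
};
onDelta({delta: 'hello'});  // 1 word: below threshold
onDelta({delta: 'there'});  // 2 words: barge-in fires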
@@ -5,6 +5,30 @@ const { TaskPreconditions, CobaltTranscriptionEvents } = require('../utils/const
const { SpeechCredentialError } = require('../utils/error');
const {JAMBONES_AWS_TRANSCRIBE_USE_GRPC} = require('../config');

/**
 * processTurnString("Please insert turns here: {{turns:4}}");
 * // -> { processed: 'Please insert turns here: {{turns}}', turns: 4 }
 *
 * processTurnString("Please insert turns here: {{turns}}");
 * // -> { processed: 'Please insert turns here: {{turns}}', turns: null }
 */
const processTurnString = (input) => {
  const regex = /\{\{turns(?::(\d+))?\}\}/;
  const match = input.match(regex);

  if (!match) {
    return {
      processed: input,
      turns: null
    };
  }

  const turns = match[1] ? parseInt(match[1], 10) : null;
  const processed = input.replace(regex, '{{turns}}');

  return { processed, turns };
};

class SttTask extends Task {

  constructor(logger, data, parentTask) {
@@ -290,6 +314,57 @@ class SttTask extends Task {
    });
  }

  formatOpenAIPrompt(cs, {prompt, hintsTemplate, conversationHistoryTemplate, hints}) {
    let conversationHistoryPrompt, hintsPrompt;

    /* generate conversation history from template */
    if (conversationHistoryTemplate) {
      const {processed, turns} = processTurnString(conversationHistoryTemplate);
      this.logger.debug({processed, turns}, 'SttTask: processed conversation history template');
      conversationHistoryPrompt = cs.getFormattedConversation(turns || 4);
      //this.logger.debug({conversationHistoryPrompt}, 'SttTask: conversation history');
      if (conversationHistoryPrompt) {
        conversationHistoryPrompt = processed.replace('{{turns}}', `\n${conversationHistoryPrompt}\nuser: `);
      }
    }

    /* generate hints from template */
    if (hintsTemplate && Array.isArray(hints) && hints.length > 0) {
      hintsPrompt = hintsTemplate.replace('{{hints}}', hints);
    }

    /* combine into final prompt */
    let finalPrompt = prompt || '';
    if (hintsPrompt) {
      finalPrompt = `${finalPrompt}\n${hintsPrompt}`;
    }
    if (conversationHistoryPrompt) {
      finalPrompt = `${finalPrompt}\n${conversationHistoryPrompt}`;
    }

    this.logger.debug({
      finalPrompt,
      hints,
      hintsPrompt,
      conversationHistoryTemplate,
      conversationHistoryPrompt
    }, 'SttTask: formatted OpenAI prompt');
    return finalPrompt?.trimStart();
  }

  /* some STT engines will keep listening after a final response, so no need to restart */
  doesVendorContinueListeningAfterFinalTranscript(vendor) {
    return (vendor.startsWith('custom:') || [
      'soniox',
      'aws',
      'microsoft',
      'deepgram',
      'google',
      'speechmatics',
      'openai',
    ].includes(vendor));
  }

  _onCompileContext(ep, key, evt) {
    const {addKey} = this.cs.srf.locals.dbHelpers;
    this.logger.debug({evt}, `received cobalt compile context event, will cache under ${key}`);
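To illustrate how processTurnString and formatOpenAIPrompt compose, a worked example with made-up template values; the conversation history shown is an assumed output of cs.getFormattedConversation, not taken from this commit:

// Worked example of the {{turns:N}} plumbing above, using assumed values.
const regex = /\{\{turns(?::(\d+))?\}\}/;
const template = 'Recent turns:{{turns:2}}';
const match = template.match(regex);
const turns = match[1] ? parseInt(match[1], 10) : null;   // 2
const processed = template.replace(regex, '{{turns}}');   // 'Recent turns:{{turns}}'

// Suppose cs.getFormattedConversation(2) returned:
const history = 'assistant: How can I help?\nuser: My order is late';
const conversationPrompt = processed.replace('{{turns}}', `\n${history}\nuser: `);
console.log(conversationPrompt);
// Recent turns:
// assistant: How can I help?
// user: My order is late
// user: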
@@ -14,6 +14,7 @@ const {
  TranscribeStatus,
  AssemblyAiTranscriptionEvents,
  VoxistTranscriptionEvents,
  OpenAITranscriptionEvents,
  VerbioTranscriptionEvents,
  SpeechmaticsTranscriptionEvents
} = require('../utils/constants.json');
@@ -330,6 +331,20 @@ class TaskTranscribe extends SttTask {
          this._onSpeechmaticsError.bind(this, cs, ep));
        break;

      case 'openai':
        this.bugname = `${this.bugname_prefix}openai_transcribe`;
        this.addCustomEventListener(
          ep, OpenAITranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep, channel));
        this.addCustomEventListener(ep, OpenAITranscriptionEvents.Connect,
          this._onVendorConnect.bind(this, cs, ep));
        this.addCustomEventListener(ep, OpenAITranscriptionEvents.ConnectFailure,
          this._onVendorConnectFailure.bind(this, cs, ep));
        this.addCustomEventListener(ep, OpenAITranscriptionEvents.Error,
          this._onOpenAIError.bind(this, cs, ep));

        this.modelSupportsConversationTracking = opts.OPENAI_MODEL !== 'whisper-1';
        break;

      default:
        if (this.vendor.startsWith('custom:')) {
          this.bugname = `${this.bugname_prefix}${this.vendor}_transcribe`;
@@ -365,6 +380,25 @@ class TaskTranscribe extends SttTask {
  async _transcribe(ep) {
    this.logger.debug(
      `TaskTranscribe:_transcribe - starting transcription vendor ${this.vendor} bugname ${this.bugname}`);

    /* special feature for openai: we can provide a prompt that includes recent conversation history */
    let prompt;
    if (this.vendor === 'openai') {
      if (this.modelSupportsConversationTracking) {
        prompt = this.formatOpenAIPrompt(this.cs, {
          prompt: this.data.recognizer?.openaiOptions?.prompt,
          hintsTemplate: this.data.recognizer?.openaiOptions?.promptTemplates?.hintsTemplate,
          // eslint-disable-next-line max-len
          conversationHistoryTemplate: this.data.recognizer?.openaiOptions?.promptTemplates?.conversationHistoryTemplate,
          hints: this.data.recognizer?.hints,
        });
        this.logger.debug({prompt}, 'TaskTranscribe:_transcribe - created an openai prompt');
      }
      else if (this.data.recognizer?.hints?.length > 0) {
        prompt = this.data.recognizer?.hints.join(', ');
      }
    }

    await ep.startTranscription({
      vendor: this.vendor,
      interim: this.interim ? true : false,
@@ -456,8 +490,9 @@ class TaskTranscribe extends SttTask {
       this._startAsrTimer(channel);

       /* some STT engines will keep listening after a final response, so no need to restart */
-      if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google', 'speechmatics']
-        .includes(this.vendor)) this._startTranscribing(cs, ep, channel);
+      if (!this.doesVendorContinueListeningAfterFinalTranscript(this.vendor)) {
+        this._startTranscribing(cs, ep, channel);
+      }
     }
     else {
       if (this.vendor === 'soniox') {
@@ -480,9 +515,7 @@ class TaskTranscribe extends SttTask {
         this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending final transcript');
         this._resolve(channel, evt);

-        /* some STT engines will keep listening after a final response, so no need to restart */
-        if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google', 'speechmatics'].includes(this.vendor) &&
-          !this.vendor.startsWith('custom:')) {
+        if (!this.doesVendorContinueListeningAfterFinalTranscript(this.vendor)) {
           this.logger.debug('TaskTranscribe:_onTranscription - restarting transcribe');
           this._startTranscribing(cs, ep, channel);
         }
@@ -733,6 +766,12 @@ class TaskTranscribe extends SttTask {
    this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
  }

  async _onOpenAIError(cs, _ep, evt) {
    // eslint-disable-next-line no-unused-vars
    const {message, ...e} = evt;
    this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
  }

  _startAsrTimer(channel) {
    if (this.vendor === 'deepgram') return; // no need
    assert(this.isContinuousAsr);
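The refactor replaces two hard-coded vendor lists with the shared check shown earlier; a standalone restatement for illustration:

// Standalone version of doesVendorContinueListeningAfterFinalTranscript:
// these vendors (and any custom: vendor) keep streaming after a final
// transcript, so the task need not restart transcription.
const continuesAfterFinal = (vendor) =>
  vendor.startsWith('custom:') || [
    'soniox', 'aws', 'microsoft', 'deepgram',
    'google', 'speechmatics', 'openai',
  ].includes(vendor);
console.log(continuesAfterFinal('openai'));       // true
console.log(continuesAfterFinal('custom:acme'));  // true
console.log(continuesAfterFinal('ibm'));          // false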
@@ -42,6 +42,11 @@ class TtsTask extends Task {
        }
      }
    }

    const fullText = Array.isArray(this.text) ? this.text.join(' ') : this.text;
    if (fullText.length > 0) {
      cs.emit('botSaid', fullText);
    }
  }

  getTtsVendorData(cs) {