initial changes for openai stt (#1127)

* initial changes for openai stt

* wip

* wip

* wip

* wip

* wip

* make minBargeinWordCount work for openai

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wipp

* wip

* wip

* wip

* openai stt: support for prompt templates

* lint

* wip

* support openai semantic_vad

* wip

* transcribe supports openai stt

* sip

* wip

* wip

* refactor list of stt vendors that dont need to be restarted after a final transcript

* cleanup

* wip

* cleanup

* wip

* wip

* wip

* remove credentials from log

* comment
This commit is contained in:
Dave Horton
2025-03-28 13:14:58 -04:00
committed by GitHub
parent ee846b283d
commit fcaf2e59e7
11 changed files with 382 additions and 29 deletions

View File

@@ -12,6 +12,7 @@ const {
JambonzTranscriptionEvents,
AssemblyAiTranscriptionEvents,
VoxistTranscriptionEvents,
OpenAITranscriptionEvents,
VadDetection,
VerbioTranscriptionEvents,
SpeechmaticsTranscriptionEvents
@@ -83,6 +84,7 @@ class TaskGather extends SttTask {
this._bufferedTranscripts = [];
this.partialTranscriptsCount = 0;
this.bugname_prefix = 'gather_';
}
get name() { return TaskName.Gather; }
@@ -239,6 +241,7 @@ class TaskGather extends SttTask {
const {span, ctx} = this.startChildSpan(`nested:${this.sayTask.summary}`);
const process = () => {
this.logger.debug('Gather: nested say task completed');
this.playComplete = true;
if (!this.listenDuringPrompt) {
startDtmfListener();
}
@@ -269,6 +272,7 @@ class TaskGather extends SttTask {
const {span, ctx} = this.startChildSpan(`nested:${this.playTask.summary}`);
const process = () => {
this.logger.debug('Gather: nested play task completed');
this.playComplete = true;
if (!this.listenDuringPrompt) {
startDtmfListener();
}
@@ -559,6 +563,31 @@ class TaskGather extends SttTask {
break;
case 'openai':
this.bugname = `${this.bugname_prefix}openai_transcribe`;
this.addCustomEventListener(
ep, OpenAITranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
this.addCustomEventListener(
ep, OpenAITranscriptionEvents.SpeechStarted, this._onOpenAISpeechStarted.bind(this, cs, ep));
this.addCustomEventListener(
ep, OpenAITranscriptionEvents.SpeechStopped, this._onOpenAISpeechStopped.bind(this, cs, ep));
this.addCustomEventListener(ep, OpenAITranscriptionEvents.Connect,
this._onVendorConnect.bind(this, cs, ep));
this.addCustomEventListener(ep, OpenAITranscriptionEvents.ConnectFailure,
this._onVendorConnectFailure.bind(this, cs, ep));
this.addCustomEventListener(ep, OpenAITranscriptionEvents.Error,
this._onOpenAIErrror.bind(this, cs, ep));
/* openai delta transcripts are useful only for minBargeinWordCount eval */
if (this.minBargeinWordCount > 1) {
this.openaiPartials = [];
opts.OPENAI_WANT_PARTIALS = 1;
this.addCustomEventListener(
ep, OpenAITranscriptionEvents.PartialTranscript, this._onOpenAIPartialTranscript.bind(this, cs, ep));
}
this.modelSupportsConversationTracking = opts.OPENAI_MODEL !== 'whisper-1';
break;
default:
if (this.vendor.startsWith('custom:')) {
this.bugname = `${this.bugname_prefix}${this.vendor}_transcribe`;
@@ -590,6 +619,25 @@ class TaskGather extends SttTask {
bugname: this.bugname
}, 'Gather:_startTranscribing');
/* special feature for openai: we can provide a prompt that includes recent conversation history */
let prompt;
if (this.vendor === 'openai') {
if (this.modelSupportsConversationTracking) {
prompt = this.formatOpenAIPrompt(this.cs, {
prompt: this.data.recognizer?.openaiOptions?.prompt,
hintsTemplate: this.data.recognizer?.openaiOptions?.promptTemplates?.hintsTemplate,
// eslint-disable-next-line max-len
conversationHistoryTemplate: this.data.recognizer?.openaiOptions?.promptTemplates?.conversationHistoryTemplate,
hints: this.data.recognizer?.hints,
});
this.logger.debug({prompt}, 'Gather:_startTranscribing - created an openai prompt');
}
else if (this.data.recognizer?.hints?.length > 0) {
prompt = this.data.recognizer?.hints.join(', ');
}
}
/**
* Note: we don't need to ask deepgram for interim results, because they
* already send us words as they are finalized (is_final=true) even before
@@ -601,6 +649,7 @@ class TaskGather extends SttTask {
interim: this.interim,
bugname: this.bugname,
hostport: this.hostport,
prompt
}).catch((err) => {
const {writeAlerts, AlertType} = this.cs.srf.locals;
this.logger.error(err, 'TaskGather:_startTranscribing error');
@@ -781,7 +830,11 @@ class TaskGather extends SttTask {
const bugname = fsEvent.getHeader('media-bugname');
const finished = fsEvent.getHeader('transcription-session-finished');
this.logger.debug({evt, bugname, finished, vendor: this.vendor}, 'Gather:_onTranscription raw transcript');
if (bugname && this.bugname !== bugname) return;
if (bugname && this.bugname !== bugname) {
this.logger.debug(
`Gather:_onTranscription - ignoring transcript from ${bugname} because our bug is ${this.bugname}`);
return;
}
if (finished === 'true') return;
if (this.vendor === 'ibm' && evt?.state === 'listening') return;
@@ -1084,6 +1137,33 @@ class TaskGather extends SttTask {
this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
}
async _onOpenAIErrror(cs, _ep, evt) {
// eslint-disable-next-line no-unused-vars
const {message, ...e} = evt;
this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
}
async _onOpenAISpeechStarted(cs, _ep, evt) {
this.logger.debug({evt}, 'TaskGather:_onOpenAISpeechStarted');
}
async _onOpenAISpeechStopped(cs, _ep, evt) {
this.logger.debug({evt}, 'TaskGather:_onOpenAISpeechStopped');
}
async _onOpenAIPartialTranscript(cs, _ep, evt) {
if (!this.playComplete) {
const words = evt.delta.split(' ').filter((w) => /[A-Za-z0-0]/.test(w));
this.openaiPartials.push(...words);
this.logger.debug({words, partials: this.openaiPartials, evt}, 'TaskGather:_onOpenAIPartialTranscript - words');
if (this.openaiPartials.length >= this.minBargeinWordCount) {
this.logger.debug({partials: this.openaiPartials}, 'killing audio due to speech (openai)');
this._killAudio(cs);
this.notifyStatus({event: 'speech-bargein-detected', words: this.openaiPartials});
}
}
}
async _onVendorError(cs, _ep, evt) {
super._onVendorError(cs, _ep, evt);
if (!(await this._startFallback(cs, _ep, evt))) {
@@ -1193,6 +1273,7 @@ class TaskGather extends SttTask {
}
}
else if (reason.startsWith('speech')) {
this.cs.emit('userSaid', evt.alternatives[0].transcript);
if (this.parentTask) this.parentTask.emit('transcription', evt);
else {
this.emit('transcription', evt);

View File

@@ -5,6 +5,30 @@ const { TaskPreconditions, CobaltTranscriptionEvents } = require('../utils/const
const { SpeechCredentialError } = require('../utils/error');
const {JAMBONES_AWS_TRANSCRIBE_USE_GRPC} = require('../config');
/**
* "Please insert turns here: {{turns:4}}"
// -> { processed: 'Please insert turns here: {{turns}}', turns: 4 }
processTurnString("Please insert turns here: {{turns}}"));
// -> { processed: 'Please insert turns here: {{turns}}', turns: null }
*/
const processTurnString = (input) => {
const regex = /\{\{turns(?::(\d+))?\}\}/;
const match = input.match(regex);
if (!match) {
return {
processed: input,
turns: null
};
}
const turns = match[1] ? parseInt(match[1], 10) : null;
const processed = input.replace(regex, '{{turns}}');
return { processed, turns };
};
class SttTask extends Task {
constructor(logger, data, parentTask) {
@@ -290,6 +314,57 @@ class SttTask extends Task {
});
}
formatOpenAIPrompt(cs, {prompt, hintsTemplate, conversationHistoryTemplate, hints}) {
let conversationHistoryPrompt, hintsPrompt;
/* generate conversation history from template */
if (conversationHistoryTemplate) {
const {processed, turns} = processTurnString(conversationHistoryTemplate);
this.logger.debug({processed, turns}, 'SttTask: processed conversation history template');
conversationHistoryPrompt = cs.getFormattedConversation(turns || 4);
//this.logger.debug({conversationHistoryPrompt}, 'SttTask: conversation history');
if (conversationHistoryPrompt) {
conversationHistoryPrompt = processed.replace('{{turns}}', `\n${conversationHistoryPrompt}\nuser: `);
}
}
/* generate hints from template */
if (hintsTemplate && Array.isArray(hints) && hints.length > 0) {
hintsPrompt = hintsTemplate.replace('{{hints}}', hints);
}
/* combine into final prompt */
let finalPrompt = prompt || '';
if (hintsPrompt) {
finalPrompt = `${finalPrompt}\n${hintsPrompt}`;
}
if (conversationHistoryPrompt) {
finalPrompt = `${finalPrompt}\n${conversationHistoryPrompt}`;
}
this.logger.debug({
finalPrompt,
hints,
hintsPrompt,
conversationHistoryTemplate,
conversationHistoryPrompt
}, 'SttTask: formatted OpenAI prompt');
return finalPrompt?.trimStart();
}
/* some STT engines will keep listening after a final response, so no need to restart */
doesVendorContinueListeningAfterFinalTranscript(vendor) {
return (vendor.startsWith('custom:') || [
'soniox',
'aws',
'microsoft',
'deepgram',
'google',
'speechmatics',
'openai',
].includes(vendor));
}
_onCompileContext(ep, key, evt) {
const {addKey} = this.cs.srf.locals.dbHelpers;
this.logger.debug({evt}, `received cobalt compile context event, will cache under ${key}`);

View File

@@ -14,6 +14,7 @@ const {
TranscribeStatus,
AssemblyAiTranscriptionEvents,
VoxistTranscriptionEvents,
OpenAITranscriptionEvents,
VerbioTranscriptionEvents,
SpeechmaticsTranscriptionEvents
} = require('../utils/constants.json');
@@ -330,6 +331,20 @@ class TaskTranscribe extends SttTask {
this._onSpeechmaticsError.bind(this, cs, ep));
break;
case 'openai':
this.bugname = `${this.bugname_prefix}openai_transcribe`;
this.addCustomEventListener(
ep, OpenAITranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep, channel));
this.addCustomEventListener(ep, OpenAITranscriptionEvents.Connect,
this._onVendorConnect.bind(this, cs, ep));
this.addCustomEventListener(ep, OpenAITranscriptionEvents.ConnectFailure,
this._onVendorConnectFailure.bind(this, cs, ep));
this.addCustomEventListener(ep, OpenAITranscriptionEvents.Error,
this._onOpenAIErrror.bind(this, cs, ep));
this.modelSupportsConversationTracking = opts.OPENAI_MODEL !== 'whisper-1';
break;
default:
if (this.vendor.startsWith('custom:')) {
this.bugname = `${this.bugname_prefix}${this.vendor}_transcribe`;
@@ -365,6 +380,25 @@ class TaskTranscribe extends SttTask {
async _transcribe(ep) {
this.logger.debug(
`TaskTranscribe:_transcribe - starting transcription vendor ${this.vendor} bugname ${this.bugname}`);
/* special feature for openai: we can provide a prompt that includes recent conversation history */
let prompt;
if (this.vendor === 'openai') {
if (this.modelSupportsConversationTracking) {
prompt = this.formatOpenAIPrompt(this.cs, {
prompt: this.data.recognizer?.openaiOptions?.prompt,
hintsTemplate: this.data.recognizer?.openaiOptions?.promptTemplates?.hintsTemplate,
// eslint-disable-next-line max-len
conversationHistoryTemplate: this.data.recognizer?.openaiOptions?.promptTemplates?.conversationHistoryTemplate,
hints: this.data.recognizer?.hints,
});
this.logger.debug({prompt}, 'Gather:_startTranscribing - created an openai prompt');
}
else if (this.data.recognizer?.hints?.length > 0) {
prompt = this.data.recognizer?.hints.join(', ');
}
}
await ep.startTranscription({
vendor: this.vendor,
interim: this.interim ? true : false,
@@ -456,8 +490,9 @@ class TaskTranscribe extends SttTask {
this._startAsrTimer(channel);
/* some STT engines will keep listening after a final response, so no need to restart */
if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google', 'speechmatics']
.includes(this.vendor)) this._startTranscribing(cs, ep, channel);
if (!this.doesVendorContinueListeningAfterFinalTranscript(this.vendor)) {
this._startTranscribing(cs, ep, channel);
}
}
else {
if (this.vendor === 'soniox') {
@@ -480,9 +515,7 @@ class TaskTranscribe extends SttTask {
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending final transcript');
this._resolve(channel, evt);
/* some STT engines will keep listening after a final response, so no need to restart */
if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google', 'speechmatics'].includes(this.vendor) &&
!this.vendor.startsWith('custom:')) {
if (!this.doesVendorContinueListeningAfterFinalTranscript(this.vendor)) {
this.logger.debug('TaskTranscribe:_onTranscription - restarting transcribe');
this._startTranscribing(cs, ep, channel);
}
@@ -733,6 +766,12 @@ class TaskTranscribe extends SttTask {
this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
}
async _onOpenAIErrror(cs, _ep, evt) {
// eslint-disable-next-line no-unused-vars
const {message, ...e} = evt;
this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
}
_startAsrTimer(channel) {
if (this.vendor === 'deepgram') return; // no need
assert(this.isContinuousAsr);

View File

@@ -42,6 +42,11 @@ class TtsTask extends Task {
}
}
}
const fullText = Array.isArray(this.text) ? this.text.join(' ') : this.text;
if (fullText.length > 0) {
cs.emit('botSaid', fullText);
}
}
getTtsVendorData(cs) {