Mirror of https://github.com/jambonz/jambonz-feature-server.git
initial changes for openai stt (#1127)
* initial changes for openai stt
* wip
* wip
* wip
* wip
* wip
* make minBargeinWordCount work for openai
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* openai stt: support for prompt templates
* lint
* wip
* support openai semantic_vad
* wip
* transcribe supports openai stt
* wip
* wip
* wip
* refactor list of stt vendors that don't need to be restarted after a final transcript
* cleanup
* wip
* cleanup
* wip
* wip
* wip
* remove credentials from log
* comment
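For orientation, here is a sketch of the kind of gather verb these changes enable. The recognizer property paths (vendor, hints, openaiOptions.prompt, openaiOptions.promptTemplates.*) are taken from the diff below; the model name, template strings, and other values are illustrative assumptions, not part of this commit.

  {
    verb: 'gather',
    input: ['speech'],
    actionHook: '/transcript',
    listenDuringPrompt: true,
    minBargeinWordCount: 2,        // > 1 enables OPENAI_WANT_PARTIALS below
    recognizer: {
      vendor: 'openai',
      model: 'gpt-4o-transcribe',  // 'whisper-1' would disable conversation tracking
      hints: ['jambonz', 'SIP'],
      openaiOptions: {
        prompt: 'Expect short answers about a phone order.',
        promptTemplates: {
          hintsTemplate: 'Likely words: {{hints}}',                        // placeholder syntax is an assumption
          conversationHistoryTemplate: 'Recent conversation:\n{{history}}' // placeholder syntax is an assumption
        }
      }
    }
  }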
@@ -12,6 +12,7 @@ const {
   JambonzTranscriptionEvents,
   AssemblyAiTranscriptionEvents,
   VoxistTranscriptionEvents,
+  OpenAITranscriptionEvents,
   VadDetection,
   VerbioTranscriptionEvents,
   SpeechmaticsTranscriptionEvents
@@ -83,6 +84,7 @@ class TaskGather extends SttTask {
     this._bufferedTranscripts = [];
     this.partialTranscriptsCount = 0;
     this.bugname_prefix = 'gather_';
   }

   get name() { return TaskName.Gather; }
@@ -239,6 +241,7 @@ class TaskGather extends SttTask {
       const {span, ctx} = this.startChildSpan(`nested:${this.sayTask.summary}`);
       const process = () => {
         this.logger.debug('Gather: nested say task completed');
+        this.playComplete = true;
         if (!this.listenDuringPrompt) {
           startDtmfListener();
         }
@@ -269,6 +272,7 @@ class TaskGather extends SttTask {
       const {span, ctx} = this.startChildSpan(`nested:${this.playTask.summary}`);
       const process = () => {
         this.logger.debug('Gather: nested play task completed');
+        this.playComplete = true;
         if (!this.listenDuringPrompt) {
           startDtmfListener();
         }
@@ -559,6 +563,31 @@ class TaskGather extends SttTask {
         break;

+      case 'openai':
+        this.bugname = `${this.bugname_prefix}openai_transcribe`;
+        this.addCustomEventListener(
+          ep, OpenAITranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
+        this.addCustomEventListener(
+          ep, OpenAITranscriptionEvents.SpeechStarted, this._onOpenAISpeechStarted.bind(this, cs, ep));
+        this.addCustomEventListener(
+          ep, OpenAITranscriptionEvents.SpeechStopped, this._onOpenAISpeechStopped.bind(this, cs, ep));
+        this.addCustomEventListener(ep, OpenAITranscriptionEvents.Connect,
+          this._onVendorConnect.bind(this, cs, ep));
+        this.addCustomEventListener(ep, OpenAITranscriptionEvents.ConnectFailure,
+          this._onVendorConnectFailure.bind(this, cs, ep));
+        this.addCustomEventListener(ep, OpenAITranscriptionEvents.Error,
+          this._onOpenAIError.bind(this, cs, ep));
+
+        /* openai delta transcripts are useful only for minBargeinWordCount eval */
+        if (this.minBargeinWordCount > 1) {
+          this.openaiPartials = [];
+          opts.OPENAI_WANT_PARTIALS = 1;
+          this.addCustomEventListener(
+            ep, OpenAITranscriptionEvents.PartialTranscript, this._onOpenAIPartialTranscript.bind(this, cs, ep));
+        }
+        this.modelSupportsConversationTracking = opts.OPENAI_MODEL !== 'whisper-1';
+        break;
+
       default:
         if (this.vendor.startsWith('custom:')) {
           this.bugname = `${this.bugname_prefix}${this.vendor}_transcribe`;
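The commit message also mentions openai semantic_vad; the channel-variable plumbing for it is not part of this diff, but at the API level an OpenAI realtime transcription session accepts a turn_detection setting, roughly as sketched below (based on OpenAI's realtime API, not on code in this repo).

  // Sketch of the session payload an OpenAI realtime transcription
  // connection accepts; `prompt` is the string assembled in
  // _startTranscribing below.
  const session = {
    input_audio_transcription: {
      model: opts.OPENAI_MODEL,             // e.g. 'gpt-4o-transcribe'
      prompt
    },
    turn_detection: {type: 'semantic_vad'}  // or 'server_vad'
  };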
@@ -590,6 +619,25 @@ class TaskGather extends SttTask {
       bugname: this.bugname
     }, 'Gather:_startTranscribing');

+    /* special feature for openai: we can provide a prompt that includes recent conversation history */
+    let prompt;
+    if (this.vendor === 'openai') {
+      if (this.modelSupportsConversationTracking) {
+        prompt = this.formatOpenAIPrompt(this.cs, {
+          prompt: this.data.recognizer?.openaiOptions?.prompt,
+          hintsTemplate: this.data.recognizer?.openaiOptions?.promptTemplates?.hintsTemplate,
+          // eslint-disable-next-line max-len
+          conversationHistoryTemplate: this.data.recognizer?.openaiOptions?.promptTemplates?.conversationHistoryTemplate,
+          hints: this.data.recognizer?.hints,
+        });
+        this.logger.debug({prompt}, 'Gather:_startTranscribing - created an openai prompt');
+      }
+      else if (this.data.recognizer?.hints?.length > 0) {
+        prompt = this.data.recognizer?.hints.join(', ');
+      }
+    }
+
     /**
      * Note: we don't need to ask deepgram for interim results, because they
      * already send us words as they are finalized (is_final=true) even before
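formatOpenAIPrompt is defined on the SttTask base class and its body is not part of this diff. A minimal sketch of what such a helper might do, assuming {{hints}}/{{history}} placeholders and a cs.conversationTurns array (both hypothetical names):

  // Hypothetical sketch, not the actual base-class implementation.
  formatOpenAIPrompt(cs, {prompt = '', hintsTemplate, conversationHistoryTemplate, hints = []}) {
    const parts = [prompt];
    if (hintsTemplate && hints.length > 0) {
      parts.push(hintsTemplate.replace('{{hints}}', hints.join(', ')));
    }
    if (conversationHistoryTemplate && cs.conversationTurns?.length > 0) {
      const history = cs.conversationTurns
        .map((turn) => `${turn.role}: ${turn.utterance}`)
        .join('\n');
      parts.push(conversationHistoryTemplate.replace('{{history}}', history));
    }
    return parts.filter(Boolean).join('\n');
  }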
@@ -601,6 +649,7 @@ class TaskGather extends SttTask {
       interim: this.interim,
       bugname: this.bugname,
       hostport: this.hostport,
+      prompt
     }).catch((err) => {
       const {writeAlerts, AlertType} = this.cs.srf.locals;
       this.logger.error(err, 'TaskGather:_startTranscribing error');
@@ -781,7 +830,11 @@ class TaskGather extends SttTask {
     const bugname = fsEvent.getHeader('media-bugname');
     const finished = fsEvent.getHeader('transcription-session-finished');
     this.logger.debug({evt, bugname, finished, vendor: this.vendor}, 'Gather:_onTranscription raw transcript');
-    if (bugname && this.bugname !== bugname) return;
+    if (bugname && this.bugname !== bugname) {
+      this.logger.debug(
+        `Gather:_onTranscription - ignoring transcript from ${bugname} because our bug is ${this.bugname}`);
+      return;
+    }
     if (finished === 'true') return;

     if (this.vendor === 'ibm' && evt?.state === 'listening') return;
@@ -1084,6 +1137,33 @@ class TaskGather extends SttTask {
     this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
   }

+  async _onOpenAIError(cs, _ep, evt) {
+    // eslint-disable-next-line no-unused-vars
+    const {message, ...e} = evt;
+    this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
+  }
+
+  async _onOpenAISpeechStarted(cs, _ep, evt) {
+    this.logger.debug({evt}, 'TaskGather:_onOpenAISpeechStarted');
+  }
+
+  async _onOpenAISpeechStopped(cs, _ep, evt) {
+    this.logger.debug({evt}, 'TaskGather:_onOpenAISpeechStopped');
+  }
+
+  async _onOpenAIPartialTranscript(cs, _ep, evt) {
+    if (!this.playComplete) {
+      const words = evt.delta.split(' ').filter((w) => /[A-Za-z0-9]/.test(w));
+      this.openaiPartials.push(...words);
+      this.logger.debug({words, partials: this.openaiPartials, evt}, 'TaskGather:_onOpenAIPartialTranscript - words');
+      if (this.openaiPartials.length >= this.minBargeinWordCount) {
+        this.logger.debug({partials: this.openaiPartials}, 'killing audio due to speech (openai)');
+        this._killAudio(cs);
+        this.notifyStatus({event: 'speech-bargein-detected', words: this.openaiPartials});
+      }
+    }
+  }
+
   async _onVendorError(cs, _ep, evt) {
     super._onVendorError(cs, _ep, evt);
     if (!(await this._startFallback(cs, _ep, evt))) {
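To make the barge-in arithmetic in _onOpenAIPartialTranscript concrete, a worked trace with illustrative deltas:

  // Assuming minBargeinWordCount = 3 and the caller speaking over the
  // prompt (playComplete still false):
  //   delta 'um'     -> openaiPartials = ['um']                1 < 3, prompt keeps playing
  //   delta 'I want' -> openaiPartials = ['um', 'I', 'want']   3 >= 3, _killAudio fires and
  //                     notifyStatus reports speech-bargein-detected with the collected words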
@@ -1193,6 +1273,7 @@ class TaskGather extends SttTask {
         }
       }
       else if (reason.startsWith('speech')) {
+        this.cs.emit('userSaid', evt.alternatives[0].transcript);
         if (this.parentTask) this.parentTask.emit('transcription', evt);
         else {
           this.emit('transcription', evt);
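The userSaid emission added in this last hunk is plausibly what feeds the conversation history that formatOpenAIPrompt interpolates; a sketch of such a listener, using the same hypothetical cs.conversationTurns name as above:

  // Hypothetical listener on the call session; not shown in this diff.
  cs.on('userSaid', (transcript) => {
    cs.conversationTurns = cs.conversationTurns ?? [];
    cs.conversationTurns.push({role: 'user', utterance: transcript});
  });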