mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 08:40:38 +00:00
Feat/tts streaming (#994)
* wip * add TtsStreamingBuffer class to abstract handling of streaming tokens * wip * add throttling support * support background ttsStream (#995) * wip * add TtsStreamingBuffer class to abstract handling of streaming tokens * wip * support background ttsStream * wip --------- Co-authored-by: Dave Horton <daveh@beachdognet.com> * wip * dont send if we have nothing to send * initial testing with cartesia * wip --------- Co-authored-by: Hoan Luu Huu <110280845+xquanluu@users.noreply.github.com>
This commit is contained in:
@@ -16,7 +16,8 @@ class TaskConfig extends Task {
|
||||
'fillerNoise',
|
||||
'actionHookDelayAction',
|
||||
'boostAudioSignal',
|
||||
'vad'
|
||||
'vad',
|
||||
'ttsStream'
|
||||
].forEach((k) => this[k] = this.data[k] || {});
|
||||
|
||||
if ('notifyEvents' in this.data) {
|
||||
@@ -45,6 +46,12 @@ class TaskConfig extends Task {
|
||||
};
|
||||
delete this.transcribeOpts.enable;
|
||||
}
|
||||
if (this.ttsStream.enable) {
|
||||
this.sayOpts = {
|
||||
verb: 'say',
|
||||
stream: true
|
||||
};
|
||||
}
|
||||
|
||||
if (this.data.reset) {
|
||||
if (typeof this.data.reset === 'string') this.data.reset = [this.data.reset];
|
||||
@@ -75,6 +82,7 @@ class TaskConfig extends Task {
|
||||
get hasVad() { return Object.keys(this.vad).length; }
|
||||
get hasFillerNoise() { return Object.keys(this.fillerNoise).length; }
|
||||
get hasReferHook() { return Object.keys(this.data).includes('referHook'); }
|
||||
get hasTtsStream() { return Object.keys(this.ttsStream).length; }
|
||||
|
||||
get summary() {
|
||||
const phrase = [];
|
||||
@@ -106,6 +114,9 @@ class TaskConfig extends Task {
|
||||
if (this.onHoldMusic) phrase.push(`onHoldMusic: ${this.onHoldMusic}`);
|
||||
if ('boostAudioSignal' in this.data) phrase.push(`setGain ${this.data.boostAudioSignal}`);
|
||||
if (this.hasReferHook) phrase.push('set referHook');
|
||||
if (this.hasTtsStream) {
|
||||
phrase.push(`${this.ttsStream.enable ? 'enable' : 'disable'} ttsStream`);
|
||||
}
|
||||
return `${this.name}{${phrase.join(',')}}`;
|
||||
}
|
||||
|
||||
@@ -305,6 +316,22 @@ class TaskConfig extends Task {
|
||||
if (this.hasReferHook) {
|
||||
cs.referHook = this.data.referHook;
|
||||
}
|
||||
|
||||
if (this.ttsStream.enable && this.sayOpts) {
|
||||
this.sayOpts.synthesizer = this.hasSynthesizer ? this.synthesizer : {
|
||||
vendor: cs.speechSynthesisVendor,
|
||||
language: cs.speechSynthesisLanguage,
|
||||
voice: cs.speechSynthesisVoice,
|
||||
...(cs.speechSynthesisLabel && {
|
||||
label: cs.speechSynthesisLabel
|
||||
})
|
||||
};
|
||||
this.logger.info({opts: this.gatherOpts}, 'Config: enabling ttsStream');
|
||||
cs.enableBackgroundTtsStream(this.sayOpts);
|
||||
} else if (!this.ttsStream.enable) {
|
||||
this.logger.info('Config: disabling ttsStream');
|
||||
cs.disableTtsStream();
|
||||
}
|
||||
}
|
||||
|
||||
async kill(cs) {
|
||||
|
||||
@@ -723,6 +723,7 @@ class TaskGather extends SttTask {
|
||||
this._fillerNoiseOn = false; // in a race, if we just started audio it may sneak through here
|
||||
this.ep.api('uuid_break', this.ep.uuid)
|
||||
.catch((err) => this.logger.info(err, 'Error killing audio'));
|
||||
cs.clearTtsStream();
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -1170,7 +1171,6 @@ class TaskGather extends SttTask {
|
||||
} catch (err) { /*already logged error*/ }
|
||||
|
||||
// Gather got response from hook, cancel actionHookDelay processing
|
||||
this.logger.debug('TaskGather:_resolve - checking ahd');
|
||||
if (this.cs.actionHookDelayProcessor) {
|
||||
if (returnedVerbs) {
|
||||
this.logger.debug('TaskGather:_resolve - got response from action hook, cancelling actionHookDelay');
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
const assert = require('assert');
|
||||
const TtsTask = require('./tts-task');
|
||||
const {TaskName, TaskPreconditions} = require('../utils/constants');
|
||||
const pollySSMLSplit = require('polly-ssml-split');
|
||||
@@ -35,24 +36,40 @@ class TaskSay extends TtsTask {
|
||||
super(logger, opts, parentTask);
|
||||
this.preconditions = TaskPreconditions.Endpoint;
|
||||
|
||||
this.text = (Array.isArray(this.data.text) ? this.data.text : [this.data.text])
|
||||
.map((t) => breakLengthyTextIfNeeded(this.logger, t))
|
||||
.flat();
|
||||
assert.ok((typeof this.data.text === 'string' || Array.isArray(this.data.text)) || this.data.stream === true,
|
||||
'Say: either text or stream:true is required');
|
||||
|
||||
this.loop = this.data.loop || 1;
|
||||
this.isHandledByPrimaryProvider = true;
|
||||
|
||||
if (this.data.stream === true) {
|
||||
this._isStreamingTts = true;
|
||||
this.closeOnStreamEmpty = this.data.closeOnStreamEmpty !== false;
|
||||
}
|
||||
else {
|
||||
this._isStreamingTts = false;
|
||||
this.text = (Array.isArray(this.data.text) ? this.data.text : [this.data.text])
|
||||
.map((t) => breakLengthyTextIfNeeded(this.logger, t))
|
||||
.flat();
|
||||
|
||||
this.loop = this.data.loop || 1;
|
||||
this.isHandledByPrimaryProvider = true;
|
||||
}
|
||||
}
|
||||
|
||||
get name() { return TaskName.Say; }
|
||||
|
||||
get summary() {
|
||||
for (let i = 0; i < this.text.length; i++) {
|
||||
if (this.text[i].startsWith('silence_stream')) continue;
|
||||
return `${this.name}{text=${this.text[i].slice(0, 15)}${this.text[i].length > 15 ? '...' : ''}}`;
|
||||
if (this.isStreamingTts) return `${this.name} streaming`;
|
||||
else {
|
||||
for (let i = 0; i < this.text.length; i++) {
|
||||
if (this.text[i].startsWith('silence_stream')) continue;
|
||||
return `${this.name}{text=${this.text[i].slice(0, 15)}${this.text[i].length > 15 ? '...' : ''}}`;
|
||||
}
|
||||
return `${this.name}{${this.text[0]}}`;
|
||||
}
|
||||
return `${this.name}{${this.text[0]}}`;
|
||||
}
|
||||
|
||||
get isStreamingTts() { return this._isStreamingTts; }
|
||||
|
||||
_validateURL(urlString) {
|
||||
try {
|
||||
new URL(urlString);
|
||||
@@ -63,14 +80,19 @@ class TaskSay extends TtsTask {
|
||||
}
|
||||
|
||||
async exec(cs, obj) {
|
||||
if (this.isStreamingTts && !cs.appIsUsingWebsockets) {
|
||||
throw new Error('Say: streaming say verb requires applications to use the websocket API');
|
||||
}
|
||||
|
||||
try {
|
||||
await this.handling(cs, obj);
|
||||
if (this.isStreamingTts) await this.handlingStreaming(cs, obj);
|
||||
else await this.handling(cs, obj);
|
||||
this.emit('playDone');
|
||||
} catch (error) {
|
||||
if (error instanceof SpeechCredentialError) {
|
||||
// if say failed due to speech credentials, alarm is writtern and error notification is sent
|
||||
// finished this say to move to next task.
|
||||
this.logger.info('Say failed due to SpeechCredentialError, finished!');
|
||||
this.logger.info({error}, 'Say failed due to SpeechCredentialError, finished!');
|
||||
this.emit('playDone');
|
||||
return;
|
||||
}
|
||||
@@ -78,6 +100,35 @@ class TaskSay extends TtsTask {
|
||||
}
|
||||
}
|
||||
|
||||
async handlingStreaming(cs, {ep}) {
|
||||
const {vendor, language, voice, label} = this.getTtsVendorData(cs);
|
||||
const credentials = cs.getSpeechCredentials(vendor, 'tts', label);
|
||||
if (!credentials) {
|
||||
throw new SpeechCredentialError(
|
||||
`No text-to-speech service credentials for ${vendor} with labels: ${label} have been configured`);
|
||||
}
|
||||
|
||||
try {
|
||||
|
||||
await this.setTtsStreamingChannelVars(vendor, language, voice, credentials, ep);
|
||||
|
||||
await cs.startTtsStream();
|
||||
|
||||
cs.requestor?.request('tts:streaming-event', '/streaming-event', {event_type: 'stream_open'})
|
||||
.catch((err) => this.logger.info({err}, 'TaskSay:handlingStreaming - Error sending'));
|
||||
} catch (err) {
|
||||
this.logger.info({err}, 'TaskSay:handlingStreaming - Error setting channel vars');
|
||||
cs.requestor?.request('tts:streaming-event', '/streaming-event', {event_type: 'stream_closed'})
|
||||
.catch((err) => this.logger.info({err}, 'TaskSay:handlingStreaming - Error sending'));
|
||||
|
||||
//TODO: send tts:streaming-event with error?
|
||||
this.notifyTaskDone();
|
||||
}
|
||||
|
||||
await this.awaitTaskDone();
|
||||
this.logger.info('TaskSay:handlingStreaming - done');
|
||||
}
|
||||
|
||||
async handling(cs, {ep}) {
|
||||
const {srf, accountSid:account_sid, callSid:target_sid} = cs;
|
||||
const {writeAlerts, AlertType} = srf.locals;
|
||||
@@ -96,7 +147,7 @@ class TaskSay extends TtsTask {
|
||||
let voice = this.synthesizer.voice && this.synthesizer.voice !== 'default' ?
|
||||
this.synthesizer.voice :
|
||||
cs.speechSynthesisVoice;
|
||||
let label = this.taskInlcudeSynthesizer ? this.synthesizer.label : cs.speechSynthesisLabel;
|
||||
let label = this.taskIncludeSynthesizer ? this.synthesizer.label : cs.speechSynthesisLabel;
|
||||
|
||||
const fallbackVendor = this.synthesizer.fallbackVendor && this.synthesizer.fallbackVendor !== 'default' ?
|
||||
this.synthesizer.fallbackVendor :
|
||||
@@ -107,7 +158,7 @@ class TaskSay extends TtsTask {
|
||||
const fallbackVoice = this.synthesizer.fallbackVoice && this.synthesizer.fallbackVoice !== 'default' ?
|
||||
this.synthesizer.fallbackVoice :
|
||||
cs.fallbackSpeechSynthesisVoice;
|
||||
const fallbackLabel = this.taskInlcudeSynthesizer ?
|
||||
const fallbackLabel = this.taskIncludeSynthesizer ?
|
||||
this.synthesizer.fallbackLabel : cs.fallbackSpeechSynthesisLabel;
|
||||
|
||||
if (cs.hasFallbackTts) {
|
||||
@@ -253,6 +304,7 @@ class TaskSay extends TtsTask {
|
||||
this._playResolve = null;
|
||||
}
|
||||
}
|
||||
this.notifyTaskDone();
|
||||
}
|
||||
|
||||
_addStreamingTtsAttributes(span, evt) {
|
||||
@@ -273,6 +325,13 @@ class TaskSay extends TtsTask {
|
||||
delete attrs['cache_filename']; //no value in adding this to the span
|
||||
span.setAttributes(attrs);
|
||||
}
|
||||
|
||||
notifyTtsStreamIsEmpty() {
|
||||
if (this.isStreamingTts && this.closeOnStreamEmpty) {
|
||||
this.logger.info('TaskSay:notifyTtsStreamIsEmpty - stream is empty, killing task');
|
||||
this.notifyTaskDone();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const spanMapping = {
|
||||
|
||||
@@ -13,11 +13,11 @@ class TtsTask extends Task {
|
||||
|
||||
this.earlyMedia = this.data.earlyMedia === true || (parentTask && parentTask.earlyMedia);
|
||||
/**
|
||||
* Task use taskInlcudeSynthesizer to identify
|
||||
* if taskInlcudeSynthesizer === true, use label from verb.synthesizer, even it's empty
|
||||
* if taskInlcudeSynthesizer === false, use label from application.synthesizer
|
||||
* Task use taskIncludeSynthesizer to identify
|
||||
* if taskIncludeSynthesizer === true, use label from verb.synthesizer, even it's empty
|
||||
* if taskIncludeSynthesizer === false, use label from application.synthesizer
|
||||
*/
|
||||
this.taskInlcudeSynthesizer = !!this.data.synthesizer;
|
||||
this.taskIncludeSynthesizer = !!this.data.synthesizer;
|
||||
this.synthesizer = this.data.synthesizer || {};
|
||||
this.disableTtsCache = this.data.disableTtsCache;
|
||||
this.options = this.synthesizer.options || {};
|
||||
@@ -44,6 +44,47 @@ class TtsTask extends Task {
|
||||
}
|
||||
}
|
||||
|
||||
getTtsVendorData(cs) {
|
||||
const vendor = this.synthesizer.vendor && this.synthesizer.vendor !== 'default' ?
|
||||
this.synthesizer.vendor :
|
||||
cs.speechSynthesisVendor;
|
||||
const language = this.synthesizer.language && this.synthesizer.language !== 'default' ?
|
||||
this.synthesizer.language :
|
||||
cs.speechSynthesisLanguage ;
|
||||
const voice = this.synthesizer.voice && this.synthesizer.voice !== 'default' ?
|
||||
this.synthesizer.voice :
|
||||
cs.speechSynthesisVoice;
|
||||
const label = this.taskIncludeSynthesizer ? this.synthesizer.label : cs.speechSynthesisLabel;
|
||||
return {vendor, language, voice, label};
|
||||
}
|
||||
|
||||
async setTtsStreamingChannelVars(vendor, language, voice, credentials, ep) {
|
||||
const {api_key, cartesia_model_id, cartesia_voice_id} = credentials;
|
||||
let obj;
|
||||
|
||||
switch (vendor) {
|
||||
case 'deepgram':
|
||||
obj = {
|
||||
DEEPGRAM_API_KEY: api_key,
|
||||
DEEPGRAM_TTS_STREAMING_MODEL: voice
|
||||
};
|
||||
break;
|
||||
case 'cartesia':
|
||||
obj = {
|
||||
CARTESIA_API_KEY: api_key,
|
||||
CARTESIA_TTS_STREAMING_MODEL_ID: cartesia_model_id,
|
||||
CARTESIA_TTS_STREAMING_VOICE_ID: cartesia_voice_id,
|
||||
CARTESIA_TTS_STREAMING_LANGUAGE: language || 'en'
|
||||
};
|
||||
break;
|
||||
default:
|
||||
throw new Error(`vendor ${vendor} is not supported for tts streaming yet`);
|
||||
}
|
||||
this.logger.info({vendor, credentials, obj}, 'setTtsStreamingChannelVars');
|
||||
|
||||
await ep.set(obj);
|
||||
}
|
||||
|
||||
async _synthesizeWithSpecificVendor(cs, ep, {vendor, language, voice, label, preCache = false}) {
|
||||
const {srf, accountSid:account_sid} = cs;
|
||||
const {writeAlerts, AlertType, stats} = srf.locals;
|
||||
|
||||
Reference in New Issue
Block a user