Feat/tts streaming (#994)

* wip

* add TtsStreamingBuffer class to abstract handling of streaming tokens

* wip

* add throttling support

* support background ttsStream (#995)

* wip

* add TtsStreamingBuffer class to abstract handling of streaming tokens

* wip

* support background ttsStream

* wip

---------

Co-authored-by: Dave Horton <daveh@beachdognet.com>

* wip

* dont send if we have nothing to send

* initial testing with cartesia

* wip

---------

Co-authored-by: Hoan Luu Huu <110280845+xquanluu@users.noreply.github.com>
This commit is contained in:
Dave Horton
2024-12-18 14:44:37 -05:00
committed by GitHub
parent f37e1540ee
commit ba3f46df64
11 changed files with 731 additions and 64 deletions

View File

@@ -16,7 +16,8 @@ class TaskConfig extends Task {
'fillerNoise',
'actionHookDelayAction',
'boostAudioSignal',
'vad'
'vad',
'ttsStream'
].forEach((k) => this[k] = this.data[k] || {});
if ('notifyEvents' in this.data) {
@@ -45,6 +46,12 @@ class TaskConfig extends Task {
};
delete this.transcribeOpts.enable;
}
if (this.ttsStream.enable) {
this.sayOpts = {
verb: 'say',
stream: true
};
}
if (this.data.reset) {
if (typeof this.data.reset === 'string') this.data.reset = [this.data.reset];
@@ -75,6 +82,7 @@ class TaskConfig extends Task {
get hasVad() { return Object.keys(this.vad).length; }
get hasFillerNoise() { return Object.keys(this.fillerNoise).length; }
get hasReferHook() { return Object.keys(this.data).includes('referHook'); }
get hasTtsStream() { return Object.keys(this.ttsStream).length; }
get summary() {
const phrase = [];
@@ -106,6 +114,9 @@ class TaskConfig extends Task {
if (this.onHoldMusic) phrase.push(`onHoldMusic: ${this.onHoldMusic}`);
if ('boostAudioSignal' in this.data) phrase.push(`setGain ${this.data.boostAudioSignal}`);
if (this.hasReferHook) phrase.push('set referHook');
if (this.hasTtsStream) {
phrase.push(`${this.ttsStream.enable ? 'enable' : 'disable'} ttsStream`);
}
return `${this.name}{${phrase.join(',')}}`;
}
@@ -305,6 +316,22 @@ class TaskConfig extends Task {
if (this.hasReferHook) {
cs.referHook = this.data.referHook;
}
if (this.ttsStream.enable && this.sayOpts) {
this.sayOpts.synthesizer = this.hasSynthesizer ? this.synthesizer : {
vendor: cs.speechSynthesisVendor,
language: cs.speechSynthesisLanguage,
voice: cs.speechSynthesisVoice,
...(cs.speechSynthesisLabel && {
label: cs.speechSynthesisLabel
})
};
this.logger.info({opts: this.gatherOpts}, 'Config: enabling ttsStream');
cs.enableBackgroundTtsStream(this.sayOpts);
} else if (!this.ttsStream.enable) {
this.logger.info('Config: disabling ttsStream');
cs.disableTtsStream();
}
}
async kill(cs) {

View File

@@ -723,6 +723,7 @@ class TaskGather extends SttTask {
this._fillerNoiseOn = false; // in a race, if we just started audio it may sneak through here
this.ep.api('uuid_break', this.ep.uuid)
.catch((err) => this.logger.info(err, 'Error killing audio'));
cs.clearTtsStream();
}
return;
}
@@ -1170,7 +1171,6 @@ class TaskGather extends SttTask {
} catch (err) { /*already logged error*/ }
// Gather got response from hook, cancel actionHookDelay processing
this.logger.debug('TaskGather:_resolve - checking ahd');
if (this.cs.actionHookDelayProcessor) {
if (returnedVerbs) {
this.logger.debug('TaskGather:_resolve - got response from action hook, cancelling actionHookDelay');

View File

@@ -1,3 +1,4 @@
const assert = require('assert');
const TtsTask = require('./tts-task');
const {TaskName, TaskPreconditions} = require('../utils/constants');
const pollySSMLSplit = require('polly-ssml-split');
@@ -35,24 +36,40 @@ class TaskSay extends TtsTask {
super(logger, opts, parentTask);
this.preconditions = TaskPreconditions.Endpoint;
this.text = (Array.isArray(this.data.text) ? this.data.text : [this.data.text])
.map((t) => breakLengthyTextIfNeeded(this.logger, t))
.flat();
assert.ok((typeof this.data.text === 'string' || Array.isArray(this.data.text)) || this.data.stream === true,
'Say: either text or stream:true is required');
this.loop = this.data.loop || 1;
this.isHandledByPrimaryProvider = true;
if (this.data.stream === true) {
this._isStreamingTts = true;
this.closeOnStreamEmpty = this.data.closeOnStreamEmpty !== false;
}
else {
this._isStreamingTts = false;
this.text = (Array.isArray(this.data.text) ? this.data.text : [this.data.text])
.map((t) => breakLengthyTextIfNeeded(this.logger, t))
.flat();
this.loop = this.data.loop || 1;
this.isHandledByPrimaryProvider = true;
}
}
get name() { return TaskName.Say; }
get summary() {
for (let i = 0; i < this.text.length; i++) {
if (this.text[i].startsWith('silence_stream')) continue;
return `${this.name}{text=${this.text[i].slice(0, 15)}${this.text[i].length > 15 ? '...' : ''}}`;
if (this.isStreamingTts) return `${this.name} streaming`;
else {
for (let i = 0; i < this.text.length; i++) {
if (this.text[i].startsWith('silence_stream')) continue;
return `${this.name}{text=${this.text[i].slice(0, 15)}${this.text[i].length > 15 ? '...' : ''}}`;
}
return `${this.name}{${this.text[0]}}`;
}
return `${this.name}{${this.text[0]}}`;
}
get isStreamingTts() { return this._isStreamingTts; }
_validateURL(urlString) {
try {
new URL(urlString);
@@ -63,14 +80,19 @@ class TaskSay extends TtsTask {
}
async exec(cs, obj) {
if (this.isStreamingTts && !cs.appIsUsingWebsockets) {
throw new Error('Say: streaming say verb requires applications to use the websocket API');
}
try {
await this.handling(cs, obj);
if (this.isStreamingTts) await this.handlingStreaming(cs, obj);
else await this.handling(cs, obj);
this.emit('playDone');
} catch (error) {
if (error instanceof SpeechCredentialError) {
// if say failed due to speech credentials, alarm is writtern and error notification is sent
// finished this say to move to next task.
this.logger.info('Say failed due to SpeechCredentialError, finished!');
this.logger.info({error}, 'Say failed due to SpeechCredentialError, finished!');
this.emit('playDone');
return;
}
@@ -78,6 +100,35 @@ class TaskSay extends TtsTask {
}
}
async handlingStreaming(cs, {ep}) {
const {vendor, language, voice, label} = this.getTtsVendorData(cs);
const credentials = cs.getSpeechCredentials(vendor, 'tts', label);
if (!credentials) {
throw new SpeechCredentialError(
`No text-to-speech service credentials for ${vendor} with labels: ${label} have been configured`);
}
try {
await this.setTtsStreamingChannelVars(vendor, language, voice, credentials, ep);
await cs.startTtsStream();
cs.requestor?.request('tts:streaming-event', '/streaming-event', {event_type: 'stream_open'})
.catch((err) => this.logger.info({err}, 'TaskSay:handlingStreaming - Error sending'));
} catch (err) {
this.logger.info({err}, 'TaskSay:handlingStreaming - Error setting channel vars');
cs.requestor?.request('tts:streaming-event', '/streaming-event', {event_type: 'stream_closed'})
.catch((err) => this.logger.info({err}, 'TaskSay:handlingStreaming - Error sending'));
//TODO: send tts:streaming-event with error?
this.notifyTaskDone();
}
await this.awaitTaskDone();
this.logger.info('TaskSay:handlingStreaming - done');
}
async handling(cs, {ep}) {
const {srf, accountSid:account_sid, callSid:target_sid} = cs;
const {writeAlerts, AlertType} = srf.locals;
@@ -96,7 +147,7 @@ class TaskSay extends TtsTask {
let voice = this.synthesizer.voice && this.synthesizer.voice !== 'default' ?
this.synthesizer.voice :
cs.speechSynthesisVoice;
let label = this.taskInlcudeSynthesizer ? this.synthesizer.label : cs.speechSynthesisLabel;
let label = this.taskIncludeSynthesizer ? this.synthesizer.label : cs.speechSynthesisLabel;
const fallbackVendor = this.synthesizer.fallbackVendor && this.synthesizer.fallbackVendor !== 'default' ?
this.synthesizer.fallbackVendor :
@@ -107,7 +158,7 @@ class TaskSay extends TtsTask {
const fallbackVoice = this.synthesizer.fallbackVoice && this.synthesizer.fallbackVoice !== 'default' ?
this.synthesizer.fallbackVoice :
cs.fallbackSpeechSynthesisVoice;
const fallbackLabel = this.taskInlcudeSynthesizer ?
const fallbackLabel = this.taskIncludeSynthesizer ?
this.synthesizer.fallbackLabel : cs.fallbackSpeechSynthesisLabel;
if (cs.hasFallbackTts) {
@@ -253,6 +304,7 @@ class TaskSay extends TtsTask {
this._playResolve = null;
}
}
this.notifyTaskDone();
}
_addStreamingTtsAttributes(span, evt) {
@@ -273,6 +325,13 @@ class TaskSay extends TtsTask {
delete attrs['cache_filename']; //no value in adding this to the span
span.setAttributes(attrs);
}
notifyTtsStreamIsEmpty() {
if (this.isStreamingTts && this.closeOnStreamEmpty) {
this.logger.info('TaskSay:notifyTtsStreamIsEmpty - stream is empty, killing task');
this.notifyTaskDone();
}
}
}
const spanMapping = {

View File

@@ -13,11 +13,11 @@ class TtsTask extends Task {
this.earlyMedia = this.data.earlyMedia === true || (parentTask && parentTask.earlyMedia);
/**
* Task use taskInlcudeSynthesizer to identify
* if taskInlcudeSynthesizer === true, use label from verb.synthesizer, even it's empty
* if taskInlcudeSynthesizer === false, use label from application.synthesizer
* Task use taskIncludeSynthesizer to identify
* if taskIncludeSynthesizer === true, use label from verb.synthesizer, even it's empty
* if taskIncludeSynthesizer === false, use label from application.synthesizer
*/
this.taskInlcudeSynthesizer = !!this.data.synthesizer;
this.taskIncludeSynthesizer = !!this.data.synthesizer;
this.synthesizer = this.data.synthesizer || {};
this.disableTtsCache = this.data.disableTtsCache;
this.options = this.synthesizer.options || {};
@@ -44,6 +44,47 @@ class TtsTask extends Task {
}
}
getTtsVendorData(cs) {
const vendor = this.synthesizer.vendor && this.synthesizer.vendor !== 'default' ?
this.synthesizer.vendor :
cs.speechSynthesisVendor;
const language = this.synthesizer.language && this.synthesizer.language !== 'default' ?
this.synthesizer.language :
cs.speechSynthesisLanguage ;
const voice = this.synthesizer.voice && this.synthesizer.voice !== 'default' ?
this.synthesizer.voice :
cs.speechSynthesisVoice;
const label = this.taskIncludeSynthesizer ? this.synthesizer.label : cs.speechSynthesisLabel;
return {vendor, language, voice, label};
}
async setTtsStreamingChannelVars(vendor, language, voice, credentials, ep) {
const {api_key, cartesia_model_id, cartesia_voice_id} = credentials;
let obj;
switch (vendor) {
case 'deepgram':
obj = {
DEEPGRAM_API_KEY: api_key,
DEEPGRAM_TTS_STREAMING_MODEL: voice
};
break;
case 'cartesia':
obj = {
CARTESIA_API_KEY: api_key,
CARTESIA_TTS_STREAMING_MODEL_ID: cartesia_model_id,
CARTESIA_TTS_STREAMING_VOICE_ID: cartesia_voice_id,
CARTESIA_TTS_STREAMING_LANGUAGE: language || 'en'
};
break;
default:
throw new Error(`vendor ${vendor} is not supported for tts streaming yet`);
}
this.logger.info({vendor, credentials, obj}, 'setTtsStreamingChannelVars');
await ep.set(obj);
}
async _synthesizeWithSpecificVendor(cs, ep, {vendor, language, voice, label, preCache = false}) {
const {srf, accountSid:account_sid} = cs;
const {writeAlerts, AlertType, stats} = srf.locals;