Feat/tts streaming (#994)

* wip * add TtsStreamingBuffer class to abstract handling of streaming tokens * wip * add throttling support * support background ttsStream (#995) * wip * add TtsStreamingBuffer class to abstract handling of streaming tokens * wip * support background ttsStream * wip --------- Co-authored-by: Dave Horton <daveh@beachdognet.com> * wip * dont send if we have nothing to send * initial testing with cartesia * wip --------- Co-authored-by: Hoan Luu Huu <110280845+xquanluu@users.noreply.github.com>
2025-12-20 08:40:38 +00:00 · 2024-12-18 14:44:37 -05:00
parent f37e1540ee
commit ba3f46df64
11 changed files with 731 additions and 64 deletions
--- a/lib/tasks/config.js
+++ b/lib/tasks/config.js
@@ -16,7 +16,8 @@ class TaskConfig extends Task {
      'fillerNoise',
      'actionHookDelayAction',
      'boostAudioSignal',
-      'vad'
+      'vad',
+      'ttsStream'
    ].forEach((k) => this[k] = this.data[k] || {});

    if ('notifyEvents' in this.data) {
@@ -45,6 +46,12 @@ class TaskConfig extends Task {
      };
      delete this.transcribeOpts.enable;
    }
+    if (this.ttsStream.enable) {
+      this.sayOpts = {
+        verb: 'say',
+        stream: true
+      };
+    }

    if (this.data.reset) {
      if (typeof this.data.reset === 'string') this.data.reset = [this.data.reset];
@@ -75,6 +82,7 @@ class TaskConfig extends Task {
  get hasVad() { return Object.keys(this.vad).length; }
  get hasFillerNoise() { return Object.keys(this.fillerNoise).length; }
  get hasReferHook() { return Object.keys(this.data).includes('referHook'); }
+  get hasTtsStream() { return Object.keys(this.ttsStream).length; }  // truthy when ttsStream has any keys

  get summary() {
    const phrase = [];
@@ -106,6 +114,9 @@ class TaskConfig extends Task {
    if (this.onHoldMusic) phrase.push(`onHoldMusic: ${this.onHoldMusic}`);
    if ('boostAudioSignal' in this.data) phrase.push(`setGain ${this.data.boostAudioSignal}`);
    if (this.hasReferHook) phrase.push('set referHook');
+    if (this.hasTtsStream) {
+      phrase.push(`${this.ttsStream.enable ? 'enable' : 'disable'} ttsStream`);
+    }
    return `${this.name}{${phrase.join(',')}}`;
  }

@@ -305,6 +316,22 @@ class TaskConfig extends Task {
    if (this.hasReferHook) {
      cs.referHook = this.data.referHook;
    }
+
+    // Apply ttsStream only when this config verb supplied it; verbs without it leave the current state untouched.
+    if (this.ttsStream.enable && this.sayOpts) {
+      // prefer the synthesizer given on this verb, otherwise use the values currently set on cs
+      this.sayOpts.synthesizer = this.hasSynthesizer ? this.synthesizer : {
+        vendor: cs.speechSynthesisVendor,
+        language: cs.speechSynthesisLanguage,
+        voice: cs.speechSynthesisVoice,
+        ...(cs.speechSynthesisLabel && {label: cs.speechSynthesisLabel})
+      };
+      this.logger.info({opts: this.sayOpts}, 'Config: enabling ttsStream');
+      cs.enableBackgroundTtsStream(this.sayOpts);
+    } else if (this.hasTtsStream && !this.ttsStream.enable) {
+      this.logger.info('Config: disabling ttsStream');
+      cs.disableTtsStream();
+    }
  }

  async kill(cs) {
--- a/lib/tasks/gather.js
+++ b/lib/tasks/gather.js
@@ -723,6 +723,7 @@ class TaskGather extends SttTask {
        this._fillerNoiseOn = false;  // in a race, if we just started audio it may sneak through here
        this.ep.api('uuid_break', this.ep.uuid)
          .catch((err) => this.logger.info(err, 'Error killing audio'));
+        cs.clearTtsStream();
      }
      return;
    }
@@ -1170,7 +1171,6 @@ class TaskGather extends SttTask {
    } catch (err) {  /*already logged error*/ }

    // Gather got response from hook, cancel actionHookDelay processing
-    this.logger.debug('TaskGather:_resolve - checking ahd');
    if (this.cs.actionHookDelayProcessor) {
      if (returnedVerbs) {
        this.logger.debug('TaskGather:_resolve - got response from action hook, cancelling actionHookDelay');
--- a/lib/tasks/say.js
+++ b/lib/tasks/say.js
@@ -1,3 +1,4 @@
+const assert = require('assert');
 const TtsTask = require('./tts-task');
 const {TaskName, TaskPreconditions} = require('../utils/constants');
 const pollySSMLSplit = require('polly-ssml-split');
@@ -35,24 +36,40 @@ class TaskSay extends TtsTask {
    super(logger, opts, parentTask);
    this.preconditions = TaskPreconditions.Endpoint;

-    this.text = (Array.isArray(this.data.text) ? this.data.text : [this.data.text])
-      .map((t) => breakLengthyTextIfNeeded(this.logger, t))
-      .flat();
+    assert.ok((typeof this.data.text === 'string' || Array.isArray(this.data.text)) || this.data.stream === true,
+      'Say: either text or stream:true is required');

-    this.loop = this.data.loop || 1;
-    this.isHandledByPrimaryProvider = true;
+
+    if (this.data.stream === true) {
+      this._isStreamingTts = true;
+      this.closeOnStreamEmpty = this.data.closeOnStreamEmpty !== false;
+    }
+    else {
+      this._isStreamingTts = false;
+      this.text = (Array.isArray(this.data.text) ? this.data.text : [this.data.text])
+        .map((t) => breakLengthyTextIfNeeded(this.logger, t))
+        .flat();
+
+      this.loop = this.data.loop || 1;
+      this.isHandledByPrimaryProvider = true;
+    }
  }

  get name() { return TaskName.Say; }

  get summary() {
-    for (let i = 0; i < this.text.length; i++) {
-      if (this.text[i].startsWith('silence_stream')) continue;
-      return `${this.name}{text=${this.text[i].slice(0, 15)}${this.text[i].length > 15 ? '...' : ''}}`;
+    if (this.isStreamingTts) return `${this.name} streaming`;
+    else {
+      for (let i = 0; i < this.text.length; i++) {
+        if (this.text[i].startsWith('silence_stream')) continue;
+        return `${this.name}{text=${this.text[i].slice(0, 15)}${this.text[i].length > 15 ? '...' : ''}}`;
+      }
+      return `${this.name}{${this.text[0]}}`;
    }
-    return `${this.name}{${this.text[0]}}`;
  }

+  get isStreamingTts() { return this._isStreamingTts; }  // true only when the verb was given stream: true
+
  _validateURL(urlString) {
    try {
      new URL(urlString);
@@ -63,14 +80,19 @@ class TaskSay extends TtsTask {
  }

  async exec(cs, obj) {
+    if (this.isStreamingTts && !cs.appIsUsingWebsockets) {
+      throw new Error('Say: streaming say verb requires applications to use the websocket API');
+    }
+
    try {
-      await this.handling(cs, obj);
+      if (this.isStreamingTts) await this.handlingStreaming(cs, obj);
+      else await this.handling(cs, obj);
      this.emit('playDone');
    } catch (error) {
      if (error instanceof SpeechCredentialError) {
        // if say failed due to speech credentials, alarm is writtern and error notification is sent
        // finished this say to move to next task.
-        this.logger.info('Say failed due to SpeechCredentialError, finished!');
+        this.logger.info({error}, 'Say failed due to SpeechCredentialError, finished!');
        this.emit('playDone');
        return;
      }
@@ -78,6 +100,35 @@ class TaskSay extends TtsTask {
    }
  }

+  // Start a TTS stream for this say: look up credentials, set channel vars, start the stream, wait for done.
+  async handlingStreaming(cs, {ep}) {
+    const {vendor, language, voice, label} = this.getTtsVendorData(cs);
+    const credentials = cs.getSpeechCredentials(vendor, 'tts', label);
+    if (!credentials) {
+      throw new SpeechCredentialError(
+        `No text-to-speech service credentials for ${vendor} with labels: ${label} have been configured`);
+    }
+    try {
+      // either step may throw; both failures are handled the same way in the catch below
+      await this.setTtsStreamingChannelVars(vendor, language, voice, credentials, ep);
+      await cs.startTtsStream();
+
+      cs.requestor?.request('tts:streaming-event', '/streaming-event', {event_type: 'stream_open'})
+        .catch((err) => this.logger.info({err}, 'TaskSay:handlingStreaming - Error sending'));
+    } catch (err) {
+      this.logger.info({err}, 'TaskSay:handlingStreaming - Error starting tts stream');
+      cs.requestor?.request('tts:streaming-event', '/streaming-event', {event_type: 'stream_closed'})
+        .catch((sendErr) => this.logger.info({err: sendErr}, 'TaskSay:handlingStreaming - Error sending'));
+
+      //TODO: send tts:streaming-event with error?
+      this.notifyTaskDone();
+    }
+
+    // presumably resolves once notifyTaskDone() has been called (above on failure) - confirm in Task
+    await this.awaitTaskDone();
+    this.logger.info('TaskSay:handlingStreaming - done');
+  }
+
  async handling(cs, {ep}) {
    const {srf, accountSid:account_sid, callSid:target_sid} = cs;
    const {writeAlerts, AlertType} = srf.locals;
@@ -96,7 +147,7 @@ class TaskSay extends TtsTask {
    let voice =  this.synthesizer.voice && this.synthesizer.voice !== 'default' ?
      this.synthesizer.voice :
      cs.speechSynthesisVoice;
-    let label = this.taskInlcudeSynthesizer ? this.synthesizer.label : cs.speechSynthesisLabel;
+    let label = this.taskIncludeSynthesizer ? this.synthesizer.label : cs.speechSynthesisLabel;

    const fallbackVendor = this.synthesizer.fallbackVendor && this.synthesizer.fallbackVendor !== 'default' ?
      this.synthesizer.fallbackVendor :
@@ -107,7 +158,7 @@ class TaskSay extends TtsTask {
    const fallbackVoice =  this.synthesizer.fallbackVoice && this.synthesizer.fallbackVoice !== 'default' ?
      this.synthesizer.fallbackVoice :
      cs.fallbackSpeechSynthesisVoice;
-    const fallbackLabel = this.taskInlcudeSynthesizer ?
+    const fallbackLabel = this.taskIncludeSynthesizer ?
      this.synthesizer.fallbackLabel : cs.fallbackSpeechSynthesisLabel;

    if (cs.hasFallbackTts) {
@@ -253,6 +304,7 @@ class TaskSay extends TtsTask {
        this._playResolve = null;
      }
    }
+    this.notifyTaskDone();
  }

  _addStreamingTtsAttributes(span, evt) {
@@ -273,6 +325,13 @@ class TaskSay extends TtsTask {
    delete attrs['cache_filename']; //no value in adding this to the span
    span.setAttributes(attrs);
  }
+
+  notifyTtsStreamIsEmpty() {
+    // ignored unless this is a streaming say that was not asked to stay open (closeOnStreamEmpty: false)
+    if (!this.isStreamingTts || !this.closeOnStreamEmpty) return;
+    this.logger.info('TaskSay:notifyTtsStreamIsEmpty - stream is empty, killing task');
+    this.notifyTaskDone();
+  }
 }

 const spanMapping = {
--- a/lib/tasks/tts-task.js
+++ b/lib/tasks/tts-task.js
@@ -13,11 +13,11 @@ class TtsTask extends Task {

    this.earlyMedia = this.data.earlyMedia === true || (parentTask && parentTask.earlyMedia);
    /**
-     * Task use taskInlcudeSynthesizer to identify
-     * if taskInlcudeSynthesizer === true, use label from verb.synthesizer, even it's empty
-     * if taskInlcudeSynthesizer === false, use label from application.synthesizer
+     * Task use taskIncludeSynthesizer to identify
+     * if taskIncludeSynthesizer === true, use label from verb.synthesizer, even it's empty
+     * if taskIncludeSynthesizer === false, use label from application.synthesizer
     */
-    this.taskInlcudeSynthesizer = !!this.data.synthesizer;
+    this.taskIncludeSynthesizer = !!this.data.synthesizer;
    this.synthesizer = this.data.synthesizer || {};
    this.disableTtsCache = this.data.disableTtsCache;
    this.options = this.synthesizer.options || {};
@@ -44,6 +44,47 @@ class TtsTask extends Task {
    }
  }

+  // Resolve vendor, language, voice and label for this task, preferring the verb's own synthesizer settings.
+  getTtsVendorData(cs) {
+    const {vendor, language, voice, label} = this.synthesizer;
+    // a value that is unset or the literal 'default' defers to the corresponding setting on cs
+    const isOwn = (value) => Boolean(value) && value !== 'default';
+    return {
+      vendor: isOwn(vendor) ? vendor : cs.speechSynthesisVendor,
+      language: isOwn(language) ? language : cs.speechSynthesisLanguage,
+      voice: isOwn(voice) ? voice : cs.speechSynthesisVoice,
+      // when the verb carries a synthesizer its label is used as-is, even when empty
+      label: this.taskIncludeSynthesizer ? label : cs.speechSynthesisLabel
+    };
+  }
+
+  // Set the vendor-specific channel variables (api key, model, voice) that tts streaming needs, via ep.set().
+  async setTtsStreamingChannelVars(vendor, language, voice, credentials, ep) {
+    const {api_key, cartesia_model_id, cartesia_voice_id} = credentials;
+    let obj;
+    switch (vendor) {
+      case 'deepgram':
+        obj = {
+          DEEPGRAM_API_KEY: api_key,
+          DEEPGRAM_TTS_STREAMING_MODEL: voice
+        };
+        break;
+      case 'cartesia':
+        obj = {
+          CARTESIA_API_KEY: api_key,
+          CARTESIA_TTS_STREAMING_MODEL_ID: cartesia_model_id,
+          CARTESIA_TTS_STREAMING_VOICE_ID: cartesia_voice_id,
+          CARTESIA_TTS_STREAMING_LANGUAGE: language || 'en'
+        };
+        break;
+      default:
+        throw new Error(`vendor ${vendor} is not supported for tts streaming yet`);
+    }
+    // log the variable names only: the values include the vendor api key and must not reach the logs
+    this.logger.info({vendor, vars: Object.keys(obj)}, 'setTtsStreamingChannelVars');
+    await ep.set(obj);
+  }
+
  async _synthesizeWithSpecificVendor(cs, ep, {vendor, language, voice, label, preCache = false}) {
    const {srf, accountSid:account_sid} = cs;
    const {writeAlerts, AlertType, stats} = srf.locals;