Feat/tts streaming (#994)

* wip * add TtsStreamingBuffer class to abstract handling of streaming tokens * wip * add throttling support * support background ttsStream (#995) * wip * add TtsStreamingBuffer class to abstract handling of streaming tokens * wip * support background ttsStream * wip --------- Co-authored-by: Dave Horton <daveh@beachdognet.com> * wip * dont send if we have nothing to send * initial testing with cartesia * wip --------- Co-authored-by: Hoan Luu Huu <110280845+xquanluu@users.noreply.github.com>
2025-12-20 08:40:38 +00:00 · 2024-12-18 14:44:37 -05:00
parent f37e1540ee
commit ba3f46df64
11 changed files with 731 additions and 64 deletions
--- a/lib/tasks/tts-task.js
+++ b/lib/tasks/tts-task.js
@@ -13,11 +13,11 @@ class TtsTask extends Task {

    this.earlyMedia = this.data.earlyMedia === true || (parentTask && parentTask.earlyMedia);
    /**
-     * Task use taskInlcudeSynthesizer to identify
-     * if taskInlcudeSynthesizer === true, use label from verb.synthesizer, even it's empty
-     * if taskInlcudeSynthesizer === false, use label from application.synthesizer
+     * The task uses taskIncludeSynthesizer to decide where the label comes from:
+     * if taskIncludeSynthesizer === true, use the label from verb.synthesizer, even if it's empty
+     * if taskIncludeSynthesizer === false, use the label from application.synthesizer
     */
-    this.taskInlcudeSynthesizer = !!this.data.synthesizer;
+    this.taskIncludeSynthesizer = !!this.data.synthesizer;
    this.synthesizer = this.data.synthesizer || {};
    this.disableTtsCache = this.data.disableTtsCache;
    this.options = this.synthesizer.options || {};
@@ -44,6 +44,47 @@ class TtsTask extends Task {
    }
  }

+  /** Resolve vendor, language, voice and label for TTS from this.synthesizer, falling back to values on cs. */
+  getTtsVendorData(cs) {
+    const {synthesizer} = this;
+    // the cs property is read only when the synthesizer value is falsy or the literal 'default'
+    const pick = (own, csKey) => (own && own !== 'default' ? own : cs[csKey]);
+    return {
+      vendor: pick(synthesizer.vendor, 'speechSynthesisVendor'),
+      language: pick(synthesizer.language, 'speechSynthesisLanguage'),
+      voice: pick(synthesizer.voice, 'speechSynthesisVoice'),
+      // when the task carried its own synthesizer, its label is used as-is (even if undefined)
+      label: this.taskIncludeSynthesizer ? synthesizer.label : cs.speechSynthesisLabel
+    };
+  }
+
+  /**
+   * Build the vendor-specific TTS streaming variables and pass them to ep.set().
+   * Throws an Error for any vendor other than 'deepgram' or 'cartesia'.
+   */
+  async setTtsStreamingChannelVars(vendor, language, voice, credentials, ep) {
+    const {api_key, cartesia_model_id, cartesia_voice_id} = credentials;
+    let obj;
+    switch (vendor) {
+      case 'deepgram':
+        obj = {DEEPGRAM_API_KEY: api_key, DEEPGRAM_TTS_STREAMING_MODEL: voice};
+        break;
+      case 'cartesia':
+        obj = {
+          CARTESIA_API_KEY: api_key,
+          CARTESIA_TTS_STREAMING_MODEL_ID: cartesia_model_id,
+          CARTESIA_TTS_STREAMING_VOICE_ID: cartesia_voice_id,
+          CARTESIA_TTS_STREAMING_LANGUAGE: language || 'en'
+        };
+        break;
+      default:
+        throw new Error(`vendor ${vendor} is not supported for tts streaming yet`);
+    }
+    // log only non-secret fields: credentials and obj both carry the vendor API key
+    this.logger.info({vendor, language, voice}, 'setTtsStreamingChannelVars');
+    await ep.set(obj);
+  }
+
  async _synthesizeWithSpecificVendor(cs, ep, {vendor, language, voice, label, preCache = false}) {
    const {srf, accountSid:account_sid} = cs;
    const {writeAlerts, AlertType, stats} = srf.locals;