Feat/ambient sounds (#678)

* initial support for coaching mode in conference * wip * wip * add support for answer verb * wip * wip * wip * wip * wip * updates to rename option to dub * wip * wip * wip * update verb-specs * wip * wip * wip * wip * wip * wip * wip * wip * add option to boost audio signal in main channel * wip * wip * wip * wip * wip * wip * for now, bypass use of streaming apis when generating tts audio for dub tracks * add nested dub to dial * wip * add support for filler noise * kill filler noise when gather killed * wip * wip * while using sayOnTrack, we have to enclose the say command in double quotes * disableTtsStreaming = false * allow transcribe of b leg only on dial verb * dub.say can either be text or object like say verb with text and synthesizer * remove loop for sayOnTrack * update speech-utils * fixes for testing transcribe verb and support for dub and boostAudioSignal in lcc commands * add dial.boostAudioSignal * fix bug where session-level recognizer settings incorrectly overwrite verb-level settings * update verb specs * update dial to support array of dub verbs * fix bug setting gain * lint * wip * update speech-utils * use new endpoint methods for mod_dub --------- Co-authored-by: Dave Horton <daveh@beachdognet.com>
2026-01-25 02:07:56 +00:00 · 2024-03-24 03:23:57 +07:00
parent ec58232b61
commit 5b1d8a8ff3
18 changed files with 915 additions and 278 deletions
--- a/lib/tasks/tts-task.js
+++ b/lib/tasks/tts-task.js
@@ -0,0 +1,180 @@
+const Task = require('./task');
+const { TaskPreconditions } = require('../utils/constants');
+
+class TtsTask extends Task {
+
+  constructor(logger, data, parentTask) {
+    super(logger, data);
+    this.parentTask = parentTask;
+
+    this.preconditions = TaskPreconditions.Endpoint;
+
+    this.earlyMedia = this.data.earlyMedia === true || (parentTask && parentTask.earlyMedia);
+    this.synthesizer = this.data.synthesizer || {};
+    this.disableTtsCache = this.data.disableTtsCache;
+    this.options = this.synthesizer.options || {};
+  }
+
+  async exec(cs) {
+    super.exec(cs);
+  }
+
+  async _synthesizeWithSpecificVendor(cs, ep, {
+    vendor,
+    language,
+    voice,
+    label,
+    disableTtsStreaming,
+    preCache
+  }) {
+    const {srf, accountSid:account_sid} = cs;
+    const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, srf);
+    const {writeAlerts, AlertType, stats} = srf.locals;
+    const {synthAudio} = srf.locals.dbHelpers;
+    const engine = this.synthesizer.engine || 'standard';
+    const salt = cs.callSid;
+
+    let credentials = cs.getSpeechCredentials(vendor, 'tts', label);
+    /* parse Nuance voices into name and model */
+    let model;
+    if (vendor === 'nuance' && voice) {
+      const arr = /([A-Za-z-]*)\s+-\s+(enhanced|standard)/.exec(voice);
+      if (arr) {
+        voice = arr[1];
+        model = arr[2];
+      }
+    } else if (vendor === 'deepgram') {
+      model = voice;
+    }
+
+    /* allow for microsoft custom region voice and api_key to be specified as an override */
+    if (vendor === 'microsoft' && this.options.deploymentId) {
+      credentials = credentials || {};
+      credentials.use_custom_tts = true;
+      credentials.custom_tts_endpoint = this.options.deploymentId;
+      credentials.api_key = this.options.apiKey || credentials.apiKey;
+      credentials.region = this.options.region || credentials.region;
+      voice = this.options.voice || voice;
+    } else if (vendor === 'elevenlabs') {
+      credentials = credentials || {};
+      credentials.model_id = this.options.model_id || credentials.model_id;
+      credentials.voice_settings = this.options.voice_settings || {};
+      credentials.optimize_streaming_latency = this.options.optimize_streaming_latency
+      || credentials.optimize_streaming_latency;
+      voice = this.options.voice_id || voice;
+    }
+
+    ep.set({
+      tts_engine: vendor,
+      tts_voice: voice,
+      cache_speech_handles: 1,
+    }).catch((err) => this.logger.info({err}, `${this.name}: Error setting tts_engine on endpoint`));
+
+    if (!preCache) this.logger.info({vendor, language, voice, model}, `${this.name}:exec`);
+    try {
+      if (!credentials) {
+        writeAlerts({
+          account_sid,
+          alert_type: AlertType.TTS_NOT_PROVISIONED,
+          vendor
+        }).catch((err) => this.logger.info({err}, 'Error generating alert for no tts'));
+        this.notifyError({
+          msg: 'TTS error',
+          details:`No speech credentials provisioned for selected vendor ${vendor}`
+        });
+        throw new Error('no provisioned speech credentials for TTS');
+      }
+      // synthesize all of the text elements
+      let lastUpdated = false;
+
+      /* produce an audio segment from the provided text */
+      const generateAudio = async(text) => {
+        if (this.killed) return;
+        if (text.startsWith('silence_stream://')) return text;
+
+        /* otel: trace time for tts */
+        if (!preCache && !this.parentTask)  {
+          const {span} = this.startChildSpan('tts-generation', {
+            'tts.vendor': vendor,
+            'tts.language': language,
+            'tts.voice': voice
+          });
+          this.otelSpan = span;
+        }
+        try {
+          const {filePath, servedFromCache, rtt} = await synthAudio(stats, {
+            account_sid,
+            text,
+            vendor,
+            language,
+            voice,
+            engine,
+            model,
+            salt,
+            credentials,
+            options: this.options,
+            disableTtsCache : this.disableTtsCache,
+            disableTtsStreaming,
+            preCache
+          });
+          if (!filePath.startsWith('say:')) {
+            this.logger.debug(`file ${filePath}, served from cache ${servedFromCache}`);
+            if (filePath) cs.trackTmpFile(filePath);
+            if (this.otelSpan) {
+              this.otelSpan.setAttributes({'tts.cached': servedFromCache});
+              this.otelSpan.end();
+              this.otelSpan = null;
+            }
+            if (!servedFromCache && !lastUpdated) {
+              lastUpdated = true;
+              updateSpeechCredentialLastUsed(credentials.speech_credential_sid).catch(() => {/* logged error */});
+            }
+            if (!servedFromCache && rtt && !preCache) {
+              this.notifyStatus({
+                event: 'synthesized-audio',
+                vendor,
+                language,
+                characters: text.length,
+                elapsedTime: rtt
+              });
+            }
+          }
+          else {
+            this.logger.debug('a streaming tts api will be used');
+            const modifiedPath = filePath.replace('say:{', `say:{session-uuid=${ep.uuid},`);
+            return modifiedPath;
+          }
+          return filePath;
+        } catch (err) {
+          this.logger.info({err}, 'Error synthesizing tts');
+          if (this.otelSpan) this.otelSpan.end();
+          writeAlerts({
+            account_sid: cs.accountSid,
+            alert_type: AlertType.TTS_FAILURE,
+            vendor,
+            detail: err.message
+          }).catch((err) => this.logger.info({err}, 'Error generating alert for tts failure'));
+          this.notifyError({msg: 'TTS error', details: err.message || err});
+          throw err;
+        }
+      };
+
+      const arr = this.text.map((t) => (this._validateURL(t) ? t : generateAudio(t)));
+      return (await Promise.all(arr)).filter((fp) => fp && fp.length);
+    } catch (err) {
+      this.logger.info(err, 'TaskSay:exec error');
+      throw err;
+    }
+  }
+
+  _validateURL(urlString) {
+    try {
+      new URL(urlString);
+      return true;
+    } catch (e) {
+      return false;
+    }
+  }
+}
+
+module.exports = TtsTask;