Feature/precache audio (#609)

* wip * fix for establishing vendor etc * more fixes * avoid a pre-caching attempt if synth settings change
2025-12-20 16:50:39 +00:00 · 2024-01-13 12:51:25 -05:00
parent d3d494191f
commit 09a83e3a31
3 changed files with 50 additions and 11 deletions
--- a/lib/config.js
+++ b/lib/config.js
@@ -130,6 +130,8 @@ const JAMBONZ_RECORD_WS_PASSWORD = process.env.JAMBONZ_RECORD_WS_PASSWORD || pro
 const JAMBONZ_DISABLE_DIAL_PAI_HEADER = process.env.JAMBONZ_DISABLE_DIAL_PAI_HEADER || false;
 const JAMBONES_DISABLE_DIRECT_P2P_CALL = process.env.JAMBONES_DISABLE_DIRECT_P2P_CALL || false;

+const JAMBONES_EAGERLY_PRE_CACHE_AUDIO = process.env.JAMBONES_EAGERLY_PRE_CACHE_AUDIO; // any non-empty value enables
+
 module.exports = {
  JAMBONES_MYSQL_HOST,
  JAMBONES_MYSQL_USER,
@@ -152,6 +154,7 @@ module.exports = {
  JAMBONES_API_BASE_URL,
  JAMBONES_TIME_SERIES_HOST,
  JAMBONES_INJECT_CONTENT,
+  JAMBONES_EAGERLY_PRE_CACHE_AUDIO,
  JAMBONES_ESL_LISTEN_ADDRESS,
  JAMBONES_SBCS,
  JAMBONES_OTEL_ENABLED,
--- a/lib/session/call-session.js
+++ b/lib/session/call-session.js
@@ -19,6 +19,7 @@ const HttpRequestor = require('../utils/http-requestor');
 const WsRequestor = require('../utils/ws-requestor');
 const {
  JAMBONES_INJECT_CONTENT,
+  JAMBONES_EAGERLY_PRE_CACHE_AUDIO,
  AWS_REGION,
 } = require('../config');
 const BackgroundTaskManager = require('../utils/background-task-manager');
@@ -1330,6 +1331,35 @@ Duration=${duration} `
    this.taskIdx = 0;
  }

+  _preCacheAudio(newTasks) {
+    for (const task of newTasks) {
+      if (task.name === TaskName.Config && task.hasSynthesizer) {
+        /* if they change synthesizer settings don't try to precache */
+        break;
+      }
+      if (task.name === TaskName.Say) {
+        /* identify vendor language, voice, and label */
+        const vendor = task.synthesizer.vendor && task.synthesizer.vendor !== 'default' ?
+          task.synthesizer.vendor :
+          this.speechSynthesisVendor;
+        const language = task.synthesizer.language && task.synthesizer.language !== 'default' ?
+          task.synthesizer.language :
+          this.speechSynthesisLanguage ;
+        const voice =  task.synthesizer.voice && task.synthesizer.voice !== 'default' ?
+          task.synthesizer.voice :
+          this.speechSynthesisVoice;
+        const label = task.synthesizer.label && task.synthesizer.label !== 'default' ?
+          task.synthesizer.label :
+          this.speechSynthesisLabel;
+
+        this.logger.info({vendor, language, voice, label},
+          'CallSession:_preCacheAudio - precaching audio for future prompt');
+        task._synthesizeWithSpecificVendor(this, this.ep, {vendor, language, voice, label, preCache: true})
+          .catch((err) => this.logger.error(err, 'CallSession:_preCacheAudio - error precaching audio'));
+      }
+    }
+  }
+
  /**
   * Append tasks to the current execution stack UNLESS there is a gather in the stack.
   * in that case, insert the tasks before the gather AND if the tasks include
@@ -1387,10 +1417,12 @@ Duration=${duration} `
            this.replaceApplication(t);
          }
          else if (JAMBONES_INJECT_CONTENT) {
+            if (JAMBONES_EAGERLY_PRE_CACHE_AUDIO) this._preCacheAudio(t);
            this._injectTasks(t);
            this.logger.info({tasks: listTaskNames(this.tasks)}, 'CallSession:_onCommand - updated task list');
          }
          else {
+            if (JAMBONES_EAGERLY_PRE_CACHE_AUDIO) this._preCacheAudio(t);
            this.tasks.push(...t);
            this.logger.info({tasks: listTaskNames(this.tasks)}, 'CallSession:_onCommand - updated task list');
          }
--- a/lib/tasks/say.js
+++ b/lib/tasks/say.js
@@ -59,7 +59,7 @@ class TaskSay extends Task {
    }
  }

-  async _synthesizeWithSpecificVendor(cs, ep, {vendor, language, voice, label}) {
+  async _synthesizeWithSpecificVendor(cs, ep, {vendor, language, voice, label, preCache = false}) {
    const {srf} = cs;
    const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, srf);
    const {writeAlerts, AlertType, stats} = srf.locals;
@@ -97,7 +97,7 @@ class TaskSay extends Task {
      voice = this.options.voice_id || voice;
    }

-    this.logger.info({vendor, language, voice, model}, 'TaskSay:exec');
+    if (!preCache) this.logger.info({vendor, language, voice, model}, 'TaskSay:exec');
    try {
      if (!credentials) {
        writeAlerts({
@@ -120,11 +120,15 @@ class TaskSay extends Task {
        if (text.startsWith('silence_stream://')) return text;

        /* otel: trace time for tts */
-        const {span} = this.startChildSpan('tts-generation', {
-          'tts.vendor': vendor,
-          'tts.language': language,
-          'tts.voice': voice
-        });
+        let otelSpan;
+        if (!preCache)  {
+          const {span} = this.startChildSpan('tts-generation', {
+            'tts.vendor': vendor,
+            'tts.language': language,
+            'tts.voice': voice
+          });
+          otelSpan = span;
+        }
        try {
          const {filePath, servedFromCache, rtt} = await synthAudio(stats, {
            account_sid: cs.accountSid,
@@ -146,9 +150,9 @@ class TaskSay extends Task {
            updateSpeechCredentialLastUsed(credentials.speech_credential_sid)
              .catch(() => {/*already logged error */});
          }
-          span.setAttributes({'tts.cached': servedFromCache});
-          span.end();
-          if (!servedFromCache && rtt) {
+          if (otelSpan) otelSpan.setAttributes({'tts.cached': servedFromCache});
+          if (otelSpan) otelSpan.end();
+          if (!servedFromCache && rtt && !preCache) {
            this.notifyStatus({
              event: 'synthesized-audio',
              vendor,
@@ -160,7 +164,7 @@ class TaskSay extends Task {
          return filePath;
        } catch (err) {
          this.logger.info({err}, 'Error synthesizing tts');
-          span.end();
+          if (otelSpan) otelSpan.end();
          writeAlerts({
            account_sid: cs.accountSid,
            alert_type: AlertType.TTS_FAILURE,