Merge pull request #57 from jambonz/feat/whisper_tts_stream

support whisper streaming
Dave Horton authored 2024-02-20 20:33:21 -05:00, committed by GitHub
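
With this change, when the JAMBONES_DISABLE_TTS_STREAMING environment variable is unset and the caller is not rendering text specifically to populate the cache, synthWhisper no longer synthesizes an audio buffer; instead it returns a descriptor object whose filePath is a say: string carrying the TTS parameters inline. A rough illustration of that descriptor, with placeholder credential values (the field names come from the diff below; the values themselves are made up):

  // Illustrative only: api_key, model_id, voice and the text are placeholders
  {
    filePath: 'say:{api_key=sk-xxxx,model_id=tts-1,voice=alloy,write_cache_file=1,speed=1.0}Hello, how can I help you?',
    servedFromCache: false,
    rtt: 0
  }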


@@ -210,6 +210,10 @@ async function synthAudio(client, logger, stats, { account_sid,
         }
         break;
       case 'whisper':
-        audioBuffer = await synthWhisper(logger, {credentials, stats, voice, text});
+        audioBuffer = await synthWhisper(logger, {credentials, stats, voice, text, renderForCaching});
+        if (typeof audioBuffer === 'object' && audioBuffer.filePath) {
+          return audioBuffer;
+        }
         break;
       case 'deepgram':
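
Because of the early return above, synthAudio can now hand back either the usual synthesized/cached audio result or the streaming descriptor. A minimal sketch of how a downstream consumer might tell the two apart; the variable names and the startsWith check are assumptions for illustration, not code from this repository:

  // Hypothetical consumer-side check (not from the repo)
  const result = await synthAudio(client, logger, stats, opts);
  if (typeof result === 'object' && typeof result.filePath === 'string' &&
    result.filePath.startsWith('say:')) {
    // streaming descriptor: pass the say: string through to the media layer
  }
  else {
    // pre-existing behavior: a synthesized (or cached) audio file result
  }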
@@ -656,8 +660,24 @@ const synthElevenlabs = async(logger, {credentials, options, stats, language, vo
   }
 };
 
-const synthWhisper = async(logger, {credentials, stats, voice, text}) => {
-  const {api_key, model_id, baseURL, timeout} = credentials;
+const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCaching}) => {
+  const {api_key, model_id, baseURL, timeout, speed} = credentials;
+
+  /* if TTS streaming is enabled (not disabled via env), bail out and return a streaming descriptor, unless we are specifically rendering to generate a cache file */
+  if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching) {
+    let params = '';
+    params += `{api_key=${api_key}`;
+    params += `,model_id=${model_id}`;
+    params += `,voice=${voice}`;
+    params += ',write_cache_file=1';
+    if (speed) params += `,speed=${speed}`;
+    params += '}';
+    return {
+      filePath: `say:${params}${text.replace(/\n/g, ' ')}`,
+      servedFromCache: false,
+      rtt: 0
+    };
+  }
   try {
     const openai = new OpenAI.OpenAI({
       apiKey: api_key,
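
The hunk is truncated at the OpenAI client construction. For orientation only, here is a minimal sketch of how the non-streaming path could synthesize audio with these credentials using the published openai Node SDK; it is an assumption about the surrounding code, not the lines that actually follow in the file:

  // Sketch, not the repository's code: non-streaming synthesis via the openai SDK
  const OpenAI = require('openai');
  const openai = new OpenAI.OpenAI({
    apiKey: api_key,
    ...(baseURL && {baseURL}),   // optional override for OpenAI-compatible endpoints
    ...(timeout && {timeout})
  });
  const response = await openai.audio.speech.create({
    model: model_id || 'tts-1',
    voice,                       // e.g. 'alloy'
    input: text,
    ...(speed && {speed})
  });
  const audioBuffer = Buffer.from(await response.arrayBuffer());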