Support OpenAI Whisper instructions

2025-12-19 03:37:49 +00:00 · 2025-05-13 15:23:59 +07:00
parent 7dc3bbdb01
commit 9409405769
3 changed files with 77 additions and 149 deletions
--- a/lib/synth-audio.js
+++ b/lib/synth-audio.js
@@ -89,7 +89,7 @@ const trimTrailingSilence = (buffer) => {
 */
 async function synthAudio(client, createHash, retrieveHash, logger, stats, { account_sid,
  vendor, language, voice, gender, text, engine, salt, model, credentials, deploymentId,
-  disableTtsCache, renderForCaching = false, disableTtsStreaming, options
+  disableTtsCache, renderForCaching = false, disableTtsStreaming, options, instructions
 }) {
  let audioData;
  let servedFromCache = false;
@@ -242,7 +242,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
        break;
      case 'whisper':
        audioData = await synthWhisper(logger, {
-          credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
+          credentials, stats, voice, text, instructions, renderForCaching, disableTtsStreaming});
        break;
      case 'verbio':
        audioData = await synthVerbio(client, logger, {
@@ -1048,7 +1048,8 @@ const synthVerbio = async(client, logger, {credentials, stats, voice, text, rend
  }
 };

-const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => {
+const synthWhisper = async(logger, {credentials, stats, voice, text, instructions,
+  renderForCaching, disableTtsStreaming}) => {
  const {api_key, model_id, baseURL, timeout, speed} = credentials;
  /* if the env is set to stream then bag out, unless we are specifically rendering to generate a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
@@ -1059,6 +1060,7 @@ const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCa
    params += `,voice=${voice}`;
    params += ',write_cache_file=1';
    if (speed) params += `,speed=${speed}`;
+    if (instructions) params += `,instructions=${instructions}`;
    params += '}';

    return {
@@ -1078,6 +1080,7 @@ const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCa
      model: model_id,
      voice,
      input: text,
+      ...(instructions && {instructions}),
      response_format: 'mp3'
    });
    return {