Support the `instructions` parameter for OpenAI Whisper speech synthesis

This commit is contained in:
Quan HL
2025-05-13 15:23:59 +07:00
parent 7dc3bbdb01
commit 9409405769
3 changed files with 77 additions and 149 deletions

View File

@@ -89,7 +89,7 @@ const trimTrailingSilence = (buffer) => {
*/
async function synthAudio(client, createHash, retrieveHash, logger, stats, { account_sid,
vendor, language, voice, gender, text, engine, salt, model, credentials, deploymentId,
disableTtsCache, renderForCaching = false, disableTtsStreaming, options
disableTtsCache, renderForCaching = false, disableTtsStreaming, options, instructions
}) {
let audioData;
let servedFromCache = false;
@@ -242,7 +242,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
break;
case 'whisper':
audioData = await synthWhisper(logger, {
credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
credentials, stats, voice, text, instructions, renderForCaching, disableTtsStreaming});
break;
case 'verbio':
audioData = await synthVerbio(client, logger, {
@@ -1048,7 +1048,8 @@ const synthVerbio = async(client, logger, {credentials, stats, voice, text, rend
}
};
const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => {
const synthWhisper = async(logger, {credentials, stats, voice, text, instructions,
renderForCaching, disableTtsStreaming}) => {
const {api_key, model_id, baseURL, timeout, speed} = credentials;
/* if streaming is enabled (not disabled by env or by the caller) then bail out, unless we are rendering a cache file */
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
@@ -1059,6 +1060,7 @@ const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCa
params += `,voice=${voice}`;
params += ',write_cache_file=1';
if (speed) params += `,speed=${speed}`;
if (instructions) params += `,instructions=${instructions}`;
params += '}';
return {
@@ -1078,6 +1080,7 @@ const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCa
model: model_id,
voice,
input: text,
...(instructions && {instructions}),
response_format: 'mp3'
});
return {