diff --git a/lib/synth-audio.js b/lib/synth-audio.js index a4330c4..f7f4a96 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -183,7 +183,9 @@ async function synthAudio(client, logger, stats, { account_sid, case 'azure': case 'microsoft': vendorLabel = 'microsoft'; - audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId, filePath}); + audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId, + filePath, renderForCaching, disableTtsStreaming}); + if (audioBuffer?.filePath) return audioBuffer; break; case 'nuance': model = model || 'enhanced'; @@ -381,10 +383,46 @@ const synthMicrosoft = async(logger, { language, voice, text, - filePath + filePath, + renderForCaching, + disableTtsStreaming }) => { try { const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials; + // let clean up the text + let content = text; + if (use_custom_tts && !content.startsWith('${text}`; + } + + if (content.startsWith('')) { + /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */ + const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' '); + // eslint-disable-next-line max-len + content = `${words}`; + logger.info({content}, 'synthMicrosoft'); + } + if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { + let params = ''; + params += `{api_key=${apiKey}`; + params += `,language=${language}`; + params += ',vendor=microsoft'; + params += `,voice=${voice}`; + if (region) params += `,region=${region}`; + if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`; + if (process.env.JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${process.env.JAMBONES_HTTP_PROXY_IP}`; + if (process.env.JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${process.env.JAMBONES_HTTP_PROXY_PORT}`; + params += '}'; + return { + filePath: `say:${params}${content.replace(/\n/g, ' ')}`, + servedFromCache: false, + rtt: 0 + }; + } if (use_custom_tts && custom_tts_endpoint_url) { return await _synthOnPremMicrosoft(logger, { credentials, @@ -396,20 +434,12 @@ const synthMicrosoft = async(logger, { }); } const trimSilence = filePath.endsWith('.r8'); - let content = text; const speechConfig = SpeechConfig.fromSubscription(apiKey, region); speechConfig.speechSynthesisLanguage = language; speechConfig.speechSynthesisVoiceName = voice; if (use_custom_tts && custom_tts_endpoint) { speechConfig.endpointId = custom_tts_endpoint; } - if (use_custom_tts && !content.startsWith('${text}`; - } speechConfig.speechSynthesisOutputFormat = trimSilence ? SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm : SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3; @@ -421,14 +451,6 @@ const synthMicrosoft = async(logger, { } const synthesizer = new SpeechSynthesizer(speechConfig); - if (content.startsWith('')) { - /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */ - const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' '); - // eslint-disable-next-line max-len - content = `${words}`; - logger.info({content}, 'synthMicrosoft'); - } - return new Promise((resolve, reject) => { const speakAsync = content.startsWith(' { language: 'en-US', voice: 'en-US-ChristopherNeural', text: longText, + renderForCaching: true }); t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`); if (process.env.JAMBONES_HTTP_PROXY_IP && process.env.JAMBONES_HTTP_PROXY_PORT) { @@ -203,6 +204,7 @@ test('Azure speech synth tests', async(t) => { language: 'en-US', voice: 'en-US-ChristopherNeural', text: longText, + renderForCaching: true }); t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`); } catch (err) { @@ -237,6 +239,7 @@ test('Azure SSML tests', async(t) => { language: 'en-US', voice: 'en-US-ChristopherNeural', text, + renderForCaching: true }); t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`); if (process.env.JAMBONES_HTTP_PROXY_IP && process.env.JAMBONES_HTTP_PROXY_PORT) { @@ -252,6 +255,7 @@ test('Azure SSML tests', async(t) => { language: 'en-US', voice: 'en-US-ChristopherNeural', text, + renderForCaching: true }); t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`); } catch (err) { @@ -283,6 +287,7 @@ test('Azure custom voice speech synth tests', async(t) => { language: 'en-US', voice: process.env.MICROSOFT_CUSTOM_VOICE, text, + renderForCaching: true }); t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`); @@ -297,6 +302,7 @@ test('Azure custom voice speech synth tests', async(t) => { language: 'en-US', voice: process.env.MICROSOFT_CUSTOM_VOICE, text, + renderForCaching: true }); t.ok(opts.servedFromCache, `successfully retrieved microsoft custom voice audio from cache ${opts.filePath}`); } catch (err) {