diff --git a/lib/synth-audio.js b/lib/synth-audio.js
index a4330c4..f7f4a96 100644
--- a/lib/synth-audio.js
+++ b/lib/synth-audio.js
@@ -183,7 +183,9 @@ async function synthAudio(client, logger, stats, { account_sid,
case 'azure':
case 'microsoft':
vendorLabel = 'microsoft';
- audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId, filePath});
+ audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId,
+ filePath, renderForCaching, disableTtsStreaming});
+ if (audioBuffer?.filePath) return audioBuffer;
break;
case 'nuance':
model = model || 'enhanced';
@@ -381,10 +383,46 @@ const synthMicrosoft = async(logger, {
language,
voice,
text,
- filePath
+ filePath,
+ renderForCaching,
+ disableTtsStreaming
}) => {
try {
const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials;
+ // let's clean up the text
+ let content = text;
+ if (use_custom_tts && !content.startsWith('${text}`;
+ }
+
+ if (content.startsWith('')) {
+ /* microsoft enforces some properties and uses the voice xml element, so add them if the user did not */
+ const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
+ // eslint-disable-next-line max-len
+ content = `${words}`;
+ logger.info({content}, 'synthMicrosoft');
+ }
+ if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
+ let params = '';
+ params += `{api_key=${apiKey}`;
+ params += `,language=${language}`;
+ params += ',vendor=microsoft';
+ params += `,voice=${voice}`;
+ if (region) params += `,region=${region}`;
+ if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`;
+ if (process.env.JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${process.env.JAMBONES_HTTP_PROXY_IP}`;
+ if (process.env.JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${process.env.JAMBONES_HTTP_PROXY_PORT}`;
+ params += '}';
+ return {
+ filePath: `say:${params}${content.replace(/\n/g, ' ')}`,
+ servedFromCache: false,
+ rtt: 0
+ };
+ }
if (use_custom_tts && custom_tts_endpoint_url) {
return await _synthOnPremMicrosoft(logger, {
credentials,
@@ -396,20 +434,12 @@ const synthMicrosoft = async(logger, {
});
}
const trimSilence = filePath.endsWith('.r8');
- let content = text;
const speechConfig = SpeechConfig.fromSubscription(apiKey, region);
speechConfig.speechSynthesisLanguage = language;
speechConfig.speechSynthesisVoiceName = voice;
if (use_custom_tts && custom_tts_endpoint) {
speechConfig.endpointId = custom_tts_endpoint;
}
- if (use_custom_tts && !content.startsWith('${text}`;
- }
speechConfig.speechSynthesisOutputFormat = trimSilence ?
SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm :
SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;
@@ -421,14 +451,6 @@ const synthMicrosoft = async(logger, {
}
const synthesizer = new SpeechSynthesizer(speechConfig);
- if (content.startsWith('')) {
- /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */
- const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
- // eslint-disable-next-line max-len
- content = `${words}`;
- logger.info({content}, 'synthMicrosoft');
- }
-
return new Promise((resolve, reject) => {
const speakAsync = content.startsWith(' {
language: 'en-US',
voice: 'en-US-ChristopherNeural',
text: longText,
+ renderForCaching: true
});
t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`);
if (process.env.JAMBONES_HTTP_PROXY_IP && process.env.JAMBONES_HTTP_PROXY_PORT) {
@@ -203,6 +204,7 @@ test('Azure speech synth tests', async(t) => {
language: 'en-US',
voice: 'en-US-ChristopherNeural',
text: longText,
+ renderForCaching: true
});
t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`);
} catch (err) {
@@ -237,6 +239,7 @@ test('Azure SSML tests', async(t) => {
language: 'en-US',
voice: 'en-US-ChristopherNeural',
text,
+ renderForCaching: true
});
t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`);
if (process.env.JAMBONES_HTTP_PROXY_IP && process.env.JAMBONES_HTTP_PROXY_PORT) {
@@ -252,6 +255,7 @@ test('Azure SSML tests', async(t) => {
language: 'en-US',
voice: 'en-US-ChristopherNeural',
text,
+ renderForCaching: true
});
t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`);
} catch (err) {
@@ -283,6 +287,7 @@ test('Azure custom voice speech synth tests', async(t) => {
language: 'en-US',
voice: process.env.MICROSOFT_CUSTOM_VOICE,
text,
+ renderForCaching: true
});
t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`);
@@ -297,6 +302,7 @@ test('Azure custom voice speech synth tests', async(t) => {
language: 'en-US',
voice: process.env.MICROSOFT_CUSTOM_VOICE,
text,
+ renderForCaching: true
});
t.ok(opts.servedFromCache, `successfully retrieved microsoft custom voice audio from cache ${opts.filePath}`);
} catch (err) {