From 9e74760c39c0ba9cbd837a997506e9db1d535053 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Mon, 26 Feb 2024 13:33:22 +0700 Subject: [PATCH 1/9] support azure streaming --- lib/synth-audio.js | 52 +++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index 472d311..e525a97 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -388,10 +388,44 @@ const synthMicrosoft = async(logger, { language, voice, text, - filePath + filePath, + renderForCaching }) => { try { const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials; + // let clean up the text + let content = text; + if (use_custom_tts && !content.startsWith('${text}`; + } + + if (content.startsWith('')) { + /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */ + const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' '); + // eslint-disable-next-line max-len + content = `${words}`; + logger.info({content}, 'synthMicrosoft'); + } + + if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching) { + let params = ''; + params += `{api_key=${apiKey}`; + params += `,region=${region}`; + params += `,language=${language}`; + if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`; + if (process.env.JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${process.env.JAMBONES_HTTP_PROXY_IP}`; + if (process.env.JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${process.env.JAMBONES_HTTP_PROXY_PORT}`; + params += '}'; + return { + filePath: `say:${params}${content.replace(/\n/g, ' ')}`, + servedFromCache: false, + rtt: 0 + }; + } if (use_custom_tts && custom_tts_endpoint_url) { return await _synthOnPremMicrosoft(logger, { credentials, @@ -403,20 +437,12 @@ const synthMicrosoft = async(logger, { }); } const trimSilence = filePath.endsWith('.r8'); - let content = text; const speechConfig = SpeechConfig.fromSubscription(apiKey, region); speechConfig.speechSynthesisLanguage = language; speechConfig.speechSynthesisVoiceName = voice; if (use_custom_tts && custom_tts_endpoint) { speechConfig.endpointId = custom_tts_endpoint; } - if (use_custom_tts && !content.startsWith('${text}`; - } speechConfig.speechSynthesisOutputFormat = trimSilence ? SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm : SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3; @@ -428,14 +454,6 @@ const synthMicrosoft = async(logger, { } const synthesizer = new SpeechSynthesizer(speechConfig); - if (content.startsWith('')) { - /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */ - const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' '); - // eslint-disable-next-line max-len - content = `${words}`; - logger.info({content}, 'synthMicrosoft'); - } - return new Promise((resolve, reject) => { const speakAsync = content.startsWith(' Date: Mon, 26 Feb 2024 13:49:37 +0700 Subject: [PATCH 2/9] wip --- lib/synth-audio.js | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index e525a97..6fdd5f9 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -183,7 +183,15 @@ async function synthAudio(client, logger, stats, { account_sid, case 'azure': case 'microsoft': vendorLabel = 'microsoft'; - audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId, filePath}); + audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId, + filePath, renderForCaching}); + if (typeof audioBuffer === 'object' && audioBuffer.filePath) { + return audioBuffer; + } + else { + audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId, + filePath}); + } break; case 'nuance': model = model || 'enhanced'; From be8053db4f1c8e62d218e6ac38a8f6496180715c Mon Sep 17 00:00:00 2001 From: Quan HL Date: Mon, 26 Feb 2024 13:54:34 +0700 Subject: [PATCH 3/9] wip --- test/synth.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/synth.js b/test/synth.js index 7bd2b18..4988879 100644 --- a/test/synth.js +++ b/test/synth.js @@ -188,6 +188,7 @@ test('Azure speech synth tests', async(t) => { language: 'en-US', voice: 'en-US-ChristopherNeural', text: longText, + renderForCaching: true }); t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`); if (process.env.JAMBONES_HTTP_PROXY_IP && process.env.JAMBONES_HTTP_PROXY_PORT) { @@ -203,6 +204,7 @@ test('Azure speech synth tests', async(t) => { language: 'en-US', voice: 'en-US-ChristopherNeural', text: longText, + renderForCaching: true }); t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`); } catch (err) { From 3560a6d4d990176374d80a9353a858879f5b6c42 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Mon, 26 Feb 2024 14:01:53 +0700 Subject: [PATCH 4/9] wip --- lib/synth-audio.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index 6fdd5f9..7bd509f 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -190,7 +190,7 @@ async function synthAudio(client, logger, stats, { account_sid, } else { audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId, - filePath}); + filePath, renderForCaching}); } break; case 'nuance': From 31a54f595b0dd98f2e1b2ad7657708f5142312a5 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Mon, 26 Feb 2024 15:42:09 +0700 Subject: [PATCH 5/9] wip --- lib/synth-audio.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index 7bd509f..f21dfed 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -422,8 +422,8 @@ const synthMicrosoft = async(logger, { if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching) { let params = ''; params += `{api_key=${apiKey}`; - params += `,region=${region}`; params += `,language=${language}`; + if (region) params += `,region=${region}`; if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`; if (process.env.JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${process.env.JAMBONES_HTTP_PROXY_IP}`; if (process.env.JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${process.env.JAMBONES_HTTP_PROXY_PORT}`; From f06f96a6f0ab84b059371039cefea710e7147d34 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Sun, 10 Mar 2024 06:41:46 +0700 Subject: [PATCH 6/9] wip --- lib/synth-audio.js | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index 6409831..670f694 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -184,14 +184,8 @@ async function synthAudio(client, logger, stats, { account_sid, case 'microsoft': vendorLabel = 'microsoft'; audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId, - filePath, renderForCaching}); - if (typeof audioBuffer === 'object' && audioBuffer.filePath) { - return audioBuffer; - } - else { - audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId, - filePath, renderForCaching}); - } + filePath, renderForCaching, disableTtsStreaming}); + if (audioBuffer?.filePath) return audioBuffer; break; case 'nuance': model = model || 'enhanced'; @@ -390,7 +384,8 @@ const synthMicrosoft = async(logger, { voice, text, filePath, - renderForCaching + renderForCaching, + disableTtsStreaming }) => { try { const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials; @@ -412,7 +407,7 @@ const synthMicrosoft = async(logger, { logger.info({content}, 'synthMicrosoft'); } - if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching) { + if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { let params = ''; params += `{api_key=${apiKey}`; params += `,language=${language}`; From 4d58ca6daff30180bf7376e8ceef6d5864695682 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Sat, 30 Mar 2024 17:34:49 +0700 Subject: [PATCH 7/9] wip --- lib/synth-audio.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index ac239c7..b05836f 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -411,6 +411,8 @@ const synthMicrosoft = async(logger, { let params = ''; params += `{api_key=${apiKey}`; params += `,language=${language}`; + params += ',vendor=microsoft'; + params += `,voice=${voice}`; if (region) params += `,region=${region}`; if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`; if (process.env.JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${process.env.JAMBONES_HTTP_PROXY_IP}`; From 7f939b96d25c8addde46bc3c7b6d8822f8222165 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Sat, 30 Mar 2024 17:38:00 +0700 Subject: [PATCH 8/9] wip --- test/synth.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/synth.js b/test/synth.js index 06c01b7..c98e9da 100644 --- a/test/synth.js +++ b/test/synth.js @@ -204,7 +204,8 @@ test('Azure speech synth tests', async(t) => { language: 'en-US', voice: 'en-US-ChristopherNeural', text: longText, - renderForCaching: true + renderForCaching: true, + disableTtsStreaming: true }); t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`); } catch (err) { @@ -254,6 +255,8 @@ test('Azure SSML tests', async(t) => { language: 'en-US', voice: 'en-US-ChristopherNeural', text, + renderForCaching: true, + disableTtsStreaming: true }); t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`); } catch (err) { From 2701af102adaa1fdb3d7ee24fb6c6149fb11d279 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Sat, 30 Mar 2024 17:49:14 +0700 Subject: [PATCH 9/9] wip --- lib/synth-audio.js | 1 - test/synth.js | 9 +++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index b05836f..f7f4a96 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -406,7 +406,6 @@ const synthMicrosoft = async(logger, { content = `${words}`; logger.info({content}, 'synthMicrosoft'); } - if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { let params = ''; params += `{api_key=${apiKey}`; diff --git a/test/synth.js b/test/synth.js index c98e9da..44c4782 100644 --- a/test/synth.js +++ b/test/synth.js @@ -204,8 +204,7 @@ test('Azure speech synth tests', async(t) => { language: 'en-US', voice: 'en-US-ChristopherNeural', text: longText, - renderForCaching: true, - disableTtsStreaming: true + renderForCaching: true }); t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`); } catch (err) { @@ -240,6 +239,7 @@ test('Azure SSML tests', async(t) => { language: 'en-US', voice: 'en-US-ChristopherNeural', text, + renderForCaching: true }); t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`); if (process.env.JAMBONES_HTTP_PROXY_IP && process.env.JAMBONES_HTTP_PROXY_PORT) { @@ -255,8 +255,7 @@ test('Azure SSML tests', async(t) => { language: 'en-US', voice: 'en-US-ChristopherNeural', text, - renderForCaching: true, - disableTtsStreaming: true + renderForCaching: true }); t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`); } catch (err) { @@ -288,6 +287,7 @@ test('Azure custom voice speech synth tests', async(t) => { language: 'en-US', voice: process.env.MICROSOFT_CUSTOM_VOICE, text, + renderForCaching: true }); t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`); @@ -302,6 +302,7 @@ test('Azure custom voice speech synth tests', async(t) => { language: 'en-US', voice: process.env.MICROSOFT_CUSTOM_VOICE, text, + renderForCaching: true }); t.ok(opts.servedFromCache, `successfully retrieved microsoft custom voice audio from cache ${opts.filePath}`); } catch (err) {