mirror of
https://github.com/jambonz/speech-utils.git
synced 2026-07-04 19:31:49 +00:00
Merge pull request #59 from jambonz/feat/azure_tts
support azure streaming
This commit is contained in:
+40
-18
@@ -183,7 +183,9 @@ async function synthAudio(client, logger, stats, { account_sid,
|
||||
case 'azure':
|
||||
case 'microsoft':
|
||||
vendorLabel = 'microsoft';
|
||||
audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId, filePath});
|
||||
audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId,
|
||||
filePath, renderForCaching, disableTtsStreaming});
|
||||
if (audioBuffer?.filePath) return audioBuffer;
|
||||
break;
|
||||
case 'nuance':
|
||||
model = model || 'enhanced';
|
||||
@@ -381,10 +383,46 @@ const synthMicrosoft = async(logger, {
|
||||
language,
|
||||
voice,
|
||||
text,
|
||||
filePath
|
||||
filePath,
|
||||
renderForCaching,
|
||||
disableTtsStreaming
|
||||
}) => {
|
||||
try {
|
||||
const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials;
|
||||
// let's clean up the text
|
||||
let content = text;
|
||||
if (use_custom_tts && !content.startsWith('<speak')) {
|
||||
/**
|
||||
* Note: it seems that using a custom voice requires SSML with the voice attribute.
|
||||
* Otherwise, when sending plain text, we get "Voice does not match"
|
||||
*/
|
||||
content = `<speak>${text}</speak>`;
|
||||
}
|
||||
|
||||
if (content.startsWith('<speak>')) {
|
||||
/* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */
|
||||
const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
|
||||
// eslint-disable-next-line max-len
|
||||
content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
|
||||
logger.info({content}, 'synthMicrosoft');
|
||||
}
|
||||
if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
|
||||
let params = '';
|
||||
params += `{api_key=${apiKey}`;
|
||||
params += `,language=${language}`;
|
||||
params += ',vendor=microsoft';
|
||||
params += `,voice=${voice}`;
|
||||
if (region) params += `,region=${region}`;
|
||||
if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`;
|
||||
if (process.env.JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${process.env.JAMBONES_HTTP_PROXY_IP}`;
|
||||
if (process.env.JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${process.env.JAMBONES_HTTP_PROXY_PORT}`;
|
||||
params += '}';
|
||||
return {
|
||||
filePath: `say:${params}${content.replace(/\n/g, ' ')}`,
|
||||
servedFromCache: false,
|
||||
rtt: 0
|
||||
};
|
||||
}
|
||||
if (use_custom_tts && custom_tts_endpoint_url) {
|
||||
return await _synthOnPremMicrosoft(logger, {
|
||||
credentials,
|
||||
@@ -396,20 +434,12 @@ const synthMicrosoft = async(logger, {
|
||||
});
|
||||
}
|
||||
const trimSilence = filePath.endsWith('.r8');
|
||||
let content = text;
|
||||
const speechConfig = SpeechConfig.fromSubscription(apiKey, region);
|
||||
speechConfig.speechSynthesisLanguage = language;
|
||||
speechConfig.speechSynthesisVoiceName = voice;
|
||||
if (use_custom_tts && custom_tts_endpoint) {
|
||||
speechConfig.endpointId = custom_tts_endpoint;
|
||||
}
|
||||
if (use_custom_tts && !content.startsWith('<speak')) {
|
||||
/**
|
||||
* Note: it seems that to use custom voice ssml is required with the voice attribute
|
||||
* Otherwise sending plain text we get "Voice does not match"
|
||||
*/
|
||||
content = `<speak>${text}</speak>`;
|
||||
}
|
||||
speechConfig.speechSynthesisOutputFormat = trimSilence ?
|
||||
SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm :
|
||||
SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;
|
||||
@@ -421,14 +451,6 @@ const synthMicrosoft = async(logger, {
|
||||
}
|
||||
const synthesizer = new SpeechSynthesizer(speechConfig);
|
||||
|
||||
if (content.startsWith('<speak>')) {
|
||||
/* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */
|
||||
const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
|
||||
// eslint-disable-next-line max-len
|
||||
content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
|
||||
logger.info({content}, 'synthMicrosoft');
|
||||
}
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
const speakAsync = content.startsWith('<speak') ?
|
||||
synthesizer.speakSsmlAsync.bind(synthesizer) :
|
||||
|
||||
@@ -188,6 +188,7 @@ test('Azure speech synth tests', async(t) => {
|
||||
language: 'en-US',
|
||||
voice: 'en-US-ChristopherNeural',
|
||||
text: longText,
|
||||
renderForCaching: true
|
||||
});
|
||||
t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`);
|
||||
if (process.env.JAMBONES_HTTP_PROXY_IP && process.env.JAMBONES_HTTP_PROXY_PORT) {
|
||||
@@ -203,6 +204,7 @@ test('Azure speech synth tests', async(t) => {
|
||||
language: 'en-US',
|
||||
voice: 'en-US-ChristopherNeural',
|
||||
text: longText,
|
||||
renderForCaching: true
|
||||
});
|
||||
t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`);
|
||||
} catch (err) {
|
||||
@@ -237,6 +239,7 @@ test('Azure SSML tests', async(t) => {
|
||||
language: 'en-US',
|
||||
voice: 'en-US-ChristopherNeural',
|
||||
text,
|
||||
renderForCaching: true
|
||||
});
|
||||
t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`);
|
||||
if (process.env.JAMBONES_HTTP_PROXY_IP && process.env.JAMBONES_HTTP_PROXY_PORT) {
|
||||
@@ -252,6 +255,7 @@ test('Azure SSML tests', async(t) => {
|
||||
language: 'en-US',
|
||||
voice: 'en-US-ChristopherNeural',
|
||||
text,
|
||||
renderForCaching: true
|
||||
});
|
||||
t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`);
|
||||
} catch (err) {
|
||||
@@ -283,6 +287,7 @@ test('Azure custom voice speech synth tests', async(t) => {
|
||||
language: 'en-US',
|
||||
voice: process.env.MICROSOFT_CUSTOM_VOICE,
|
||||
text,
|
||||
renderForCaching: true
|
||||
});
|
||||
t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`);
|
||||
|
||||
@@ -297,6 +302,7 @@ test('Azure custom voice speech synth tests', async(t) => {
|
||||
language: 'en-US',
|
||||
voice: process.env.MICROSOFT_CUSTOM_VOICE,
|
||||
text,
|
||||
renderForCaching: true
|
||||
});
|
||||
t.ok(opts.servedFromCache, `successfully retrieved microsoft custom voice audio from cache ${opts.filePath}`);
|
||||
} catch (err) {
|
||||
|
||||
Reference in New Issue
Block a user