support azure streaming

commit 9e74760c39
parent ced1a0ef0d
Author: Quan HL
Date:   2024-02-26 13:33:22 +07:00


@@ -388,10 +388,44 @@ const synthMicrosoft = async(logger, {
   language,
   voice,
   text,
-  filePath
+  filePath,
+  renderForCaching
 }) => {
   try {
     const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials;
+    // let's clean up the text
+    let content = text;
+    if (use_custom_tts && !content.startsWith('<speak')) {
+      /**
+       * Note: it seems that custom voices require SSML with the voice attribute;
+       * otherwise, sending plain text yields a "Voice does not match" error
+       */
+      content = `<speak>${text}</speak>`;
+    }
+    if (content.startsWith('<speak>')) {
+      /* Microsoft enforces some attributes and the voice xml element, so supply them if the user did not */
+      const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
+      // eslint-disable-next-line max-len
+      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
+      logger.info({content}, 'synthMicrosoft');
+    }
+    if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching) {
+      let params = '';
+      params += `{api_key=${apiKey}`;
+      params += `,region=${region}`;
+      params += `,language=${language}`;
+      if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`;
+      if (process.env.JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${process.env.JAMBONES_HTTP_PROXY_IP}`;
+      if (process.env.JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${process.env.JAMBONES_HTTP_PROXY_PORT}`;
+      params += '}';
+      return {
+        filePath: `say:${params}${content.replace(/\n/g, ' ')}`,
+        servedFromCache: false,
+        rtt: 0
+      };
+    }
     if (use_custom_tts && custom_tts_endpoint_url) {
       return await _synthOnPremMicrosoft(logger, {
         credentials,
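
To make the new streaming branch concrete, here is a minimal sketch of the say: string it produces. The apiKey, region, voice and text values below are hypothetical placeholders; the {key=value,...} parameter block and SSML wrapping mirror the added code above.

// Minimal sketch of the say: payload built by the streaming branch above.
// All values here are hypothetical placeholders.
const apiKey = 'my-azure-key';
const region = 'eastus';
const language = 'en-US';
const voice = 'en-US-JennyNeural';
const words = 'Hello there';

let params = '';
params += `{api_key=${apiKey}`;
params += `,region=${region}`;
params += `,language=${language}`;
params += '}';

// eslint-disable-next-line max-len
const content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;

console.log(`say:${params}${content}`);
// prints: say:{api_key=my-azure-key,region=eastus,language=en-US}<speak ...><voice ...>Hello there</voice></speak>
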
@@ -403,20 +437,12 @@ const synthMicrosoft = async(logger, {
       });
     }
     const trimSilence = filePath.endsWith('.r8');
-    let content = text;
     const speechConfig = SpeechConfig.fromSubscription(apiKey, region);
     speechConfig.speechSynthesisLanguage = language;
     speechConfig.speechSynthesisVoiceName = voice;
     if (use_custom_tts && custom_tts_endpoint) {
       speechConfig.endpointId = custom_tts_endpoint;
     }
-    if (use_custom_tts && !content.startsWith('<speak')) {
-      /**
-       * Note: it seems that to use custom voice ssml is required with the voice attribute
-       * Otherwise sending plain text we get "Voice does not match"
-       */
-      content = `<speak>${text}</speak>`;
-    }
     speechConfig.speechSynthesisOutputFormat = trimSilence ?
       SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm :
       SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;
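
When streaming is disabled (or the audio is being rendered for the cache), the code falls through to the SDK-based path above. For reference, a self-contained sketch of that setup, assuming the microsoft-cognitiveservices-speech-sdk package that SpeechConfig and friends come from; key, region, language and voice are hypothetical:

// Sketch of the non-streaming fallback setup, assuming the
// microsoft-cognitiveservices-speech-sdk package; values are hypothetical.
const {
  SpeechConfig,
  SpeechSynthesisOutputFormat
} = require('microsoft-cognitiveservices-speech-sdk');

const speechConfig = SpeechConfig.fromSubscription('my-azure-key', 'eastus');
speechConfig.speechSynthesisLanguage = 'en-US';
speechConfig.speechSynthesisVoiceName = 'en-US-JennyNeural';

// An .r8 target means raw 8kHz PCM output (the trimSilence case above);
// anything else gets 16kHz 32kbit mp3.
const trimSilence = true;
speechConfig.speechSynthesisOutputFormat = trimSilence ?
  SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm :
  SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;
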
@@ -428,14 +454,6 @@ const synthMicrosoft = async(logger, {
     }
     const synthesizer = new SpeechSynthesizer(speechConfig);
-    if (content.startsWith('<speak>')) {
-      /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */
-      const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
-      // eslint-disable-next-line max-len
-      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
-      logger.info({content}, 'synthMicrosoft');
-    }
     return new Promise((resolve, reject) => {
       const speakAsync = content.startsWith('<speak') ?
         synthesizer.speakSsmlAsync.bind(synthesizer) :
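
The diff view is truncated above, mid-Promise. As a hedged sketch (not necessarily the author's exact code), this is how the speakSsmlAsync/speakTextAsync selection pattern is typically completed with the Speech SDK's callback API:

// Hedged sketch of completing the truncated Promise wrapper above;
// only the SDK calls are real, everything else is illustrative.
const {
  SpeechConfig,
  SpeechSynthesizer
} = require('microsoft-cognitiveservices-speech-sdk');

const synthesize = (speechConfig, content) => {
  const synthesizer = new SpeechSynthesizer(speechConfig);
  return new Promise((resolve, reject) => {
    // SSML payloads need speakSsmlAsync; plain text uses speakTextAsync
    const speakAsync = content.startsWith('<speak') ?
      synthesizer.speakSsmlAsync.bind(synthesizer) :
      synthesizer.speakTextAsync.bind(synthesizer);
    speakAsync(
      content,
      (result) => {
        synthesizer.close();
        resolve(result.audioData);  // ArrayBuffer of synthesized audio
      },
      (error) => {
        synthesizer.close();
        reject(error);
      });
  });
};

// usage (hypothetical key/region):
// synthesize(SpeechConfig.fromSubscription('my-azure-key', 'eastus'), 'Hello there')
//   .then((audio) => console.log(`got ${audio.byteLength} bytes`));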