mirror of
https://github.com/jambonz/speech-utils.git
synced 2026-01-25 02:08:26 +00:00
wip
This commit is contained in:
@@ -205,7 +205,8 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
|||||||
switch (vendor) {
|
switch (vendor) {
|
||||||
case 'google':
|
case 'google':
|
||||||
audioData = await synthGoogle(logger, {
|
audioData = await synthGoogle(logger, {
|
||||||
credentials, stats, language, voice, gender, key, text, model, options, instructions
|
credentials, stats, language, voice, gender, key, text, model, options, instructions,
|
||||||
|
renderForCaching, disableTtsStreaming, disableTtsCache
|
||||||
});
|
});
|
||||||
break;
|
break;
|
||||||
case 'aws':
|
case 'aws':
|
||||||
@@ -413,12 +414,49 @@ const synthPolly = async(createHash, retrieveHash, logger,
|
|||||||
|
|
||||||
|
|
||||||
const synthGoogle = async(logger, {
|
const synthGoogle = async(logger, {
|
||||||
credentials, stats, language, voice, gender, text, model, options, instructions
|
credentials, stats, language, voice, gender, key, text, model, options, instructions,
|
||||||
|
renderForCaching, disableTtsStreaming, disableTtsCache
|
||||||
}) => {
|
}) => {
|
||||||
const client = new ttsGoogle.TextToSpeechClient(credentials);
|
|
||||||
|
|
||||||
const isGemini = !!model;
|
const isGemini = !!model;
|
||||||
const isVoiceCloning = typeof voice === 'object' && voice.voice_cloning_key;
|
const isVoiceCloning = typeof voice === 'object' && voice.voice_cloning_key;
|
||||||
|
// HD voices have pattern like en-US-Chirp3-HD-Charon
|
||||||
|
const isHDVoice = typeof voice === 'string' && voice.includes('-HD-');
|
||||||
|
// Live API is used for Gemini TTS and HD voices
|
||||||
|
const useLiveApi = isGemini || isHDVoice;
|
||||||
|
|
||||||
|
// Streaming support for Google TTS (Gemini, HD voices, and standard voices)
|
||||||
|
// Voice cloning does not support streaming
|
||||||
|
if (!isVoiceCloning && !JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
|
||||||
|
// Strip SSML tags for Gemini TTS (it doesn't support SSML)
|
||||||
|
let inputText = text;
|
||||||
|
if (isGemini && text.startsWith('<speak>')) {
|
||||||
|
inputText = text.replace(/<[^>]*>/g, '').trim();
|
||||||
|
logger.info('synthGoogle: Gemini TTS does not support SSML, stripped tags from input');
|
||||||
|
}
|
||||||
|
|
||||||
|
let params = '{';
|
||||||
|
params += `credentials=${JSON.stringify(credentials)}`;
|
||||||
|
params += `,playback_id=${key}`;
|
||||||
|
params += ',vendor=google';
|
||||||
|
params += `,voice=${voice}`;
|
||||||
|
params += `,language_code=${language || 'en-US'}`;
|
||||||
|
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||||
|
params += `,use_live_api=${useLiveApi ? 1 : 0}`;
|
||||||
|
if (model) params += `,model_name=${model}`;
|
||||||
|
if (gender) params += `,gender=${gender}`;
|
||||||
|
// Commas separate parameters in the FreeSWITCH TTS module, so commas in the prompt are replaced with semicolons
|
||||||
|
const prompt = options?.prompt || instructions;
|
||||||
|
if (prompt) params += `,prompt=${prompt.replace(/\n/g, ' ').replace(/,/g, ';')}`;
|
||||||
|
params += '}';
|
||||||
|
|
||||||
|
return {
|
||||||
|
filePath: `say:${params}${(isGemini ? inputText : text).replace(/\n/g, ' ')}`,
|
||||||
|
servedFromCache: false,
|
||||||
|
rtt: 0
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
const client = new ttsGoogle.TextToSpeechClient(credentials);
|
||||||
|
|
||||||
// Build input based on voice type
|
// Build input based on voice type
|
||||||
let input;
|
let input;
|
||||||
|
|||||||
220
test/synth.js
220
test/synth.js
@@ -256,6 +256,226 @@ test('Google Gemini TTS synth tests', async(t) => {
|
|||||||
client.quit();
|
client.quit();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('Google TTS streaming tests (!JAMBONES_DISABLE_TTS_STREAMING)', async(t) => {
|
||||||
|
// Ensure streaming is enabled (default behavior)
|
||||||
|
delete process.env.JAMBONES_DISABLE_TTS_STREAMING;
|
||||||
|
|
||||||
|
// Clear require cache to reload config with new env var
|
||||||
|
delete require.cache[require.resolve('../lib/config')];
|
||||||
|
delete require.cache[require.resolve('../lib/synth-audio')];
|
||||||
|
delete require.cache[require.resolve('..')];
|
||||||
|
|
||||||
|
const fn = require('..');
|
||||||
|
const {synthAudio, client} = fn(opts, logger);
|
||||||
|
|
||||||
|
if (!process.env.GCP_FILE && !process.env.GCP_JSON_KEY) {
|
||||||
|
t.pass('skipping Google TTS streaming tests since neither GCP_FILE nor GCP_JSON_KEY provided');
|
||||||
|
return t.end();
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const str = process.env.GCP_JSON_KEY || fs.readFileSync(process.env.GCP_FILE);
|
||||||
|
const creds = JSON.parse(str);
|
||||||
|
const geminiModel = process.env.GCP_GEMINI_TTS_MODEL || 'gemini-2.5-flash-tts';
|
||||||
|
|
||||||
|
// Test 1: Standard voice streaming (use_live_api=0)
|
||||||
|
let result = await synthAudio(stats, {
|
||||||
|
vendor: 'google',
|
||||||
|
credentials: {
|
||||||
|
credentials: {
|
||||||
|
client_email: creds.client_email,
|
||||||
|
private_key: creds.private_key,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
language: 'en-US',
|
||||||
|
voice: 'en-US-Wavenet-D',
|
||||||
|
gender: 'MALE',
|
||||||
|
text: 'This is a test of standard voice streaming.',
|
||||||
|
disableTtsCache: true
|
||||||
|
});
|
||||||
|
t.ok(result.filePath.startsWith('say:'), 'Standard voice returns streaming say: path');
|
||||||
|
t.ok(result.filePath.includes('vendor=google'), 'Standard voice streaming path contains vendor=google');
|
||||||
|
t.ok(result.filePath.includes('use_live_api=0'), 'Standard voice uses use_live_api=0');
|
||||||
|
t.ok(result.filePath.includes('voice=en-US-Wavenet-D'), 'Standard voice streaming path contains voice');
|
||||||
|
|
||||||
|
// Test 2: HD voice streaming (use_live_api=1)
|
||||||
|
result = await synthAudio(stats, {
|
||||||
|
vendor: 'google',
|
||||||
|
credentials: {
|
||||||
|
credentials: {
|
||||||
|
client_email: creds.client_email,
|
||||||
|
private_key: creds.private_key,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
language: 'en-US',
|
||||||
|
voice: 'en-US-Chirp3-HD-Charon',
|
||||||
|
text: 'This is a test of HD voice streaming.',
|
||||||
|
disableTtsCache: true
|
||||||
|
});
|
||||||
|
t.ok(result.filePath.startsWith('say:'), 'HD voice returns streaming say: path');
|
||||||
|
t.ok(result.filePath.includes('vendor=google'), 'HD voice streaming path contains vendor=google');
|
||||||
|
t.ok(result.filePath.includes('use_live_api=1'), 'HD voice uses use_live_api=1 (Live API)');
|
||||||
|
t.ok(result.filePath.includes('voice=en-US-Chirp3-HD-Charon'), 'HD voice streaming path contains voice');
|
||||||
|
|
||||||
|
// Test 3: Gemini TTS streaming (use_live_api=1)
|
||||||
|
result = await synthAudio(stats, {
|
||||||
|
vendor: 'google',
|
||||||
|
credentials: {
|
||||||
|
credentials: {
|
||||||
|
client_email: creds.client_email,
|
||||||
|
private_key: creds.private_key,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
language: 'en-US',
|
||||||
|
voice: 'Kore',
|
||||||
|
model: geminiModel,
|
||||||
|
text: 'This is a test of Gemini TTS streaming.',
|
||||||
|
instructions: 'Speak naturally.',
|
||||||
|
disableTtsCache: true
|
||||||
|
});
|
||||||
|
t.ok(result.filePath.startsWith('say:'), 'Gemini TTS returns streaming say: path');
|
||||||
|
t.ok(result.filePath.includes('vendor=google'), 'Gemini TTS streaming path contains vendor=google');
|
||||||
|
t.ok(result.filePath.includes('use_live_api=1'), 'Gemini TTS uses use_live_api=1 (Live API)');
|
||||||
|
t.ok(result.filePath.includes(`model_name=${geminiModel}`), 'Gemini TTS streaming path contains model_name');
|
||||||
|
t.ok(result.filePath.includes('prompt=Speak naturally.'), 'Gemini TTS streaming path contains prompt');
|
||||||
|
|
||||||
|
// Test 4: Gemini TTS with SSML stripping in streaming mode
|
||||||
|
result = await synthAudio(stats, {
|
||||||
|
vendor: 'google',
|
||||||
|
credentials: {
|
||||||
|
credentials: {
|
||||||
|
client_email: creds.client_email,
|
||||||
|
private_key: creds.private_key,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
language: 'en-US',
|
||||||
|
voice: 'Leda',
|
||||||
|
model: geminiModel,
|
||||||
|
text: '<speak>This SSML should be stripped.</speak>',
|
||||||
|
instructions: 'Speak naturally.',
|
||||||
|
disableTtsCache: true
|
||||||
|
});
|
||||||
|
t.ok(result.filePath.startsWith('say:'), 'Gemini TTS with SSML returns streaming say: path');
|
||||||
|
t.ok(!result.filePath.includes('<speak>'), 'SSML tags are stripped from streaming path');
|
||||||
|
t.ok(result.filePath.includes('This SSML should be stripped.'), 'Text content is preserved after SSML stripping');
|
||||||
|
|
||||||
|
// Test 5: Gemini TTS with prompt containing special characters
|
||||||
|
result = await synthAudio(stats, {
|
||||||
|
vendor: 'google',
|
||||||
|
credentials: {
|
||||||
|
credentials: {
|
||||||
|
client_email: creds.client_email,
|
||||||
|
private_key: creds.private_key,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
language: 'en-US',
|
||||||
|
voice: 'Kore',
|
||||||
|
model: geminiModel,
|
||||||
|
text: 'Testing special characters in prompt.',
|
||||||
|
options: { prompt: 'Speak in a warm, friendly tone' },
|
||||||
|
disableTtsCache: true
|
||||||
|
});
|
||||||
|
t.ok(result.filePath.startsWith('say:'), 'Gemini TTS with special chars returns streaming say: path');
|
||||||
|
// Commas in prompt should be replaced with semicolons
|
||||||
|
t.ok(result.filePath.includes('prompt=Speak in a warm; friendly tone'), 'Commas in prompt are escaped to semicolons');
|
||||||
|
|
||||||
|
} catch (err) {
|
||||||
|
console.error(err);
|
||||||
|
t.end(err);
|
||||||
|
}
|
||||||
|
client.quit();
|
||||||
|
});
|
||||||
|
|
||||||
|
test('Google TTS non-streaming tests (JAMBONES_DISABLE_TTS_STREAMING=true)', async(t) => {
|
||||||
|
// Set the environment variable that disables TTS streaming
|
||||||
|
process.env.JAMBONES_DISABLE_TTS_STREAMING = 'true';
|
||||||
|
|
||||||
|
// Clear require cache to reload config with new env var
|
||||||
|
delete require.cache[require.resolve('../lib/config')];
|
||||||
|
delete require.cache[require.resolve('../lib/synth-audio')];
|
||||||
|
delete require.cache[require.resolve('..')];
|
||||||
|
|
||||||
|
const fn = require('..');
|
||||||
|
const {synthAudio, client} = fn(opts, logger);
|
||||||
|
|
||||||
|
if (!process.env.GCP_FILE && !process.env.GCP_JSON_KEY) {
|
||||||
|
t.pass('skipping Google TTS non-streaming tests since neither GCP_FILE nor GCP_JSON_KEY provided');
|
||||||
|
delete process.env.JAMBONES_DISABLE_TTS_STREAMING;
|
||||||
|
return t.end();
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const str = process.env.GCP_JSON_KEY || fs.readFileSync(process.env.GCP_FILE);
|
||||||
|
const creds = JSON.parse(str);
|
||||||
|
const geminiModel = process.env.GCP_GEMINI_TTS_MODEL || 'gemini-2.5-flash-tts';
|
||||||
|
|
||||||
|
// Test 1: Standard voice falls back to non-streaming API
|
||||||
|
let result = await synthAudio(stats, {
|
||||||
|
vendor: 'google',
|
||||||
|
credentials: {
|
||||||
|
credentials: {
|
||||||
|
client_email: creds.client_email,
|
||||||
|
private_key: creds.private_key,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
language: 'en-US',
|
||||||
|
voice: 'en-US-Wavenet-D',
|
||||||
|
gender: 'MALE',
|
||||||
|
text: 'This is a test with streaming disabled.',
|
||||||
|
disableTtsCache: true
|
||||||
|
});
|
||||||
|
t.ok(!result.filePath.startsWith('say:'), 'Standard voice does NOT return streaming say: path when disabled');
|
||||||
|
t.ok(result.filePath.endsWith('.mp3'), 'Standard voice returns mp3 file path');
|
||||||
|
|
||||||
|
// Test 2: HD voice falls back to non-streaming API
|
||||||
|
result = await synthAudio(stats, {
|
||||||
|
vendor: 'google',
|
||||||
|
credentials: {
|
||||||
|
credentials: {
|
||||||
|
client_email: creds.client_email,
|
||||||
|
private_key: creds.private_key,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
language: 'en-US',
|
||||||
|
voice: 'en-US-Chirp3-HD-Charon',
|
||||||
|
text: 'This is a test of HD voice with streaming disabled.',
|
||||||
|
disableTtsCache: true
|
||||||
|
});
|
||||||
|
t.ok(!result.filePath.startsWith('say:'), 'HD voice does NOT return streaming say: path when disabled');
|
||||||
|
t.ok(result.filePath.endsWith('.mp3'), 'HD voice returns mp3 file path');
|
||||||
|
|
||||||
|
// Test 3: Gemini TTS falls back to non-streaming API
|
||||||
|
result = await synthAudio(stats, {
|
||||||
|
vendor: 'google',
|
||||||
|
credentials: {
|
||||||
|
credentials: {
|
||||||
|
client_email: creds.client_email,
|
||||||
|
private_key: creds.private_key,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
language: 'en-US',
|
||||||
|
voice: 'Kore',
|
||||||
|
model: geminiModel,
|
||||||
|
text: 'This is a test of Gemini TTS with streaming disabled.',
|
||||||
|
instructions: 'Speak naturally.',
|
||||||
|
disableTtsCache: true
|
||||||
|
});
|
||||||
|
t.ok(!result.filePath.startsWith('say:'), 'Gemini TTS does NOT return streaming say: path when disabled');
|
||||||
|
t.ok(result.filePath.endsWith('.mp3'), 'Gemini TTS returns mp3 file path');
|
||||||
|
|
||||||
|
} catch (err) {
|
||||||
|
console.error(err);
|
||||||
|
t.end(err);
|
||||||
|
} finally {
|
||||||
|
// Clean up: restore default behavior
|
||||||
|
delete process.env.JAMBONES_DISABLE_TTS_STREAMING;
|
||||||
|
delete require.cache[require.resolve('../lib/config')];
|
||||||
|
delete require.cache[require.resolve('../lib/synth-audio')];
|
||||||
|
delete require.cache[require.resolve('..')];
|
||||||
|
}
|
||||||
|
client.quit();
|
||||||
|
});
|
||||||
|
|
||||||
test('AWS speech synth tests', async(t) => {
|
test('AWS speech synth tests', async(t) => {
|
||||||
const fn = require('..');
|
const fn = require('..');
|
||||||
const {synthAudio, client} = fn(opts, logger);
|
const {synthAudio, client} = fn(opts, logger);
|
||||||
|
|||||||
Reference in New Issue
Block a user