diff --git a/lib/synth-audio.js b/lib/synth-audio.js index 17bf6c0..1ae9244 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -205,7 +205,8 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc switch (vendor) { case 'google': audioData = await synthGoogle(logger, { - credentials, stats, language, voice, gender, key, text, model, options, instructions + credentials, stats, language, voice, gender, key, text, model, options, instructions, + renderForCaching, disableTtsStreaming, disableTtsCache }); break; case 'aws': @@ -413,12 +414,49 @@ const synthPolly = async(createHash, retrieveHash, logger, const synthGoogle = async(logger, { - credentials, stats, language, voice, gender, text, model, options, instructions + credentials, stats, language, voice, gender, key, text, model, options, instructions, + renderForCaching, disableTtsStreaming, disableTtsCache }) => { - const client = new ttsGoogle.TextToSpeechClient(credentials); - const isGemini = !!model; const isVoiceCloning = typeof voice === 'object' && voice.voice_cloning_key; + // HD voices have pattern like en-US-Chirp3-HD-Charon + const isHDVoice = typeof voice === 'string' && voice.includes('-HD-'); + // Live API is used for Gemini TTS and HD voices + const useLiveApi = isGemini || isHDVoice; + + // Streaming support for Google TTS (Gemini, HD voices, and standard voices) + // Voice cloning does not support streaming + if (!isVoiceCloning && !JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { + // Strip SSML tags for Gemini TTS (it doesn't support SSML) + let inputText = text; + if (isGemini && text.startsWith('<speak>')) { + inputText = text.replace(/<[^>]*>/g, '').trim(); + logger.info('synthGoogle: Gemini TTS does not support SSML, stripped tags from input'); + } + + let params = '{'; + params += `credentials=${JSON.stringify(credentials)}`; + params += `,playback_id=${key}`; + params += ',vendor=google'; + params += `,voice=${voice}`; + params += 
`,language_code=${language || 'en-US'}`; + params += `,write_cache_file=${disableTtsCache ? 0 : 1}`; + params += `,use_live_api=${useLiveApi ? 1 : 0}`; + if (model) params += `,model_name=${model}`; + if (gender) params += `,gender=${gender}`; + // comma is used to separate parameters in freeswitch tts module + const prompt = options?.prompt || instructions; + if (prompt) params += `,prompt=${prompt.replace(/\n/g, ' ').replace(/,/g, ';')}`; + params += '}'; + + return { + filePath: `say:${params}${(isGemini ? inputText : text).replace(/\n/g, ' ')}`, + servedFromCache: false, + rtt: 0 + }; + } + + const client = new ttsGoogle.TextToSpeechClient(credentials); // Build input based on voice type let input; diff --git a/test/synth.js b/test/synth.js index 250ab09..9d6fbf9 100644 --- a/test/synth.js +++ b/test/synth.js @@ -256,6 +256,226 @@ test('Google Gemini TTS synth tests', async(t) => { client.quit(); }); +test('Google TTS streaming tests (!JAMBONES_DISABLE_TTS_STREAMING)', async(t) => { + // Ensure streaming is enabled (default behavior) + delete process.env.JAMBONES_DISABLE_TTS_STREAMING; + + // Clear require cache to reload config with new env var + delete require.cache[require.resolve('../lib/config')]; + delete require.cache[require.resolve('../lib/synth-audio')]; + delete require.cache[require.resolve('..')]; + + const fn = require('..'); + const {synthAudio, client} = fn(opts, logger); + + if (!process.env.GCP_FILE && !process.env.GCP_JSON_KEY) { + t.pass('skipping Google TTS streaming tests since neither GCP_FILE nor GCP_JSON_KEY provided'); + return t.end(); + } + + try { + const str = process.env.GCP_JSON_KEY || fs.readFileSync(process.env.GCP_FILE); + const creds = JSON.parse(str); + const geminiModel = process.env.GCP_GEMINI_TTS_MODEL || 'gemini-2.5-flash-tts'; + + // Test 1: Standard voice streaming (use_live_api=0) + let result = await synthAudio(stats, { + vendor: 'google', + credentials: { + credentials: { + client_email: creds.client_email, + 
private_key: creds.private_key, + }, + }, + language: 'en-US', + voice: 'en-US-Wavenet-D', + gender: 'MALE', + text: 'This is a test of standard voice streaming.', + disableTtsCache: true + }); + t.ok(result.filePath.startsWith('say:'), 'Standard voice returns streaming say: path'); + t.ok(result.filePath.includes('vendor=google'), 'Standard voice streaming path contains vendor=google'); + t.ok(result.filePath.includes('use_live_api=0'), 'Standard voice uses use_live_api=0'); + t.ok(result.filePath.includes('voice=en-US-Wavenet-D'), 'Standard voice streaming path contains voice'); + + // Test 2: HD voice streaming (use_live_api=1) + result = await synthAudio(stats, { + vendor: 'google', + credentials: { + credentials: { + client_email: creds.client_email, + private_key: creds.private_key, + }, + }, + language: 'en-US', + voice: 'en-US-Chirp3-HD-Charon', + text: 'This is a test of HD voice streaming.', + disableTtsCache: true + }); + t.ok(result.filePath.startsWith('say:'), 'HD voice returns streaming say: path'); + t.ok(result.filePath.includes('vendor=google'), 'HD voice streaming path contains vendor=google'); + t.ok(result.filePath.includes('use_live_api=1'), 'HD voice uses use_live_api=1 (Live API)'); + t.ok(result.filePath.includes('voice=en-US-Chirp3-HD-Charon'), 'HD voice streaming path contains voice'); + + // Test 3: Gemini TTS streaming (use_live_api=1) + result = await synthAudio(stats, { + vendor: 'google', + credentials: { + credentials: { + client_email: creds.client_email, + private_key: creds.private_key, + }, + }, + language: 'en-US', + voice: 'Kore', + model: geminiModel, + text: 'This is a test of Gemini TTS streaming.', + instructions: 'Speak naturally.', + disableTtsCache: true + }); + t.ok(result.filePath.startsWith('say:'), 'Gemini TTS returns streaming say: path'); + t.ok(result.filePath.includes('vendor=google'), 'Gemini TTS streaming path contains vendor=google'); + t.ok(result.filePath.includes('use_live_api=1'), 'Gemini TTS uses 
use_live_api=1 (Live API)'); + t.ok(result.filePath.includes(`model_name=${geminiModel}`), 'Gemini TTS streaming path contains model_name'); + t.ok(result.filePath.includes('prompt=Speak naturally.'), 'Gemini TTS streaming path contains prompt'); + + // Test 4: Gemini TTS with SSML stripping in streaming mode + result = await synthAudio(stats, { + vendor: 'google', + credentials: { + credentials: { + client_email: creds.client_email, + private_key: creds.private_key, + }, + }, + language: 'en-US', + voice: 'Leda', + model: geminiModel, + text: '<speak>This SSML should be stripped.</speak>', + instructions: 'Speak naturally.', + disableTtsCache: true + }); + t.ok(result.filePath.startsWith('say:'), 'Gemini TTS with SSML returns streaming say: path'); + t.ok(!result.filePath.includes('<speak>'), 'SSML tags are stripped from streaming path'); + t.ok(result.filePath.includes('This SSML should be stripped.'), 'Text content is preserved after SSML stripping'); + + // Test 5: Gemini TTS with prompt containing special characters + result = await synthAudio(stats, { + vendor: 'google', + credentials: { + credentials: { + client_email: creds.client_email, + private_key: creds.private_key, + }, + }, + language: 'en-US', + voice: 'Kore', + model: geminiModel, + text: 'Testing special characters in prompt.', + options: { prompt: 'Speak in a warm, friendly tone' }, + disableTtsCache: true + }); + t.ok(result.filePath.startsWith('say:'), 'Gemini TTS with special chars returns streaming say: path'); + // Commas in prompt should be replaced with semicolons + t.ok(result.filePath.includes('prompt=Speak in a warm; friendly tone'), 'Commas in prompt are escaped to semicolons'); + + } catch (err) { + console.error(err); + t.end(err); + } + client.quit(); +}); + +test('Google TTS non-streaming tests (JAMBONES_DISABLE_TTS_STREAMING=true)', async(t) => { + // Enable streaming disable flag + process.env.JAMBONES_DISABLE_TTS_STREAMING = 'true'; + + // Clear require cache to reload config with new env var + 
delete require.cache[require.resolve('../lib/config')]; + delete require.cache[require.resolve('../lib/synth-audio')]; + delete require.cache[require.resolve('..')]; + + const fn = require('..'); + const {synthAudio, client} = fn(opts, logger); + + if (!process.env.GCP_FILE && !process.env.GCP_JSON_KEY) { + t.pass('skipping Google TTS non-streaming tests since neither GCP_FILE nor GCP_JSON_KEY provided'); + delete process.env.JAMBONES_DISABLE_TTS_STREAMING; + return t.end(); + } + + try { + const str = process.env.GCP_JSON_KEY || fs.readFileSync(process.env.GCP_FILE); + const creds = JSON.parse(str); + const geminiModel = process.env.GCP_GEMINI_TTS_MODEL || 'gemini-2.5-flash-tts'; + + // Test 1: Standard voice falls back to non-streaming API + let result = await synthAudio(stats, { + vendor: 'google', + credentials: { + credentials: { + client_email: creds.client_email, + private_key: creds.private_key, + }, + }, + language: 'en-US', + voice: 'en-US-Wavenet-D', + gender: 'MALE', + text: 'This is a test with streaming disabled.', + disableTtsCache: true + }); + t.ok(!result.filePath.startsWith('say:'), 'Standard voice does NOT return streaming say: path when disabled'); + t.ok(result.filePath.endsWith('.mp3'), 'Standard voice returns mp3 file path'); + + // Test 2: HD voice falls back to non-streaming API + result = await synthAudio(stats, { + vendor: 'google', + credentials: { + credentials: { + client_email: creds.client_email, + private_key: creds.private_key, + }, + }, + language: 'en-US', + voice: 'en-US-Chirp3-HD-Charon', + text: 'This is a test of HD voice with streaming disabled.', + disableTtsCache: true + }); + t.ok(!result.filePath.startsWith('say:'), 'HD voice does NOT return streaming say: path when disabled'); + t.ok(result.filePath.endsWith('.mp3'), 'HD voice returns mp3 file path'); + + // Test 3: Gemini TTS falls back to non-streaming API + result = await synthAudio(stats, { + vendor: 'google', + credentials: { + credentials: { + client_email: 
creds.client_email, + private_key: creds.private_key, + }, + }, + language: 'en-US', + voice: 'Kore', + model: geminiModel, + text: 'This is a test of Gemini TTS with streaming disabled.', + instructions: 'Speak naturally.', + disableTtsCache: true + }); + t.ok(!result.filePath.startsWith('say:'), 'Gemini TTS does NOT return streaming say: path when disabled'); + t.ok(result.filePath.endsWith('.mp3'), 'Gemini TTS returns mp3 file path'); + + } catch (err) { + console.error(err); + t.end(err); + } finally { + // Clean up: restore default behavior + delete process.env.JAMBONES_DISABLE_TTS_STREAMING; + delete require.cache[require.resolve('../lib/config')]; + delete require.cache[require.resolve('../lib/synth-audio')]; + delete require.cache[require.resolve('..')]; + } + client.quit(); +}); + test('AWS speech synth tests', async(t) => { const fn = require('..'); const {synthAudio, client} = fn(opts, logger);