diff --git a/lib/synth-audio.js b/lib/synth-audio.js index b2f890c..6197e2b 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -1343,6 +1343,18 @@ const synthCartesia = async(logger, { try { const client = new CartesiaClient({ apiKey: api_key }); const sampleRate = 48000; + + // omit a control unless explicitly provided (0 is a valid value, so test for nullish only) + const has = (v) => v !== null && v !== undefined; + /* Voice controls are model-family specific: + - sonic-2 takes `experimentalControls` (emotion is an array, no volume). + - sonic-3 family (sonic-3, sonic-3.5, pinned sonic-3.x snapshots) takes + `generationConfig` (emotion is a string, volume supported). Match the same + "starts with sonic-3" predicate the freeswitch streaming module uses + (mod_cartesia_tts_streaming: strncmp(model_id, "sonic-3", ...)), so cached + and streamed synthesis behave identically. + Older models (sonic, sonic-english, sonic-multilingual, sonic-2024-*) take neither. */ + const isSonic3 = model_id?.startsWith('sonic-3'); const mp3Stream = await client.tts.bytes({ modelId: model_id, transcript: text, @@ -1358,16 +1370,16 @@ const synthCartesia = async(logger, { ), ...(model_id === 'sonic-2' && (opts.speed || opts.emotion) && { experimentalControls: { - ...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}), + ...(has(opts.speed) && {speed: opts.speed}), ...(opts.emotion && {emotion: [opts.emotion]}), } }), }, - ...(model_id === 'sonic-3' && (opts.speed || opts.emotion || opts.volume) && { + ...(isSonic3 && (has(opts.speed) || has(opts.emotion) || has(opts.volume)) && { generationConfig: { - ...(opts.volume !== null && opts.volume !== undefined && {volume: opts.volume}), - ...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}), - ...(opts.emotion !== null && opts.emotion !== undefined && {emotion: opts.emotion}), + ...(has(opts.volume) && {volume: opts.volume}), + ...(has(opts.speed) && {speed: opts.speed}), + ...(has(opts.emotion) && {emotion: opts.emotion}), } }), language: language, @@ -1391,6 +1403,32 @@ const synthCartesia = async(logger, { sampleRate }; } catch (err) { + /* Cartesia's tts.bytes() uses a streaming response, so on an HTTP error the SDK + throws a CartesiaError whose `body` is an unconsumed stream-wrapper object + (async-iterable, yielding Uint8Array chunks) rather than the parsed error + JSON. Read it so callers get a meaningful message instead of a serialized + stream object. */ + if (err && err.body && typeof err.body !== 'string' && typeof err.body[Symbol.asyncIterator] === 'function') { + try { + const chunks = []; + for await (const chunk of err.body) { + chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)); + } + const text = Buffer.concat(chunks).toString('utf8'); + if (text) { + let parsed; + try { + parsed = JSON.parse(text); + } catch { + parsed = null; + } + err.message = (parsed && (parsed.error || parsed.message)) || text; + err.body = parsed || text; + } + } catch (readErr) { + logger.info({readErr}, 'synth Cartesia: failed to read error response body'); + } + } logger.info({err}, 'synth Cartesia returned error'); stats.increment('tts.count', ['vendor:cartesia', 'accepted:no']); throw err;