Support Cartesia sonic-3.5 (#143)

This commit is contained in:
Hoan Luu Huu
2026-06-06 15:42:40 +07:00
committed by GitHub
parent 4f430b9785
commit 644f2918dc
+43 -5
View File
@@ -1343,6 +1343,18 @@ const synthCartesia = async(logger, {
try {
const client = new CartesiaClient({ apiKey: api_key });
const sampleRate = 48000;
// omit a control unless explicitly provided (0 is a valid value, so test for nullish only)
const has = (v) => v !== null && v !== undefined;
/* Voice controls are model-family specific:
- sonic-2 takes `experimentalControls` (emotion is an array, no volume).
- sonic-3 family (sonic-3, sonic-3.5, pinned sonic-3.x snapshots) takes
`generationConfig` (emotion is a string, volume supported). Match the same
"starts with sonic-3" predicate the freeswitch streaming module uses
(mod_cartesia_tts_streaming: strncmp(model_id, "sonic-3", ...)), so cached
and streamed synthesis behave identically.
Older models (sonic, sonic-english, sonic-multilingual, sonic-2024-*) take neither. */
const isSonic3 = model_id?.startsWith('sonic-3');
const mp3Stream = await client.tts.bytes({
modelId: model_id,
transcript: text,
@@ -1358,16 +1370,16 @@ const synthCartesia = async(logger, {
),
...(model_id === 'sonic-2' && (opts.speed || opts.emotion) && {
experimentalControls: {
...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}),
...(has(opts.speed) && {speed: opts.speed}),
...(opts.emotion && {emotion: [opts.emotion]}),
}
}),
},
...(model_id === 'sonic-3' && (opts.speed || opts.emotion || opts.volume) && {
...(isSonic3 && (has(opts.speed) || has(opts.emotion) || has(opts.volume)) && {
generationConfig: {
...(opts.volume !== null && opts.volume !== undefined && {volume: opts.volume}),
...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}),
...(opts.emotion !== null && opts.emotion !== undefined && {emotion: opts.emotion}),
...(has(opts.volume) && {volume: opts.volume}),
...(has(opts.speed) && {speed: opts.speed}),
...(has(opts.emotion) && {emotion: opts.emotion}),
}
}),
language: language,
@@ -1391,6 +1403,32 @@ const synthCartesia = async(logger, {
sampleRate
};
} catch (err) {
/* Cartesia's tts.bytes() uses a streaming response, so on an HTTP error the SDK
throws a CartesiaError whose `body` is an unconsumed stream-wrapper object
(async-iterable, yielding Uint8Array chunks) rather than the parsed error
JSON. Read it so callers get a meaningful message instead of a serialized
stream object. */
if (err && err.body && typeof err.body !== 'string' && typeof err.body[Symbol.asyncIterator] === 'function') {
try {
const chunks = [];
for await (const chunk of err.body) {
chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
}
const text = Buffer.concat(chunks).toString('utf8');
if (text) {
let parsed;
try {
parsed = JSON.parse(text);
} catch {
parsed = null;
}
err.message = (parsed && (parsed.error || parsed.message)) || text;
err.body = parsed || text;
}
} catch (readErr) {
logger.info({readErr}, 'synth Cartesia: failed to read error response body');
}
}
logger.info({err}, 'synth Cartesia returned error');
stats.increment('tts.count', ['vendor:cartesia', 'accepted:no']);
throw err;