mirror of
https://github.com/jambonz/speech-utils.git
synced 2026-07-04 19:31:49 +00:00
support cartesia sonic3.5 (#143)
This commit is contained in:
+43
-5
@@ -1343,6 +1343,18 @@ const synthCartesia = async(logger, {
|
||||
try {
|
||||
const client = new CartesiaClient({ apiKey: api_key });
|
||||
const sampleRate = 48000;
|
||||
|
||||
// omit a control unless explicitly provided (0 is a valid value, so test for nullish only)
|
||||
// "provided" means anything other than null/undefined; falsy values such as 0, '' and false still count
const has = (v) => !(v === undefined || v === null);
|
||||
/* Voice controls are model-family specific:
|
||||
- sonic-2 takes `experimentalControls` (emotion is an array, no volume).
|
||||
- sonic-3 family (sonic-3, sonic-3.5, pinned sonic-3.x snapshots) takes
|
||||
`generationConfig` (emotion is a string, volume supported). Match the same
|
||||
"starts with sonic-3" predicate the freeswitch streaming module uses
|
||||
(mod_cartesia_tts_streaming: strncmp(model_id, "sonic-3", ...)), so cached
|
||||
and streamed synthesis behave identically.
|
||||
Older models (sonic, sonic-english, sonic-multilingual, sonic-2024-*) take neither. */
|
||||
const isSonic3 = model_id?.startsWith('sonic-3');
|
||||
const mp3Stream = await client.tts.bytes({
|
||||
modelId: model_id,
|
||||
transcript: text,
|
||||
@@ -1358,16 +1370,16 @@ const synthCartesia = async(logger, {
|
||||
),
|
||||
...(model_id === 'sonic-2' && (opts.speed || opts.emotion) && {
|
||||
experimentalControls: {
|
||||
...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}),
|
||||
...(has(opts.speed) && {speed: opts.speed}),
|
||||
...(opts.emotion && {emotion: [opts.emotion]}),
|
||||
}
|
||||
}),
|
||||
},
|
||||
...(model_id === 'sonic-3' && (opts.speed || opts.emotion || opts.volume) && {
|
||||
...(isSonic3 && (has(opts.speed) || has(opts.emotion) || has(opts.volume)) && {
|
||||
generationConfig: {
|
||||
...(opts.volume !== null && opts.volume !== undefined && {volume: opts.volume}),
|
||||
...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}),
|
||||
...(opts.emotion !== null && opts.emotion !== undefined && {emotion: opts.emotion}),
|
||||
...(has(opts.volume) && {volume: opts.volume}),
|
||||
...(has(opts.speed) && {speed: opts.speed}),
|
||||
...(has(opts.emotion) && {emotion: opts.emotion}),
|
||||
}
|
||||
}),
|
||||
language: language,
|
||||
@@ -1391,6 +1403,32 @@ const synthCartesia = async(logger, {
|
||||
sampleRate
|
||||
};
|
||||
} catch (err) {
|
||||
/* Cartesia's tts.bytes() uses a streaming response, so on an HTTP error the SDK
|
||||
throws a CartesiaError whose `body` is an unconsumed stream-wrapper object
|
||||
(async-iterable, yielding Uint8Array chunks) rather than the parsed error
|
||||
JSON. Read it so callers get a meaningful message instead of a serialized
|
||||
stream object. */
|
||||
if (err && err.body && typeof err.body !== 'string' && typeof err.body[Symbol.asyncIterator] === 'function') {
|
||||
try {
|
||||
const chunks = [];
|
||||
for await (const chunk of err.body) {
|
||||
chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
|
||||
}
|
||||
const text = Buffer.concat(chunks).toString('utf8');
|
||||
if (text) {
|
||||
let parsed;
|
||||
try {
|
||||
parsed = JSON.parse(text);
|
||||
} catch {
|
||||
parsed = null;
|
||||
}
|
||||
err.message = (parsed && (parsed.error || parsed.message)) || text;
|
||||
err.body = parsed || text;
|
||||
}
|
||||
} catch (readErr) {
|
||||
logger.info({readErr}, 'synth Cartesia: failed to read error response body');
|
||||
}
|
||||
}
|
||||
logger.info({err}, 'synth Cartesia returned error');
|
||||
stats.increment('tts.count', ['vendor:cartesia', 'accepted:no']);
|
||||
throw err;
|
||||
|
||||
Reference in New Issue
Block a user