support cartesia sonic3.5 (#143)

2026-07-04 19:31:49 +00:00 · 2026-06-06 15:42:40 +07:00
parent 4f430b9785
commit 644f2918dc
1 changed files with 43 additions and 5 deletions
@@ -1343,6 +1343,18 @@ const synthCartesia = async(logger, {
  try {
    const client = new CartesiaClient({ apiKey: api_key });
    const sampleRate = 48000;
+
+    // omit a control unless explicitly provided (0 is a valid value, so test for nullish only)
+    const has = (v) => v !== null && v !== undefined;
+    /* Voice controls are model-family specific:
+       - sonic-2 takes `experimentalControls` (emotion is an array, no volume).
+       - sonic-3 family (sonic-3, sonic-3.5, pinned sonic-3.x snapshots) takes
+         `generationConfig` (emotion is a string, volume supported). Match the same
+         "starts with sonic-3" predicate the freeswitch streaming module uses
+         (mod_cartesia_tts_streaming: strncmp(model_id, "sonic-3", ...)), so cached
+         and streamed synthesis behave identically.
+       Older models (sonic, sonic-english, sonic-multilingual, sonic-2024-*) take neither. */
+    const isSonic3 = model_id?.startsWith('sonic-3');
    const mp3Stream = await client.tts.bytes({
      modelId: model_id,
      transcript: text,
@@ -1358,16 +1370,16 @@ const synthCartesia = async(logger, {
        ),
        ...(model_id === 'sonic-2' && (opts.speed || opts.emotion) && {
          experimentalControls: {
-            ...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}),
+            ...(has(opts.speed) && {speed: opts.speed}),
            ...(opts.emotion && {emotion: [opts.emotion]}),
          }
        }),
      },
-      ...(model_id === 'sonic-3' && (opts.speed || opts.emotion || opts.volume) && {
+      ...(isSonic3 && (has(opts.speed) || has(opts.emotion) || has(opts.volume)) && {
        generationConfig: {
-          ...(opts.volume !== null && opts.volume !== undefined && {volume: opts.volume}),
-          ...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}),
-          ...(opts.emotion !== null && opts.emotion !== undefined && {emotion: opts.emotion}),
+          ...(has(opts.volume) && {volume: opts.volume}),
+          ...(has(opts.speed) && {speed: opts.speed}),
+          ...(has(opts.emotion) && {emotion: opts.emotion}),
        }
      }),
      language: language,
@@ -1391,6 +1403,32 @@ const synthCartesia = async(logger, {
      sampleRate
    };
  } catch (err) {
+    /* Cartesia's tts.bytes() uses a streaming response, so on an HTTP error the SDK
+       throws a CartesiaError whose `body` is an unconsumed stream-wrapper object
+       (async-iterable, yielding Uint8Array chunks) rather than the parsed error
+       JSON. Read it so callers get a meaningful message instead of a serialized
+       stream object. */
+    if (err && err.body && typeof err.body !== 'string' && typeof err.body[Symbol.asyncIterator] === 'function') {
+      try {
+        const chunks = [];
+        for await (const chunk of err.body) {
+          chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+        }
+        const text = Buffer.concat(chunks).toString('utf8');
+        if (text) {
+          let parsed;
+          try {
+            parsed = JSON.parse(text);
+          } catch {
+            parsed = null;
+          }
+          err.message = (parsed && (parsed.error || parsed.message)) || text;
+          err.body = parsed || text;
+        }
+      } catch (readErr) {
+        logger.info({readErr}, 'synth Cartesia: failed to read error response body');
+      }
+    }
    logger.info({err}, 'synth Cartesia returned error');
    stats.increment('tts.count', ['vendor:cartesia', 'accepted:no']);
    throw err;