From 5be6c54339733e042dede8bcc77ced1e419dbd1c Mon Sep 17 00:00:00 2001 From: Hoan Luu Huu <110280845+xquanluu@users.noreply.github.com> Date: Wed, 18 Jun 2025 01:54:26 +0700 Subject: [PATCH] support mod_cartesia_transcribe (#1245) --- lib/session/call-session.js | 1 + lib/tasks/gather.js | 12 +++++++++++ lib/tasks/transcribe.js | 12 +++++++++++ lib/utils/constants.json | 6 ++++++ lib/utils/db-utils.js | 1 + lib/utils/transcription-utils.js | 35 ++++++++++++++++++++++++++++++++ 6 files changed, 67 insertions(+) diff --git a/lib/session/call-session.js b/lib/session/call-session.js index 916a3de5..fccc9b7d 100644 --- a/lib/session/call-session.js +++ b/lib/session/call-session.js @@ -1097,6 +1097,7 @@ class CallSession extends Emitter { return { api_key: credential.api_key, model_id: credential.model_id, + stt_model_id: credential.stt_model_id, embedding: credential.embedding, options: credential.options }; diff --git a/lib/tasks/gather.js b/lib/tasks/gather.js index cbb4d57d..43d4dc7f 100644 --- a/lib/tasks/gather.js +++ b/lib/tasks/gather.js @@ -12,6 +12,7 @@ const { JambonzTranscriptionEvents, AssemblyAiTranscriptionEvents, VoxistTranscriptionEvents, + CartesiaTranscriptionEvents, OpenAITranscriptionEvents, VadDetection, VerbioTranscriptionEvents, @@ -546,6 +547,17 @@ class TaskGather extends SttTask { this._onVendorConnectFailure.bind(this, cs, ep)); break; + case 'cartesia': + this.bugname = `${this.bugname_prefix}cartesia_transcribe`; + this.addCustomEventListener(ep, CartesiaTranscriptionEvents.Transcription, + this._onTranscription.bind(this, cs, ep)); + this.addCustomEventListener( + ep, CartesiaTranscriptionEvents.Connect, this._onVendorConnect.bind(this, cs, ep)); + this.addCustomEventListener(ep, CartesiaTranscriptionEvents.Error, this._onVendorError.bind(this, cs, ep)); + this.addCustomEventListener(ep, CartesiaTranscriptionEvents.ConnectFailure, + this._onVendorConnectFailure.bind(this, cs, ep)); + break; + case 'speechmatics': this.bugname = `${this.bugname_prefix}speechmatics_transcribe`; this.addCustomEventListener( diff --git a/lib/tasks/transcribe.js b/lib/tasks/transcribe.js index 7725cd93..08db9a56 100644 --- a/lib/tasks/transcribe.js +++ b/lib/tasks/transcribe.js @@ -14,6 +14,7 @@ const { TranscribeStatus, AssemblyAiTranscriptionEvents, VoxistTranscriptionEvents, + CartesiaTranscriptionEvents, OpenAITranscriptionEvents, VerbioTranscriptionEvents, SpeechmaticsTranscriptionEvents @@ -312,6 +313,17 @@ class TaskTranscribe extends SttTask { this._onVendorConnectFailure.bind(this, cs, ep, channel)); break; + case 'cartesia': + this.bugname = `${this.bugname_prefix}cartesia_transcribe`; + this.addCustomEventListener(ep, CartesiaTranscriptionEvents.Transcription, + this._onTranscription.bind(this, cs, ep, channel)); + this.addCustomEventListener(ep, + CartesiaTranscriptionEvents.Connect, this._onVendorConnect.bind(this, cs, ep)); + this.addCustomEventListener(ep, CartesiaTranscriptionEvents.Error, this._onVendorError.bind(this, cs, ep)); + this.addCustomEventListener(ep, CartesiaTranscriptionEvents.ConnectFailure, + this._onVendorConnectFailure.bind(this, cs, ep, channel)); + break; + case 'speechmatics': this.bugname = `${this.bugname_prefix}speechmatics_transcribe`; this.addCustomEventListener( diff --git a/lib/utils/constants.json b/lib/utils/constants.json index 88b89282..5ca7e97b 100644 --- a/lib/utils/constants.json +++ b/lib/utils/constants.json @@ -167,6 +167,12 @@ "ConnectFailure": "voxist_transcribe::connect_failed", "Connect": "voxist_transcribe::connect" }, + "CartesiaTranscriptionEvents": { + "Transcription": "cartesia_transcribe::transcription", + "Error": "cartesia_transcribe::error", + "ConnectFailure": "cartesia_transcribe::connect_failed", + "Connect": "cartesia_transcribe::connect" + }, "VadDetection": { "Detection": "vad_detect:detection" }, diff --git a/lib/utils/db-utils.js b/lib/utils/db-utils.js index 6a2d458c..73bc504b 100644 --- a/lib/utils/db-utils.js +++ b/lib/utils/db-utils.js @@ -110,6 +110,7 @@ const speechMapper = (cred) => { const o = JSON.parse(decrypt(credential)); obj.api_key = o.api_key; obj.model_id = o.model_id; + obj.stt_model_id = o.stt_model_id; obj.embedding = o.embedding; obj.options = o.options; } diff --git a/lib/utils/transcription-utils.js b/lib/utils/transcription-utils.js index 7c68c633..efcdbd5e 100644 --- a/lib/utils/transcription-utils.js +++ b/lib/utils/transcription-utils.js @@ -110,6 +110,10 @@ const stickyVars = { voxist: [ 'VOXIST_API_KEY', ], + cartesia: [ + 'CARTESIA_API_KEY', + 'CARTESIA_MODEL_ID' + ], speechmatics: [ 'SPEECHMATICS_API_KEY', 'SPEECHMATICS_HOST', @@ -555,6 +559,25 @@ const normalizeVoxist = (evt, channel, language) => { }; }; +const normalizeCartesia = (evt, channel, language) => { + const copy = JSON.parse(JSON.stringify(evt)); + return { + language_code: language, + channel_tag: channel, + is_final: evt.is_final, + alternatives: [ + { + confidence: 1.00, + transcript: evt.text, + } + ], + vendor: { + name: 'cartesia', + evt: copy + } + }; +}; + const normalizeSpeechmatics = (evt, channel, language) => { const copy = JSON.parse(JSON.stringify(evt)); const is_final = evt.message === 'AddTranscript'; @@ -636,6 +659,8 @@ module.exports = (logger) => { return normalizeAssemblyAi(evt, channel, language, shortUtterance); case 'voxist': return normalizeVoxist(evt, channel, language); + case 'cartesia': + return normalizeCartesia(evt, channel, language); case 'verbio': return normalizeVerbio(evt, channel, language); case 'speechmatics': @@ -1008,6 +1033,16 @@ module.exports = (logger) => { {VOXIST_API_KEY: sttCredentials.api_key}, }; } + else if ('cartesia' === vendor) { + opts = { + ...opts, + ...(sttCredentials.api_key && + {CARTESIA_API_KEY: sttCredentials.api_key}), + ...(sttCredentials.stt_model_id && { + CARTESIA_MODEL_ID: sttCredentials.stt_model_id + }) + }; + } else if ('openai' === vendor) { const {openaiOptions = {}} = rOpts; const model = openaiOptions.model || rOpts.model || sttCredentials.model_id || 'whisper-1';