From ba2049b70578525fd15cdaa4819b29abdd4be988 Mon Sep 17 00:00:00 2001 From: Hoan Luu Huu <110280845+xquanluu@users.noreply.github.com> Date: Wed, 2 Jul 2025 02:46:19 +0700 Subject: [PATCH] support assemblyai v3 (#1265) * support assemblyai v3 * wip * wip * wip * wip * wip * wip --- lib/session/call-session.js | 3 ++- lib/utils/db-utils.js | 1 + lib/utils/transcription-utils.js | 45 +++++++++++++++++++++++++++----- package-lock.json | 8 +++--- package.json | 2 +- 5 files changed, 46 insertions(+), 13 deletions(-) diff --git a/lib/session/call-session.js b/lib/session/call-session.js index edbf09de..bb8d4cd2 100644 --- a/lib/session/call-session.js +++ b/lib/session/call-session.js @@ -1119,7 +1119,8 @@ class CallSession extends Emitter { else if ('assemblyai' === vendor) { return { speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key + api_key: credential.api_key, + service_version: credential.service_version }; } else if ('voxist' === vendor) { diff --git a/lib/utils/db-utils.js b/lib/utils/db-utils.js index a5b7889f..ef583770 100644 --- a/lib/utils/db-utils.js +++ b/lib/utils/db-utils.js @@ -129,6 +129,7 @@ const speechMapper = (cred) => { else if ('assemblyai' === obj.vendor) { const o = JSON.parse(decrypt(credential)); obj.api_key = o.api_key; + obj.service_version = o.service_version; } else if ('voxist' === obj.vendor) { const o = JSON.parse(decrypt(credential)); diff --git a/lib/utils/transcription-utils.js b/lib/utils/transcription-utils.js index efcdbd5e..6d633dbc 100644 --- a/lib/utils/transcription-utils.js +++ b/lib/utils/transcription-utils.js @@ -523,16 +523,27 @@ const normalizeAws = (evt, channel, language) => { const normalizeAssemblyAi = (evt, channel, language) => { const copy = JSON.parse(JSON.stringify(evt)); + const alternatives = []; + let is_final = false; + if (evt.type && evt.type === 'Turn') { + // v3 is here + alternatives.push({ + confidence: evt.end_of_turn_confidence, + transcript: evt.transcript, + }); + is_final = evt.end_of_turn; + } else { + alternatives.push({ + confidence: evt.confidence, + transcript: evt.text, + }); + is_final = evt.message_type === 'FinalTranscript'; + } return { language_code: language, channel_tag: channel, - is_final: evt.message_type === 'FinalTranscript', - alternatives: [ - { - confidence: evt.confidence, - transcript: evt.text, - } - ], + is_final, + alternatives, vendor: { name: 'assemblyai', evt: copy @@ -1018,8 +1029,28 @@ module.exports = (logger) => { }; } else if ('assemblyai' === vendor) { + const serviceVersion = rOpts.assemblyAiOptions?.serviceVersion || sttCredentials.service_version || 'v2'; + const { + format_turns, + end_of_turn_confidence_threshold, + min_end_of_turn_silence_when_confident, + max_turn_silence + } = rOpts.assemblyAiOptions || {}; opts = { ...opts, + ASSEMBLYAI_API_VERSION: serviceVersion, + ...(serviceVersion === 'v3' && { + ...(format_turns && { + ASSEMBLYAI_FORMAT_TURNS: format_turns + }), + ...(end_of_turn_confidence_threshold && { + ASSEMBLYAI_END_OF_TURN_CONFIDENCE_THRESHOLD: end_of_turn_confidence_threshold + }), + ASSEMBLYAI_MIN_END_OF_TURN_SILENCE_WHEN_CONFIDENT: min_end_of_turn_silence_when_confident || 500, + ...(max_turn_silence && { + ASSEMBLYAI_MAX_TURN_SILENCE: max_turn_silence + }), + }), ...(sttCredentials.api_key) && {ASSEMBLYAI_API_KEY: sttCredentials.api_key}, ...(rOpts.hints?.length > 0 && diff --git a/package-lock.json b/package-lock.json index 48345657..e6ccc7b9 100644 --- a/package-lock.json +++ b/package-lock.json @@ -18,7 +18,7 @@ "@jambonz/speech-utils": "^0.2.13", "@jambonz/stats-collector": "^0.1.10", "@jambonz/time-series": "^0.2.13", - "@jambonz/verb-specifications": "^0.0.105", + "@jambonz/verb-specifications": "^0.0.106", "@modelcontextprotocol/sdk": "^1.9.0", "@opentelemetry/api": "^1.8.0", "@opentelemetry/exporter-jaeger": "^1.23.0", @@ -1505,9 +1505,9 @@ } }, "node_modules/@jambonz/verb-specifications": { - "version": "0.0.105", - "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.105.tgz", - "integrity": "sha512-MD6RMJyXMoHpR7Wl3xmYmU54P0eF/9LNywRNNsdkAmSf0EogFqSJft4xD/yGeRWlO5O6eAYZEJdaMQeLSxitcg==", + "version": "0.0.106", + "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.106.tgz", + "integrity": "sha512-xBCGKKW5QC7ItZyeF22esytpG2yIhkGWIvBgTaf97CilQmUdLGo3rWG3i7qnRvU9MPXFCtVCMt/aaMt1Ep6V2g==", "license": "MIT", "dependencies": { "debug": "^4.3.4", diff --git a/package.json b/package.json index 2b535e5b..1ce36982 100644 --- a/package.json +++ b/package.json @@ -34,7 +34,7 @@ "@jambonz/speech-utils": "^0.2.13", "@jambonz/stats-collector": "^0.1.10", "@jambonz/time-series": "^0.2.13", - "@jambonz/verb-specifications": "^0.0.105", + "@jambonz/verb-specifications": "^0.0.106", "@modelcontextprotocol/sdk": "^1.9.0", "@opentelemetry/api": "^1.8.0", "@opentelemetry/exporter-jaeger": "^1.23.0",