fix #1466 : (#1467 )

* fix #1466: * do not send tts streaming events when we are not doing tts streaming
Add configurable say chunk size (#1461 )
2026-02-12 09:19:34 +00:00 · 2025-12-09 09:43:53 -05:00 · 2025-12-08 10:54:27 -05:00 · 2025-12-08 10:44:20 -05:00 · 2025-12-03 07:16:25 -05:00 · 2025-12-02 19:46:28 -05:00
14 changed files with 41 additions and 82 deletions
--- a/lib/config.js
+++ b/lib/config.js
@@ -119,7 +119,7 @@ const ENCRYPTION_SECRET = process.env.ENCRYPTION_SECRET;
 const HTTP_POOL = process.env.HTTP_POOL && parseInt(process.env.HTTP_POOL);
 const HTTP_POOLSIZE = parseInt(process.env.HTTP_POOLSIZE, 10) || 10;
 const HTTP_PIPELINING = parseInt(process.env.HTTP_PIPELINING, 10) || 1;
-const HTTP_TIMEOUT = 10000;
+const HTTP_TIMEOUT = parseInt(process.env.JAMBONES_HTTP_TIMEOUT, 10) || 10000;
 const HTTP_PROXY_IP = process.env.JAMBONES_HTTP_PROXY_IP;
 const HTTP_PROXY_PORT = process.env.JAMBONES_HTTP_PROXY_PORT;
 const HTTP_PROXY_PROTOCOL = process.env.JAMBONES_HTTP_PROXY_PROTOCOL || 'http';
@@ -139,6 +139,10 @@ const JAMBONES_USE_FREESWITCH_TIMER_FD = process.env.JAMBONES_USE_FREESWITCH_TIM
 const JAMBONES_DIAL_SBC_FOR_REGISTERED_USER = process.env.JAMBONES_DIAL_SBC_FOR_REGISTERED_USER || false;
 const JAMBONES_MEDIA_TIMEOUT_MS = process.env.JAMBONES_MEDIA_TIMEOUT_MS || 0;
 const JAMBONES_MEDIA_HOLD_TIMEOUT_MS = process.env.JAMBONES_MEDIA_HOLD_TIMEOUT_MS || 0;
+
+/* say / tts */
+const JAMBONES_SAY_CHUNK_SIZE = parseInt(process.env.JAMBONES_SAY_CHUNK_SIZE, 10) || 900;
+
 // jambonz
 const JAMBONES_TRANSCRIBE_EP_DESTROY_DELAY_MS =
  process.env.JAMBONES_TRANSCRIBE_EP_DESTROY_DELAY_MS;
@@ -231,5 +235,6 @@ module.exports = {
  JAMBONES_DIAL_SBC_FOR_REGISTERED_USER,
  JAMBONES_MEDIA_TIMEOUT_MS,
  JAMBONES_MEDIA_HOLD_TIMEOUT_MS,
+  JAMBONES_SAY_CHUNK_SIZE,
  JAMBONES_TRANSCRIBE_EP_DESTROY_DELAY_MS,
 };
--- a/lib/http-routes/api/create-call.js
+++ b/lib/http-routes/api/create-call.js
@@ -291,7 +291,7 @@ router.post('/',
            }, {
              ...(account.enable_debug_log && {level: 'debug'})
            });
-            app.requestor.logger = app.notifier.logger = sipLogger;
+            app.requestor.logger = app.notifier.logger = restDial.logger = sipLogger;
            const callInfo = new CallInfo({
              direction: CallDirection.Outbound,
              req: inviteReq,
--- a/lib/session/call-session.js
+++ b/lib/session/call-session.js
@@ -927,7 +927,7 @@ class CallSession extends Emitter {
        this.logger.debug('CallSession:enableBackgroundTtsStream - ttsStream enabled');
      } else {
        this.logger.debug(
-          'CallSession:enableBackgroundTtsStream - ignoring request as call does not have required conditions');
+          'CallSession:enableBackgroundTtsStream - ignoring request; conditions not met (probably not using ws api)');
      }
    }  catch (err) {
      this.logger.info({err, say}, 'CallSession:enableBackgroundTtsStream - Error creating background tts stream task');
@@ -941,9 +941,11 @@ class CallSession extends Emitter {
    }
  }
  clearTtsStream() {
-    this.requestor?.request('tts:streaming-event', '/streaming-event', {event_type: 'user_interruption'})
-      .catch((err) => this.logger.info({err}, 'CallSession:clearTtsStream - Error sending user_interruption'));
-    this.ttsStreamingBuffer?.clear();
+    if (this.isTtsStreamEnabled) {
+      this.requestor?.request('tts:streaming-event', '/streaming-event', {event_type: 'user_interruption'})
+        .catch((err) => this.logger.info({err}, 'CallSession:clearTtsStream - Error sending user_interruption'));
+      this.ttsStreamingBuffer?.clear();
+    }
  }

  startTtsStream() {
@@ -951,7 +953,7 @@ class CallSession extends Emitter {
  }

  stopTtsStream() {
-    if (this.appIsUsingWebsockets) {
+    if (this.isTtsStreamEnabled) {
      this.requestor?.request('tts:streaming-event', '/streaming-event', {event_type: 'stream_closed'})
        .catch((err) => this.logger.info({err}, 'CallSession:clearTtsStream - Error sending user_interruption'));
      this.ttsStreamingBuffer?.stop();
@@ -1141,7 +1143,6 @@ class CallSession extends Emitter {
          return {
            api_key: credential.api_key,
            model_id: credential.model_id,
-            stt_model_id: credential.stt_model_id,
            api_uri: credential.api_uri,
            options: credential.options
          };
--- a/lib/tasks/dial.js
+++ b/lib/tasks/dial.js
@@ -158,7 +158,7 @@ class TaskDial extends Task {

  get canReleaseMedia() {
    const keepAnchor = this.data.anchorMedia ||
-      this.weAreTranscoding ||
+      this.isTranscoding ||
      this.cs.isBackGroundListen ||
      this.cs.onHoldMusic ||
      ANCHOR_MEDIA_ALWAYS ||
@@ -576,7 +576,7 @@ class TaskDial extends Task {
      proxy: `sip:${sbcAddress}`,
      callingNumber: this.callerId || fromUri.user,
      ...(this.callerName && {callingName: this.callerName}),
-      opusFirst: isOpusFirst(this.cs.ep.remote.sdp),
+      opusFirst: isOpusFirst(this.cs.ep.local.sdp),
      isVideoCall: this.cs.ep.remote.sdp.includes('m=video')
    };

@@ -773,6 +773,15 @@ class TaskDial extends Task {
  }

  async _connectSingleDial(cs, sd) {
+    // start connect with dialed leg, this is the soonest we can identify transcoding
+    if (this.epOther && sd.ep) {
+      const codecA = getLeadingCodec(this.epOther.local.sdp);
+      const codecB = getLeadingCodec(sd.ep.remote.sdp);
+      this.isTranscoding = (codecA !== codecB);
+      if (this.isTranscoding) {
+        this.logger.info(`Dial:_connectSingleDial - transcoding from ${codecA} (A leg) to ${codecB} (B leg)`);
+      }
+    }
    if (!this.bridged && !this.canReleaseMedia) {
      this.logger.debug('Dial:_connectSingleDial bridging endpoints');
      if (this.epOther) {
@@ -930,13 +939,6 @@ class TaskDial extends Task {
        this.logger.info({err}, 'Dial:_selectSingleDial - Error boosting audio signal');
      }
    }
-    /* basic determination to see if call is being transcoded */
-    const codecA = getLeadingCodec(this.epOther.local.sdp);
-    const codecB = getLeadingCodec(this.ep.remote.sdp);
-    this.weAreTranscoding = (codecA !== codecB);
-    if (this.weAreTranscoding) {
-      this.logger.info(`Dial:_selectSingleDial - transcoding from ${codecA} (A leg) to ${codecB} (B leg)`);
-    }
    /* if we can release the media back to the SBC, do so now */
    if (this.canReleaseMedia || this.shouldExitMediaPathEntirely) {
      setTimeout(this._releaseMedia.bind(this, cs, sd, this.shouldExitMediaPathEntirely), 200);
--- a/lib/tasks/gather.js
+++ b/lib/tasks/gather.js
@@ -5,7 +5,6 @@ const {
  AwsTranscriptionEvents,
  AzureTranscriptionEvents,
  DeepgramTranscriptionEvents,
-  ElevenlabsTranscriptionEvents,
  GladiaTranscriptionEvents,
  SonioxTranscriptionEvents,
  CobaltTranscriptionEvents,
@@ -493,17 +492,6 @@ class TaskGather extends SttTask {
        this.addCustomEventListener(ep, DeepgramfluxTranscriptionEvents.Error, this._onVendorError.bind(this, cs, ep));
        break;

-      case 'elevenlabs':
-        this.bugname = `${this.bugname_prefix}elevenlabs_transcribe`;
-        this.addCustomEventListener(
-          ep, ElevenlabsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
-        this.addCustomEventListener(
-          ep, ElevenlabsTranscriptionEvents.Connect, this._onVendorConnect.bind(this, cs, ep));
-        this.addCustomEventListener(ep, ElevenlabsTranscriptionEvents.ConnectFailure,
-          this._onVendorConnectFailure.bind(this, cs, ep));
-        this.addCustomEventListener(ep, ElevenlabsTranscriptionEvents.Error, this._onVendorError.bind(this, cs, ep));
-        break;
-
      case 'gladia':
        this.bugname = `${this.bugname_prefix}gladia_transcribe`;
        this.addCustomEventListener(
@@ -893,7 +881,7 @@ class TaskGather extends SttTask {
        this._fillerNoiseOn = false;  // in a race, if we just started audio it may sneak through here
        this.ep.api('uuid_break', this.ep.uuid)
          .catch((err) => this.logger.info(err, 'Error killing audio'));
-        cs.clearTtsStream();
+        if (cs.isTtsStreamEnabled) cs.clearTtsStream();
      }
      return;
    }
--- a/lib/tasks/say.js
+++ b/lib/tasks/say.js
@@ -1,6 +1,7 @@
 const assert = require('assert');
 const TtsTask = require('./tts-task');
 const {TaskName, TaskPreconditions} = require('../utils/constants');
+const {JAMBONES_SAY_CHUNK_SIZE} = require('../config');
 const pollySSMLSplit = require('polly-ssml-split');
 const { SpeechCredentialError, NonFatalTaskError } = require('../utils/error');
 const { sleepFor } = require('../utils/helpers');
@@ -31,7 +32,7 @@ const isMatchingEvent = (logger, filename, playbackId, evt) => {
 const breakLengthyTextIfNeeded = (logger, text) => {
  // As The text can be used for tts streaming, we need to break lengthy text into smaller chunks
 // HIGH_WATER_BUFFER_SIZE defined in tts-streaming-buffer.js
-  const chunkSize = 900;
+  const chunkSize = JAMBONES_SAY_CHUNK_SIZE;
  const isSSML = text.startsWith('<speak>');
  const options = {
    softLimit: 100,
--- a/lib/tasks/tts-task.js
+++ b/lib/tasks/tts-task.js
@@ -89,8 +89,9 @@ class TtsTask extends Task {
    // api_key, model_id, api_uri, custom_tts_streaming_url, and auth_token are encoded in the credentials
    // allow them to be overriden via config, using options
    // give preference to options passed in via config
-    const local_options = {...JSON.parse(options), ...this.options};
-    const local_voice_settings = {...JSON.parse(options).voice_settings, ...this.options.voice_settings};
+    const parsed_options = options ? JSON.parse(options) : {};
+    const local_options = {...parsed_options, ...this.options};
+    const local_voice_settings = {...(parsed_options.voice_settings || {}), ...(this.options.voice_settings || {})};
    const local_api_key =  local_options.api_key ?? api_key;
    const local_model_id = local_options.model_id ?? model_id;
    const local_api_uri = local_options.api_uri ?? api_uri;
--- a/lib/utils/constants.json
+++ b/lib/utils/constants.json
@@ -103,12 +103,6 @@
    "Connect": "deepgramflux_transcribe::connect",
    "Error": "deepgramflux_transcribe::error"
  },
-  "ElevenlabsTranscriptionEvents": {
-    "Transcription": "elevenlabs_transcribe::transcription",
-    "ConnectFailure": "elevenlabs_transcribe::connect_failed",
-    "Connect": "elevenlabs_transcribe::connect",
-    "Error": "elevenlabs_transcribe::error"
-  },
  "GladiaTranscriptionEvents": {
    "Transcription": "gladia_transcribe::transcription",
    "ConnectFailure": "gladia_transcribe::connect_failed",
--- a/lib/utils/db-utils.js
+++ b/lib/utils/db-utils.js
@@ -106,7 +106,6 @@ const speechMapper = (cred) => {
      const o = JSON.parse(decrypt(credential));
      obj.api_key = o.api_key;
      obj.model_id = o.model_id;
-      obj.stt_model_id = o.stt_model_id;
      obj.api_uri = o.api_uri;
      obj.options = o.options;
    }
--- a/lib/utils/http-requestor.js
+++ b/lib/utils/http-requestor.js
@@ -191,7 +191,7 @@ class HttpRequestor extends BaseRequestor {
        method,
        headers: hdrs,
        ...('POST' === method && {body: JSON.stringify(payload)}),
-        timeout: HTTP_TIMEOUT,
+        headersTimeout: HTTP_TIMEOUT,
        followRedirects: false
      };

--- a/lib/utils/stt-latency-calculator.js
+++ b/lib/utils/stt-latency-calculator.js
@@ -127,7 +127,6 @@ class SttLatencyCalculator extends Emitter {

  calculateLatency() {
    if (!this.isRunning) {
-      this.logger.debug('Latency calculator is not running, cannot calculate latency, returning default values');
      return null;
    }

--- a/lib/utils/transcription-utils.js
+++ b/lib/utils/transcription-utils.js
@@ -545,23 +545,6 @@ const normalizeVerbio = (evt, channel, language) => {
  };
 };

-const normalizeElevenlabs = (evt, channel, language) => {
-  const copy = JSON.parse(JSON.stringify(evt));
-  return {
-    language_code: language,
-    channel_tag: channel,
-    is_final: evt.message_type === 'committed_transcript',
-    alternatives: [{
-      confidence: 1.0,
-      transcript: evt.text,
-    }],
-    vendor: {
-      name: 'elevenlabs',
-      evt: copy
-    }
-  };
-};
-
 const normalizeMicrosoft = (evt, channel, language, punctuation = true) => {
  const copy = JSON.parse(JSON.stringify(evt));
  const nbest = evt.NBest;
@@ -787,8 +770,6 @@ module.exports = (logger) => {
        return normalizeGladia(evt, channel, language, shortUtterance);
      case 'deepgramflux':
        return normalizeDeepgramFlux(evt, channel, language, shortUtterance);
-      case 'elevenlabs':
-        return normalizeElevenlabs(evt, channel, language);
      case 'microsoft':
        return normalizeMicrosoft(evt, channel, language, punctuation);
      case 'google':
@@ -1104,24 +1085,6 @@ module.exports = (logger) => {
        ...(keyterms && keyterms.length > 0 && {DEEPGRAMFLUX_SPEECH_KEYTERMS: keyterms.join(',')}),
      };
    }
-    else if ('elevenlabs' === vendor) {
-      const {api_key, stt_model_id} = sttCredentials;
-      const {includeTimestamps, commitStrategy, vadSilenceThresholdSecs, vadThreshold,
-        minSpeechDurationMs, minSilenceDurationMs, enableLogging} = rOpts.elevenlabsOptions || {};
-
-      opts = {
-        ...opts,
-        ELEVENLABS_API_KEY: api_key,
-        ELEVENLABS_MODEL_ID: stt_model_id,
-        ELEVENLABS_INCLUDE_TIMESTAMPS: includeTimestamps === true ? true : false,
-        ...(commitStrategy && {ELEVENLABS_COMMIT_STRATEGY: commitStrategy}),
-        ...(vadSilenceThresholdSecs && {ELEVENLABS_VAD_SILENCE_THRESHOLD_SECS: vadSilenceThresholdSecs}),
-        ...(vadThreshold && {ELEVENLABS_VAD_THRESHOLD: vadThreshold}),
-        ...(minSpeechDurationMs && {ELEVENLABS_MIN_SPEECH_DURATION_MS: minSpeechDurationMs}),
-        ...(minSilenceDurationMs && {ELEVENLABS_MIN_SILENCE_DURATION_MS: minSilenceDurationMs}),
-        ...(enableLogging && {ELEVENLABS_ENABLE_LOGGING: enableLogging ? 1 : 0}),
-      };
-    }
    else if ('gladia' === vendor) {
      const {host, path} = sttCredentials;
      opts = {
--- a/lib/utils/tts-streaming-buffer.js
+++ b/lib/utils/tts-streaming-buffer.js
@@ -163,7 +163,6 @@ class TtsStreamingBuffer extends Emitter {
  }

  clear() {
-    this.logger.debug('TtsStreamingBuffer:clear');
    if (this._connectionStatus !== TtsStreamingConnectionStatus.Connected) return;
    clearTimeout(this.timer);
    this._api(this.ep, [this.ep.uuid, 'clear']).catch((err) =>
@@ -437,7 +436,15 @@ class TtsStreamingBuffer extends Emitter {

 const findSentenceBoundary = (text, limit) => {
  // Look for punctuation or double newline that signals sentence end.
-  const sentenceEndRegex = /[.!?](?=\s|$)|\n\n/g;
+  // Includes:
+  //   - ASCII: . ! ?
+  //   - Arabic: ؟ (question mark), ۔ (full stop)
+  //   - Japanese: 。 (full stop), ！, ？ (full-width exclamation/question)
+  //
+  // For languages that use spaces between sentences, we still require
+  // whitespace or end-of-string after the mark. For Japanese (no spaces),
+  // we treat the punctuation itself as a boundary regardless of following char.
+  const sentenceEndRegex = /[.!?؟۔](?=\s|$)|[。！？]|\n\n/g;
  let lastSentenceBoundary = -1;
  let match;
  while ((match = sentenceEndRegex.exec(text)) && match.index < limit) {
--- a/package-lock.json
+++ b/package-lock.json
@@ -8752,7 +8752,6 @@
    },
    "node_modules/unix-dgram": {
      "version": "2.0.6",
-      "hasInstallScript": true,
      "license": "ISC",
      "optional": true,
      "dependencies": {
Author	SHA1	Message	Date
Dave Horton	e396b6aa98	fix #1466 : (#1467 ) * fix #1466: * do not send tts streaming events when we are not doing tts streaming	2025-12-09 09:43:53 -05:00
Vinod Dharashive	9104ebb603	Add configurable say chunk size (#1461 )	2025-12-08 10:54:27 -05:00
Vinod Dharashive	1ad0261336	Enhance TTS sentence boundary detection for Arabic and Japanese (#1464 ) Update sentenceEndRegex to treat the following as sentence boundaries: ASCII .!? followed by whitespace or end-of-text; Arabic question mark (؟) and full stop (۔) with the same rule; Japanese 。, ！, ？ treated as boundaries regardless of following character; and double newlines (\n\n). This improves streaming chunking for mixed-language content.	2025-12-08 10:44:20 -05:00
Hoan Luu Huu	7802822773	fixed dial verb cannot bridge 2 leg endpoints due to transcoding (#1457 ) * fixed dial verb cannot bridge 2 leg endpoints due to transcoding * wip	2025-12-03 07:16:25 -05:00
Hoan Luu Huu	edb4d21ce1	fixed undefined issue when setting tts streaming channel vars (#1456 )	2025-12-02 19:46:28 -05:00
Dave Horton	8048e9cf88	when dialing the B leg we check to see if we are using opus on the A leg, and if so we outdial B with opus first; however we were incorrectly checking the SDP on the A leg invite not the 200 OK we send back (#1455 )	2025-12-02 19:22:20 -05:00
Sam Machin	451feafed4	use timeout on HTTP requests (#1453 )	2025-12-02 07:41:47 -05:00