better handling of flush commands (#1081)

* better handling of flush commands
* rework buffering of tokens
* gather: when returning low confidence also provide the transcript
* better error handling in tts:tokens
* special handling of asr timeout for speechmatics
* remove some logs that were excessively wordy
2025-12-19 04:17:44 +00:00 · 2025-02-18 09:31:11 -05:00
parent 33bca8e67c
commit 122d267816
2 changed files with 242 additions and 112 deletions
--- a/lib/tasks/gather.js
+++ b/lib/tasks/gather.js
@@ -636,6 +636,13 @@ class TaskGather extends SttTask {
    this._asrTimer = setTimeout(() => {
      this.logger.debug('_startAsrTimer - asr timer went off');
      const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language, this.vendor);
+
+      /* special case for speechmatics - keep listening if we don't have any transcripts */
+      if (this.vendor === 'speechmatics' && this._bufferedTranscripts.length === 0) {
+        this.logger.debug('Gather:_startAsrTimer - speechmatics, no transcripts yet, keep listening');
+        this._startAsrTimer();
+        return;
+      }
      this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
    }, this.asrTimeout);
    this.logger.debug(`_startAsrTimer: set for ${this.asrTimeout}ms`);
@@ -792,7 +799,7 @@ class TaskGather extends SttTask {

    evt = this.normalizeTranscription(evt, this.vendor, 1, this.language,
      this.shortUtterance, this.data.recognizer.punctuation);
-    this.logger.debug({evt, bugname, finished, vendor: this.vendor}, 'Gather:_onTranscription normalized transcript');
+    //this.logger.debug({evt, bugname, finished, vendor: this.vendor}, 'Gather:_onTranscription normalized transcript');

    if (evt.alternatives.length === 0) {
      this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
@@ -800,8 +807,6 @@ class TaskGather extends SttTask {
    }
    const confidence = evt.alternatives[0].confidence;
    const minConfidence = this.data.recognizer?.minConfidence;
-    this.logger.debug({evt},
-      `TaskGather:_onTranscription - confidence (${confidence}), minConfidence (${minConfidence})`);
    if (confidence && minConfidence && confidence < minConfidence) {
      this.logger.info({evt},
        'TaskGather:_onTranscription - Transcript confidence ' +
@@ -1188,7 +1193,7 @@ class TaskGather extends SttTask {
        if (this.parentTask) this.parentTask.emit('stt-low-confidence', evt);
        else {
          this.emit('stt-low-confidence', evt);
-          returnedVerbs = await this.performAction({reason: 'stt-low-confidence'});
+          returnedVerbs = await this.performAction({speech:evt, reason: 'stt-low-confidence'});
        }
      }
    } catch (err) {  /*already logged error*/ }
--- a/lib/utils/tts-streaming-buffer.js
+++ b/lib/utils/tts-streaming-buffer.js
@@ -4,37 +4,50 @@ const {
  TtsStreamingEvents,
  TtsStreamingConnectionStatus
 } = require('../utils/constants');
+
 const MAX_CHUNK_SIZE = 1800;
 const HIGH_WATER_BUFFER_SIZE = 1000;
 const LOW_WATER_BUFFER_SIZE = 200;
 const TIMEOUT_RETRY_MSECS = 3000;

+
+const isWhitespace = (str) => /^\s*$/.test(str);
+
+/**
+ * Each queue item is an object:
+ *   - { type: 'text', value: '…' } for text tokens.
+ *   - { type: 'flush' } for a flush command.
+ */
 class TtsStreamingBuffer extends Emitter {
  constructor(cs) {
    super();
    this.cs = cs;
    this.logger = cs.logger;

-    this.tokens = '';
+    // Use an array to hold our structured items.
+    this.queue = [];
+    // Track total number of characters in text items.
+    this.bufferedLength = 0;
    this.eventHandlers = [];
    this._isFull = false;
    this._connectionStatus = TtsStreamingConnectionStatus.NotConnected;
-    this._flushPending = false;
    this.timer = null;
+    // Record the last time the text buffer was updated.
+    this.lastUpdateTime = 0;
  }

  get isEmpty() {
-    return this.tokens.length === 0;
+    return this.queue.length === 0;
+  }
+
+  get size() {
+    return this.bufferedLength;
  }

  get isFull() {
    return this._isFull;
  }

-  get size() {
-    return this.tokens.length;
-  }
-
  get ep() {
    return this.cs?.ep;
  }
@@ -42,7 +55,8 @@ class TtsStreamingBuffer extends Emitter {
  async start() {
    assert.ok(
      this._connectionStatus === TtsStreamingConnectionStatus.NotConnected,
-      'TtsStreamingBuffer:start already started, or has failed');
+      'TtsStreamingBuffer:start already started, or has failed'
+    );

    this.vendor = this.cs.getTsStreamingVendor();
    if (!this.vendor) {
@@ -55,9 +69,9 @@ class TtsStreamingBuffer extends Emitter {
    this._connectionStatus = TtsStreamingConnectionStatus.Connecting;
    try {
      if (this.eventHandlers.length === 0) this._initHandlers(this.ep);
-      await  this._api(this.ep, [this.ep.uuid, 'connect']);
+      await this._api(this.ep, [this.ep.uuid, 'connect']);
    } catch (err) {
-      this.logger.info({err}, 'TtsStreamingBuffer:start Error connecting to TTS streaming');
+      this.logger.info({ err }, 'TtsStreamingBuffer:start Error connecting to TTS streaming');
      this._connectionStatus = TtsStreamingConnectionStatus.Failed;
    }
  }
@@ -67,204 +81,319 @@ class TtsStreamingBuffer extends Emitter {
    this.removeCustomEventListeners();
    if (this.ep) {
      this._api(this.ep, [this.ep.uuid, 'close'])
-        .catch((err) => this.logger.info({err}, 'TtsStreamingBuffer:kill Error closing TTS streaming'));
+        .catch((err) =>
+          this.logger.info({ err }, 'TtsStreamingBuffer:stop Error closing TTS streaming')
+        );
    }
    this.timer = null;
-    this.tokens = '';
+    this.queue = [];
+    this.bufferedLength = 0;
    this._connectionStatus = TtsStreamingConnectionStatus.NotConnected;
  }

  /**
-   * Add tokens to the buffer and start feeding them to the endpoint if necessary.
+   * Buffer new text tokens.
   */
  async bufferTokens(tokens) {
-
    if (this._connectionStatus === TtsStreamingConnectionStatus.Failed) {
      this.logger.info('TtsStreamingBuffer:bufferTokens TTS streaming connection failed, rejecting request');
-      return {status: 'failed', reason: `connection to ${this.vendor} failed`};
+      return { status: 'failed', reason: `connection to ${this.vendor} failed` };
+    }
+
+    if (0 === this.bufferedLength && isWhitespace(tokens)) {
+      this.logger.debug({tokens}, 'TtsStreamingBuffer:bufferTokens discarded whitespace tokens');
+      return { status: 'ok' };
    }

    const displayedTokens = tokens.length <= 40 ? tokens : tokens.substring(0, 40);
    const totalLength = tokens.length;

-    /* if we crossed the high water mark, reject the request */
-    if (this.tokens.length + totalLength > HIGH_WATER_BUFFER_SIZE) {
+    if (this.bufferedLength + totalLength > HIGH_WATER_BUFFER_SIZE) {
      this.logger.info(
-        `TtsStreamingBuffer throttling: buffer is full, rejecting request to buffer ${totalLength} tokens`);
-
+        `TtsStreamingBuffer throttling: buffer is full, rejecting request to buffer ${totalLength} tokens`
+      );
      if (!this._isFull) {
        this._isFull = true;
        this.emit(TtsStreamingEvents.Pause);
      }
-      return {status: 'failed', reason: 'full'};
+      return { status: 'failed', reason: 'full' };
    }

    this.logger.debug(
-      `TtsStreamingBuffer:bufferTokens "${displayedTokens}" (length: ${totalLength}), starting? ${this.isEmpty}`
+      `TtsStreamingBuffer:bufferTokens "${displayedTokens}" (length: ${totalLength})`
    );
-    this.tokens += (tokens || '');
+    this.queue.push({ type: 'text', value: tokens });
+    this.bufferedLength += totalLength;
+    // Update the last update time each time new text is buffered.
+    this.lastUpdateTime = Date.now();

-    await this._feedTokens();
-
-    return {status: 'ok'};
+    await this._feedQueue();
+    return { status: 'ok' };
  }

+  /**
+   * Insert a flush command. If no text is queued, flush immediately.
+   * Otherwise, append a flush marker so that all text preceding it will be sent
+   * (regardless of sentence boundaries) before the flush is issued.
+   */
  flush() {
-    this.logger.debug('TtsStreamingBuffer:flush');
    if (this._connectionStatus === TtsStreamingConnectionStatus.Connecting) {
      this.logger.debug('TtsStreamingBuffer:flush TTS stream is not quite ready - wait for connect');
-      this._flushPending = true;
+      if (this.queue.length === 0 || this.queue[this.queue.length - 1].type !== 'flush') {
+        this.queue.push({ type: 'flush' });
+      }
      return;
    }
    else if (this._connectionStatus === TtsStreamingConnectionStatus.Connected) {
-
-      if (this.size === 0) {
+      if (this.isEmpty) {
        this._doFlush();
      }
      else {
-        /* we have tokens queued, so flush after they have been sent */
-        this._pendingFlush = true;
+        if (this.queue[this.queue.length - 1].type !== 'flush') {
+          this.queue.push({ type: 'flush' });
+          this.logger.debug('TtsStreamingBuffer:flush added flush marker to queue');
+        }
      }
    }
+    else {
+      this.logger.debug(
+        `TtsStreamingBuffer:flush TTS stream is not connected, status: ${this._connectionStatus}`
+      );
+    }
  }

  clear() {
    this.logger.debug('TtsStreamingBuffer:clear');
-
    if (this._connectionStatus !== TtsStreamingConnectionStatus.Connected) return;
    clearTimeout(this.timer);
-    this._api(this.ep, [this.ep.uuid, 'clear'])
-      .catch((err) => this.logger.info({err}, 'TtsStreamingBuffer:clear Error clearing TTS streaming'));
-    this.tokens = '';
+    this._api(this.ep, [this.ep.uuid, 'clear']).catch((err) =>
+      this.logger.info({ err }, 'TtsStreamingBuffer:clear Error clearing TTS streaming')
+    );
+    this.queue = [];
+    this.bufferedLength = 0;
    this.timer = null;
    this._isFull = false;
  }

  /**
-   * Send tokens to the TTS engine in sentence chunks for best playout
+   * Process the queue in two phases.
+   *
+   * Phase 1: Look for flush markers. When a flush marker is found (even if not at the very front),
+   *   send all text tokens that came before it immediately (ignoring sentence boundaries)
+   *   and then send the flush command. Repeat until there are no flush markers left.
+   *
+   * Phase 2: With the remaining queue (now containing only text items), accumulate text
+   *   up to MAX_CHUNK_SIZE and use sentence-boundary logic to determine a chunk.
+   *   Then, remove the exact tokens (or portions thereof) that were consumed.
   */
-  async _feedTokens(handlingTimeout = false) {
-    this.logger.debug({tokens: this.tokens}, '_feedTokens');
-
+  async _feedQueue(handlingTimeout = false) {
+    this.logger.debug({ queue: this.queue }, 'TtsStreamingBuffer:_feedQueue');
    try {
-
-      /* are we in a state where we can feed tokens to the TTS? */
-      if (!this.cs.isTtsStreamOpen || !this.ep || !this.tokens) {
-        this.logger.debug('TTS stream is not open or no tokens to send');
-        return this.tokens?.length || 0;
+      if (!this.cs.isTtsStreamOpen || !this.ep) {
+        this.logger.debug('TtsStreamingBuffer:_feedQueue TTS stream is not open or no endpoint available');
+        return;
      }
-
-      if (this._connectionStatus === TtsStreamingConnectionStatus.NotConnected ||
-        this._connectionStatus === TtsStreamingConnectionStatus.Failed) {
-        this.logger.debug('TtsStreamingBuffer:_feedTokens TTS stream is not connected');
+      if (
+        this._connectionStatus === TtsStreamingConnectionStatus.NotConnected ||
+        this._connectionStatus === TtsStreamingConnectionStatus.Failed
+      ) {
+        this.logger.debug('TtsStreamingBuffer:_feedQueue TTS stream is not connected');
        return;
      }

-      if (this._connectionStatus === TtsStreamingConnectionStatus.Connecting) {
-        this.logger.debug('TtsStreamingBuffer:_feedTokens TTS stream is not ready, waiting for connect');
+      // --- Phase 1: Process flush markers ---
+      // Process any flush marker that isn’t in the very first position.
+      let flushIndex = this.queue.findIndex((item, idx) => item.type === 'flush' && idx > 0);
+      while (flushIndex !== -1) {
+        let flushText = '';
+        // Accumulate all text tokens preceding the flush marker.
+        for (let i = 0; i < flushIndex; i++) {
+          if (this.queue[i].type === 'text') {
+            flushText += this.queue[i].value;
+          }
+        }
+        // Remove those text items.
+        for (let i = 0; i < flushIndex; i++) {
+          const item = this.queue.shift();
+          if (item.type === 'text') {
+            this.bufferedLength -= item.value.length;
+          }
+        }
+        // Remove the flush marker (now at the front).
+        if (this.queue.length > 0 && this.queue[0].type === 'flush') {
+          this.queue.shift();
+        }
+        // Immediately send all accumulated text (ignoring sentence boundaries).
+        if (flushText.length > 0) {
+          const modifiedFlushText = flushText.replace(/\n\n/g, '\n \n');
+          try {
+            await this._api(this.ep, [this.ep.uuid, 'send', modifiedFlushText]);
+          } catch (err) {
+            this.logger.info({ err, flushText }, 'TtsStreamingBuffer:_feedQueue Error sending TTS chunk');
+          }
+        }
+        // Send the flush command.
+        await this._doFlush();
+
+        flushIndex = this.queue.findIndex((item, idx) => item.type === 'flush' && idx > 0);
+      }
+
+      // If a flush marker is at the very front, process it.
+      while (this.queue.length > 0 && this.queue[0].type === 'flush') {
+        this.queue.shift();
+        await this._doFlush();
+      }
+
+      // --- Phase 2: Process remaining text tokens ---
+      if (this.queue.length === 0) {
+        this._removeTimer();
        return;
      }

-      /* must send at least one sentence */
-      const limit = Math.min(MAX_CHUNK_SIZE, this.tokens.length);
-      let chunkEnd = findSentenceBoundary(this.tokens, limit);
+      // Accumulate contiguous text tokens (from the front) up to MAX_CHUNK_SIZE.
+      let combinedText = '';
+      for (const item of this.queue) {
+        if (item.type !== 'text') break;
+        combinedText += item.value;
+        if (combinedText.length >= MAX_CHUNK_SIZE) break;
+      }
+      if (combinedText.length === 0) {
+        this._removeTimer();
+        return;
+      }

+      const limit = Math.min(MAX_CHUNK_SIZE, combinedText.length);
+      let chunkEnd = findSentenceBoundary(combinedText, limit);
      if (chunkEnd <= 0) {
        if (handlingTimeout) {
-          /* on a timeout we've left some tokens sitting around, so be more aggressive now in sending them */
-          chunkEnd = findWordBoundary(this.tokens, limit);
+          chunkEnd = findWordBoundary(combinedText, limit);
          if (chunkEnd <= 0) {
-            this.logger.debug('TtsStreamingBuffer:_feedTokens: no word boundary found');
            this._setTimerIfNeeded();
            return;
          }
-        }
-        else {
-          /* if we just received tokens, we wont send unless we have at least a full sentence */
-          this.logger.debug('TtsStreamingBuffer:_feedTokens: no sentence boundary found');
+        } else {
          this._setTimerIfNeeded();
          return;
        }
      }
+      const chunk = combinedText.slice(0, chunkEnd);

-      const chunk = this.tokens.slice(0, chunkEnd);
-      this.tokens = this.tokens.slice(chunkEnd);
+      // Now we iterate over the queue items
+      // and deduct their lengths until we've accounted for chunkEnd characters.
+      let remaining = chunkEnd;
+      let tokensProcessed = 0;
+      for (let i = 0; i < this.queue.length; i++) {
+        const token = this.queue[i];
+        if (token.type !== 'text') break;
+        if (remaining >= token.value.length) {
+          remaining -= token.value.length;
+          tokensProcessed = i + 1;
+        } else {
+          // Partially consumed token: update its value to remove the consumed part.
+          token.value = token.value.slice(remaining);
+          tokensProcessed = i;
+          remaining = 0;
+          break;
+        }
+      }
+      // Remove the fully consumed tokens from the front of the queue.
+      this.queue.splice(0, tokensProcessed);
+      this.bufferedLength -= chunkEnd;

-      /* freeswitch looks for sequence of 2 newlines to determine end of message, so insert a space */
      const modifiedChunk = chunk.replace(/\n\n/g, '\n \n');
-      await this._api(this.ep, [this.ep.uuid, 'send', modifiedChunk]);
-      this.logger.debug(`TtsStreamingBuffer:_feedTokens: sent ${chunk.length}, remaining: ${this.tokens.length}`);
+      this.logger.debug(`TtsStreamingBuffer:_feedQueue sending chunk to tts: ${modifiedChunk}`);

-      if (this._pendingFlush) {
-        this._doFlush();
-        this._pendingFlush = false;
+      try {
+        await this._api(this.ep, [this.ep.uuid, 'send', modifiedChunk]);
+      } catch (err) {
+        this.logger.info({ err, chunk }, 'TtsStreamingBuffer:_feedQueue Error sending TTS chunk');
      }

-      if (this.isFull && this.tokens.length <= LOW_WATER_BUFFER_SIZE) {
-        this.logger.info('TtsStreamingBuffer throttling: TTS streaming buffer is no longer full - resuming');
+      if (this._isFull && this.bufferedLength <= LOW_WATER_BUFFER_SIZE) {
+        this.logger.info('TtsStreamingBuffer throttling: buffer is no longer full - resuming');
        this._isFull = false;
        this.emit(TtsStreamingEvents.Resume);
      }
-    } catch (err) {
-      this.logger.info({err}, 'TtsStreamingBuffer:_feedTokens Error sending TTS chunk');
-      this.tokens = '';
-    }

-    return;
+      return this._feedQueue();
+    } catch (err) {
+      this.logger.info({ err }, 'TtsStreamingBuffer:_feedQueue Error sending TTS chunk');
+      this.queue = [];
+      this.bufferedLength = 0;
+    }
  }

  async _api(ep, args) {
    const apiCmd = `uuid_${this.vendor.startsWith('custom:') ? 'custom' : this.vendor}_tts_streaming`;
    const res = await ep.api(apiCmd, `^^|${args.join('|')}`);
    if (!res.body?.startsWith('+OK')) {
-      this.logger.info({args}, `Error calling ${apiCmd}: ${res.body}`);
+      this.logger.info({ args }, `Error calling ${apiCmd}: ${res.body}`);
      throw new Error(`Error calling ${apiCmd}: ${res.body}`);
    }
  }

-  _onConnectFailure(vendor) {
-    this.logger.info(`streaming tts connection failed to ${vendor}`);
-    this._connectionStatus = TtsStreamingConnectionStatus.Failed;
-    this.tokens = '';
-    this.emit(TtsStreamingEvents.ConnectFailure, {vendor});
-  }
-
  _doFlush() {
-    this._api(this.ep, [this.ep.uuid, 'flush'])
-      .catch((err) => this.logger.info({err},
-        `TtsStreamingBuffer:_doFlush Error flushing TTS streaming: ${JSON.stringify(err)}`));
+    return this._api(this.ep, [this.ep.uuid, 'flush'])
+      .then(() => this.logger.debug('TtsStreamingBuffer:_doFlush sent flush command'))
+      .catch((err) =>
+        this.logger.info(
+          { err },
+          `TtsStreamingBuffer:_doFlush Error flushing TTS streaming: ${err?.message ?? err}`
+        )
+      );
  }

  async _onConnect(vendor) {
-    this.logger.info(`streaming tts connection made to ${vendor}`);
+    this.logger.info(`TtsStreamingBuffer:_onConnect streaming tts connection made to ${vendor} successful`);
    this._connectionStatus = TtsStreamingConnectionStatus.Connected;
-    if (this.tokens.length > 0) {
-      await this._feedTokens();
-    }
-    if (this._flushPending) {
-      this.flush();
-      this._flushPending = false;
+    if (this.queue.length > 0) {
+      await this._feedQueue();
    }
  }

+  _onConnectFailure(vendor) {
+    this.logger.info(`TtsStreamingBuffer:_onConnectFailure streaming tts connection failed to ${vendor}`);
+    this._connectionStatus = TtsStreamingConnectionStatus.Failed;
+    this.queue = [];
+    this.bufferedLength = 0;
+    this.emit(TtsStreamingEvents.ConnectFailure, { vendor });
+  }
+
  _setTimerIfNeeded() {
-    if (this.tokens.length > 0 && !this.timer) {
+    if (this.bufferedLength > 0 && !this.timer) {
+      this.logger.debug({queue: this.queue},
+        `TtsStreamingBuffer:_setTimerIfNeeded setting timer because ${this.bufferedLength} buffered`);
      this.timer = setTimeout(this._onTimeout.bind(this), TIMEOUT_RETRY_MSECS);
    }
  }

+  _removeTimer() {
+    if (this.timer) {
+      this.logger.debug('TtsStreamingBuffer:_removeTimer clearing timer');
+      clearTimeout(this.timer);
+      this.timer = null;
+    }
+  }
+
  _onTimeout() {
-    this.logger.info('TtsStreamingBuffer:_onTimeout');
+    this.logger.debug('TtsStreamingBuffer:_onTimeout Timeout waiting for sentence boundary');
    this.timer = null;
-    this._feedTokens(true);
+    // Handle of the fired timer is released above: _setTimerIfNeeded() only re-arms when this.timer is null.
+    const now = Date.now();
+    if (now - this.lastUpdateTime < TIMEOUT_RETRY_MSECS) {
+      this.logger.debug('TtsStreamingBuffer:_onTimeout New text received recently; postponing flush.');
+      this._setTimerIfNeeded();
+      return;
+    }
+    this._feedQueue(true);
  }

  _onTtsEmpty(vendor) {
-    this.emit(TtsStreamingEvents.Empty, {vendor});
+    this.emit(TtsStreamingEvents.Empty, { vendor });
  }

  addCustomEventListener(ep, event, handler) {
-    this.eventHandlers.push({ep, event, handler});
+    this.eventHandlers.push({ ep, event, handler });
    ep.addCustomEventListener(event, handler);
  }

@@ -274,7 +403,6 @@ class TtsStreamingBuffer extends Emitter {

  _initHandlers(ep) {
    [
-      // DH: add other vendors here as modules are added
      'deepgram',
      'cartesia',
      'elevenlabs',
@@ -293,23 +421,21 @@ class TtsStreamingBuffer extends Emitter {
 }

 const findSentenceBoundary = (text, limit) => {
-  // Match traditional sentence boundaries or double newlines
+  // Look for punctuation or double newline that signals sentence end.
  const sentenceEndRegex = /[.!?](?=\s|$)|\n\n/g;
  let lastSentenceBoundary = -1;
  let match;
-
  while ((match = sentenceEndRegex.exec(text)) && match.index < limit) {
-    const precedingText = text.slice(0, match.index).trim(); // Extract text before the match and trim whitespace
-    if (precedingText.length > 0) { // Check if there's actual content
+    const precedingText = text.slice(0, match.index).trim();
+    if (precedingText.length > 0) {
      if (
-        match[0] === '\n\n' || // It's a double newline
-        (match.index === 0 || !/\d$/.test(text[match.index - 1])) // Standard punctuation rules
+        match[0] === '\n\n' ||
+        (match.index === 0 || !/\d$/.test(text[match.index - 1]))
      ) {
-        lastSentenceBoundary = match.index + (match[0] === '\n\n' ? 2 : 1); // Include the boundary
+        lastSentenceBoundary = match.index + (match[0] === '\n\n' ? 2 : 1);
      }
    }
  }
-
  return lastSentenceBoundary;
 };

@@ -317,7 +443,6 @@ const findWordBoundary = (text, limit) => {
  const wordBoundaryRegex = /\s+/g;
  let lastWordBoundary = -1;
  let match;
-
  while ((match = wordBoundaryRegex.exec(text)) && match.index < limit) {
    lastWordBoundary = match.index;
  }