feed TTS in sentence chunks when streaming (#1013)

* feed TTS in sentence chunks when streaming * tts streaming: treat a paragraph as a chunk of text, even if it does not end with a line-ending character * wip
2025-12-19 04:17:44 +00:00 · 2024-12-31 15:16:25 -05:00
parent c9f0481ca6
commit 25f1e65f63
1 changed files with 80 additions and 97 deletions
--- a/lib/utils/tts-streaming-buffer.js
+++ b/lib/utils/tts-streaming-buffer.js
@@ -4,36 +4,10 @@ const {
  TtsStreamingEvents,
  TtsStreamingConnectionStatus
 } = require('../utils/constants');
-const FEED_INTERVAL = 2000;
 const MAX_CHUNK_SIZE = 1800;
 const HIGH_WATER_BUFFER_SIZE = 5000;
 const LOW_WATER_BUFFER_SIZE = 1000;
-const MIN_INITIAL_WORDS = 4;
-
-const findSentenceBoundary = (text, limit) => {
-  const sentenceEndRegex = /[.!?](?=\s|$)/g;
-  let lastSentenceBoundary = -1;
-  let match;
-
-  while ((match = sentenceEndRegex.exec(text)) && match.index < limit) {
-    /* Ensure it's not a decimal point (e.g., "3.14") */
-    if (match.index === 0 || !/\d$/.test(text[match.index - 1])) {
-      lastSentenceBoundary = match.index + 1; // Include the punctuation
-    }
-  }
-  return lastSentenceBoundary;
-};
-
-const findWordBoundary = (text, limit) => {
-  const wordBoundaryRegex = /\s+/g;
-  let lastWordBoundary = -1;
-  let match;
-
-  while ((match = wordBoundaryRegex.exec(text)) && match.index < limit) {
-    lastWordBoundary = match.index;
-  }
-  return lastWordBoundary;
-};
+const TIMEOUT_RETRY_MSECS = 3000;

 class TtsStreamingBuffer extends Emitter {
  constructor(cs) {
@@ -46,7 +20,7 @@ class TtsStreamingBuffer extends Emitter {
    this._isFull = false;
    this._connectionStatus = TtsStreamingConnectionStatus.NotConnected;
    this._flushPending = false;
-    this._countSendsInThisTurn = 0;
+    this.timer = null;
  }

  get isEmpty() {
@@ -130,19 +104,7 @@ class TtsStreamingBuffer extends Emitter {
    );
    this.tokens += (tokens || '');

-    const  leftoverTokens = await this._feedTokens();
-
-    /* do we need to start a timer to periodically feed tokens to the endpoint? */
-    if (this.isEmpty && leftoverTokens > 0) {
-      assert(!this.timer);
-      this.timer = setInterval(async() => {
-        const remaining = await this._feedTokens();
-        if (remaining === 0) {
-          clearInterval(this.timer);
-          this.timer = null;
-        }
-      }, FEED_INTERVAL);
-    }
+    await this._feedTokens();

    return {status: 'ok'};
  }
@@ -155,7 +117,6 @@ class TtsStreamingBuffer extends Emitter {
      return;
    }
    else if (this._connectionStatus === TtsStreamingConnectionStatus.Connected) {
-      this._countSendsInThisTurn = 0;
      this._api(this.ep, [this.ep.uuid, 'flush'])
        .catch((err) => this.logger.info({err},
          `TtsStreamingBuffer:flush Error flushing TTS streaming: ${JSON.stringify(err)}`));
@@ -175,13 +136,14 @@ class TtsStreamingBuffer extends Emitter {
  }

  /**
-   * Send the next chunk of tokens to the endpoint (max 2000 chars)
-   * Return the number of tokens left in the buffer.
+   * Send tokens to the TTS engine in sentence chunks for best playout
   */
-  async _feedTokens() {
+  async _feedTokens(handlingTimeout = false) {
    this.logger.debug({tokens: this.tokens}, '_feedTokens');

    try {
+
+      /* are we in a state where we can feed tokens to the TTS? */
      if (!this.cs.isTtsStreamOpen || !this.ep || !this.tokens) {
        this.logger.debug('TTS stream is not open or no tokens to send');
        return this.tokens?.length || 0;
@@ -190,86 +152,63 @@ class TtsStreamingBuffer extends Emitter {
      if (this._connectionStatus === TtsStreamingConnectionStatus.NotConnected ||
        this._connectionStatus === TtsStreamingConnectionStatus.Failed) {
        this.logger.debug('TtsStreamingBuffer:_feedTokens TTS stream is not connected');
-        return this.tokens.length;
+        return;
      }

      if (this._connectionStatus === TtsStreamingConnectionStatus.Connecting) {
        this.logger.debug('TtsStreamingBuffer:_feedTokens TTS stream is not ready, waiting for connect');
-        return this.tokens.length;
-      }
-
-      /**
-       * Rules:
-       * 1. If this is our first send, we must have at least N words
-       * 2. Otherwise, must EITHER have N words OR be the ending of a sentence
-       *
-       * When sending, send the max size possible, capped at a limit to avoid overwhelming the server.
-       */
-
-      /* must have at least N words, or be the ending of a sentence */
-      const words = this.tokens.split(' ').length;
-      if (words < MIN_INITIAL_WORDS) {
-        const endsWithPunctuation = /[.!?]$/.test(this.tokens);
-        if (!endsWithPunctuation || this._countSendsInThisTurn === 0) {
-          this.logger.debug(`TtsStreamingBuffer:_feedTokens: only ${words} words to send, waiting for more`);
-          return this.tokens.length;
-        }
+        return;
      }

+      /* must send at least one sentence */
      const limit = Math.min(MAX_CHUNK_SIZE, this.tokens.length);
      let chunkEnd = findSentenceBoundary(this.tokens, limit);

-      if (chunkEnd === -1) {
-        this.logger.debug('TtsStreamingBuffer:_feedTokens: no sentence boundary found, look for word boundary');
-        chunkEnd = findWordBoundary(this.tokens, limit);
-      }
-
-      if (chunkEnd === -1) {
-        chunkEnd = limit;
+      if (chunkEnd <= 0) {
+        if (handlingTimeout) {
+          /* on a timeout we've left some tokens sitting around, so be more aggressive now in sending them */
+          chunkEnd = findWordBoundary(this.tokens, limit);
+          if (chunkEnd <= 0) {
+            this.logger.debug('TtsStreamingBuffer:_feedTokens: no word boundary found');
+            this._setTimerIfNeeded();
+            return;
+          }
+        }
+        else {
+          /* if we just received tokens, we wont send unless we have at least a full sentence */
+          this.logger.debug('TtsStreamingBuffer:_feedTokens: no sentence boundary found');
+          this._setTimerIfNeeded();
+          return;
+        }
      }

      const chunk = this.tokens.slice(0, chunkEnd);
-      this.tokens = this.tokens.slice(chunkEnd); // Remove sent chunk
+      this.tokens = this.tokens.slice(chunkEnd);

      /* freeswitch looks for sequence of 2 newlines to determine end of message, so insert a space */
      const modifiedChunk = chunk.replace(/\n\n/g, '\n \n');
+      await this._api(this.ep, [this.ep.uuid, 'send', modifiedChunk]);
+      this.logger.debug(`TtsStreamingBuffer:_feedTokens: sent ${chunk.length}, remaining: ${this.tokens.length}`);

-      if (modifiedChunk.length > 0) {
-        try {
-          this._countSendsInThisTurn++;
-          this.logger.debug({tokens: modifiedChunk},
-            `TtsStreamingBuffer:_feedTokens: sending tokens, in send#${this._countSendsInThisTurn}`);
-          await this._api(this.ep, [this.ep.uuid, 'send', modifiedChunk]);
-        } catch (err) {
-          this.logger.info({err}, 'TtsStreamingBuffer:_feedTokens Error sending TTS chunk');
-        }
-
-        this.logger.debug(`TtsStreamingBuffer:_feedTokens: sent ${chunk.length}, remaining: ${this.tokens.length}`);
-
-        if (this.isFull && this.tokens.length <= LOW_WATER_BUFFER_SIZE) {
-          this.logger.info('TtsStreamingBuffer:_feedTokens TTS streaming buffer is no longer full');
-          this._isFull = false;
-          this.emit(TtsStreamingEvents.Resume);
-        }
+      if (this.isFull && this.tokens.length <= LOW_WATER_BUFFER_SIZE) {
+        this.logger.info('TtsStreamingBuffer:_feedTokens TTS streaming buffer is no longer full');
+        this._isFull = false;
+        this.emit(TtsStreamingEvents.Resume);
      }
    } catch (err) {
      this.logger.info({err}, 'TtsStreamingBuffer:_feedTokens Error sending TTS chunk');
      this.tokens = '';
    }

-    if (0 === this.tokens.length && this.timer) {
-      clearTimeout(this.timer);
-      this.timer = null;
-    }
-
-    return this.tokens.length;
+    return;
  }

  async _api(ep, args) {
    const apiCmd = `uuid_${this.vendor}_tts_streaming`;
    const res = await ep.api(apiCmd, `^^|${args.join('|')}`);
    if (!res.body?.startsWith('+OK')) {
-      throw new Error({args}, `Error calling ${apiCmd}: ${res.body}`);
+      this.logger.info({args}, `Error calling ${apiCmd}: ${res.body}`);
+      throw new Error(`Error calling ${apiCmd}: ${res.body}`);
    }
  }

@@ -292,6 +231,18 @@ class TtsStreamingBuffer extends Emitter {
    }
  }

+  _setTimerIfNeeded() {
+    if (this.tokens.length > 0 && !this.timer) {
+      this.timer = setTimeout(this._onTimeout.bind(this), TIMEOUT_RETRY_MSECS);
+    }
+  }
+
+  _onTimeout() {
+    this.logger.info('TtsStreamingBuffer:_onTimeout');
+    this.timer = null;
+    this._feedTokens(true);
+  }
+
  _onTtsEmpty(vendor) {
    this.emit(TtsStreamingEvents.Empty, {vendor});
  }
@@ -323,4 +274,36 @@ class TtsStreamingBuffer extends Emitter {
  }
 }

+const findSentenceBoundary = (text, limit) => {
+  // Match traditional sentence boundaries or double newlines
+  const sentenceEndRegex = /[.!?](?=\s|$)|\n\n/g;
+  let lastSentenceBoundary = -1;
+  let match;
+
+  while ((match = sentenceEndRegex.exec(text)) && match.index < limit) {
+    const precedingText = text.slice(0, match.index).trim(); // Extract text before the match and trim whitespace
+    if (precedingText.length > 0) { // Check if there's actual content
+      if (
+        match[0] === '\n\n' || // It's a double newline
+        (match.index === 0 || !/\d$/.test(text[match.index - 1])) // Standard punctuation rules
+      ) {
+        lastSentenceBoundary = match.index + (match[0] === '\n\n' ? 2 : 1); // Include the boundary
+      }
+    }
+  }
+
+  return lastSentenceBoundary;
+};
+
+const findWordBoundary = (text, limit) => {
+  const wordBoundaryRegex = /\s+/g;
+  let lastWordBoundary = -1;
+  let match;
+
+  while ((match = wordBoundaryRegex.exec(text)) && match.index < limit) {
+    lastWordBoundary = match.index;
+  }
+  return lastWordBoundary;
+};
+
 module.exports = TtsStreamingBuffer;