feed TTS in sentence chunks when streaming (#1013)

* feed TTS in sentence chunks when streaming

* tts streaming: treat a paragraph as a chunk of text, even it not ending with a line end character

* wip
This commit is contained in:
Dave Horton
2024-12-31 15:16:25 -05:00
committed by GitHub
parent c9f0481ca6
commit 25f1e65f63

View File

@@ -4,36 +4,10 @@ const {
TtsStreamingEvents,
TtsStreamingConnectionStatus
} = require('../utils/constants');
const FEED_INTERVAL = 2000;
const MAX_CHUNK_SIZE = 1800;
const HIGH_WATER_BUFFER_SIZE = 5000;
const LOW_WATER_BUFFER_SIZE = 1000;
const MIN_INITIAL_WORDS = 4;
const findSentenceBoundary = (text, limit) => {
const sentenceEndRegex = /[.!?](?=\s|$)/g;
let lastSentenceBoundary = -1;
let match;
while ((match = sentenceEndRegex.exec(text)) && match.index < limit) {
/* Ensure it's not a decimal point (e.g., "3.14") */
if (match.index === 0 || !/\d$/.test(text[match.index - 1])) {
lastSentenceBoundary = match.index + 1; // Include the punctuation
}
}
return lastSentenceBoundary;
};
const findWordBoundary = (text, limit) => {
const wordBoundaryRegex = /\s+/g;
let lastWordBoundary = -1;
let match;
while ((match = wordBoundaryRegex.exec(text)) && match.index < limit) {
lastWordBoundary = match.index;
}
return lastWordBoundary;
};
const TIMEOUT_RETRY_MSECS = 3000;
class TtsStreamingBuffer extends Emitter {
constructor(cs) {
@@ -46,7 +20,7 @@ class TtsStreamingBuffer extends Emitter {
this._isFull = false;
this._connectionStatus = TtsStreamingConnectionStatus.NotConnected;
this._flushPending = false;
this._countSendsInThisTurn = 0;
this.timer = null;
}
get isEmpty() {
@@ -130,19 +104,7 @@ class TtsStreamingBuffer extends Emitter {
);
this.tokens += (tokens || '');
const leftoverTokens = await this._feedTokens();
/* do we need to start a timer to periodically feed tokens to the endpoint? */
if (this.isEmpty && leftoverTokens > 0) {
assert(!this.timer);
this.timer = setInterval(async() => {
const remaining = await this._feedTokens();
if (remaining === 0) {
clearInterval(this.timer);
this.timer = null;
}
}, FEED_INTERVAL);
}
await this._feedTokens();
return {status: 'ok'};
}
@@ -155,7 +117,6 @@ class TtsStreamingBuffer extends Emitter {
return;
}
else if (this._connectionStatus === TtsStreamingConnectionStatus.Connected) {
this._countSendsInThisTurn = 0;
this._api(this.ep, [this.ep.uuid, 'flush'])
.catch((err) => this.logger.info({err},
`TtsStreamingBuffer:flush Error flushing TTS streaming: ${JSON.stringify(err)}`));
@@ -175,13 +136,14 @@ class TtsStreamingBuffer extends Emitter {
}
/**
* Send the next chunk of tokens to the endpoint (max 2000 chars)
* Return the number of tokens left in the buffer.
* Send tokens to the TTS engine in sentence chunks for best playout
*/
async _feedTokens() {
async _feedTokens(handlingTimeout = false) {
this.logger.debug({tokens: this.tokens}, '_feedTokens');
try {
/* are we in a state where we can feed tokens to the TTS? */
if (!this.cs.isTtsStreamOpen || !this.ep || !this.tokens) {
this.logger.debug('TTS stream is not open or no tokens to send');
return this.tokens?.length || 0;
@@ -190,86 +152,63 @@ class TtsStreamingBuffer extends Emitter {
if (this._connectionStatus === TtsStreamingConnectionStatus.NotConnected ||
this._connectionStatus === TtsStreamingConnectionStatus.Failed) {
this.logger.debug('TtsStreamingBuffer:_feedTokens TTS stream is not connected');
return this.tokens.length;
return;
}
if (this._connectionStatus === TtsStreamingConnectionStatus.Connecting) {
this.logger.debug('TtsStreamingBuffer:_feedTokens TTS stream is not ready, waiting for connect');
return this.tokens.length;
}
/**
* Rules:
* 1. If this is our first send, we must have at least N words
* 2. Otherwise, must EITHER have N words OR be the ending of a sentence
*
* When sending, send the max size possible, capped at a limit to avoid overwhelming the server.
*/
/* must have at least N words, or be the ending of a sentence */
const words = this.tokens.split(' ').length;
if (words < MIN_INITIAL_WORDS) {
const endsWithPunctuation = /[.!?]$/.test(this.tokens);
if (!endsWithPunctuation || this._countSendsInThisTurn === 0) {
this.logger.debug(`TtsStreamingBuffer:_feedTokens: only ${words} words to send, waiting for more`);
return this.tokens.length;
}
return;
}
/* must send at least one sentence */
const limit = Math.min(MAX_CHUNK_SIZE, this.tokens.length);
let chunkEnd = findSentenceBoundary(this.tokens, limit);
if (chunkEnd === -1) {
this.logger.debug('TtsStreamingBuffer:_feedTokens: no sentence boundary found, look for word boundary');
chunkEnd = findWordBoundary(this.tokens, limit);
}
if (chunkEnd === -1) {
chunkEnd = limit;
if (chunkEnd <= 0) {
if (handlingTimeout) {
/* on a timeout we've left some tokens sitting around, so be more aggressive now in sending them */
chunkEnd = findWordBoundary(this.tokens, limit);
if (chunkEnd <= 0) {
this.logger.debug('TtsStreamingBuffer:_feedTokens: no word boundary found');
this._setTimerIfNeeded();
return;
}
}
else {
/* if we just received tokens, we wont send unless we have at least a full sentence */
this.logger.debug('TtsStreamingBuffer:_feedTokens: no sentence boundary found');
this._setTimerIfNeeded();
return;
}
}
const chunk = this.tokens.slice(0, chunkEnd);
this.tokens = this.tokens.slice(chunkEnd); // Remove sent chunk
this.tokens = this.tokens.slice(chunkEnd);
/* freeswitch looks for sequence of 2 newlines to determine end of message, so insert a space */
const modifiedChunk = chunk.replace(/\n\n/g, '\n \n');
await this._api(this.ep, [this.ep.uuid, 'send', modifiedChunk]);
this.logger.debug(`TtsStreamingBuffer:_feedTokens: sent ${chunk.length}, remaining: ${this.tokens.length}`);
if (modifiedChunk.length > 0) {
try {
this._countSendsInThisTurn++;
this.logger.debug({tokens: modifiedChunk},
`TtsStreamingBuffer:_feedTokens: sending tokens, in send#${this._countSendsInThisTurn}`);
await this._api(this.ep, [this.ep.uuid, 'send', modifiedChunk]);
} catch (err) {
this.logger.info({err}, 'TtsStreamingBuffer:_feedTokens Error sending TTS chunk');
}
this.logger.debug(`TtsStreamingBuffer:_feedTokens: sent ${chunk.length}, remaining: ${this.tokens.length}`);
if (this.isFull && this.tokens.length <= LOW_WATER_BUFFER_SIZE) {
this.logger.info('TtsStreamingBuffer:_feedTokens TTS streaming buffer is no longer full');
this._isFull = false;
this.emit(TtsStreamingEvents.Resume);
}
if (this.isFull && this.tokens.length <= LOW_WATER_BUFFER_SIZE) {
this.logger.info('TtsStreamingBuffer:_feedTokens TTS streaming buffer is no longer full');
this._isFull = false;
this.emit(TtsStreamingEvents.Resume);
}
} catch (err) {
this.logger.info({err}, 'TtsStreamingBuffer:_feedTokens Error sending TTS chunk');
this.tokens = '';
}
if (0 === this.tokens.length && this.timer) {
clearTimeout(this.timer);
this.timer = null;
}
return this.tokens.length;
return;
}
async _api(ep, args) {
const apiCmd = `uuid_${this.vendor}_tts_streaming`;
const res = await ep.api(apiCmd, `^^|${args.join('|')}`);
if (!res.body?.startsWith('+OK')) {
throw new Error({args}, `Error calling ${apiCmd}: ${res.body}`);
this.logger.info({args}, `Error calling ${apiCmd}: ${res.body}`);
throw new Error(`Error calling ${apiCmd}: ${res.body}`);
}
}
@@ -292,6 +231,18 @@ class TtsStreamingBuffer extends Emitter {
}
}
_setTimerIfNeeded() {
if (this.tokens.length > 0 && !this.timer) {
this.timer = setTimeout(this._onTimeout.bind(this), TIMEOUT_RETRY_MSECS);
}
}
_onTimeout() {
this.logger.info('TtsStreamingBuffer:_onTimeout');
this.timer = null;
this._feedTokens(true);
}
_onTtsEmpty(vendor) {
this.emit(TtsStreamingEvents.Empty, {vendor});
}
@@ -323,4 +274,36 @@ class TtsStreamingBuffer extends Emitter {
}
}
const findSentenceBoundary = (text, limit) => {
// Match traditional sentence boundaries or double newlines
const sentenceEndRegex = /[.!?](?=\s|$)|\n\n/g;
let lastSentenceBoundary = -1;
let match;
while ((match = sentenceEndRegex.exec(text)) && match.index < limit) {
const precedingText = text.slice(0, match.index).trim(); // Extract text before the match and trim whitespace
if (precedingText.length > 0) { // Check if there's actual content
if (
match[0] === '\n\n' || // It's a double newline
(match.index === 0 || !/\d$/.test(text[match.index - 1])) // Standard punctuation rules
) {
lastSentenceBoundary = match.index + (match[0] === '\n\n' ? 2 : 1); // Include the boundary
}
}
}
return lastSentenceBoundary;
};
const findWordBoundary = (text, limit) => {
const wordBoundaryRegex = /\s+/g;
let lastWordBoundary = -1;
let match;
while ((match = wordBoundaryRegex.exec(text)) && match.index < limit) {
lastWordBoundary = match.index;
}
return lastWordBoundary;
};
module.exports = TtsStreamingBuffer;