From 1ad0261336978e876ab868f3d9814cac58c9efa8 Mon Sep 17 00:00:00 2001 From: Vinod Dharashive Date: Mon, 8 Dec 2025 21:14:20 +0530 Subject: [PATCH] Enhance TTS sentence boundary detection for Arabic and Japanese (#1464) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update sentenceEndRegex to treat the following as sentence boundaries: ASCII .!? followed by whitespace or end-of-text; Arabic question mark (؟) and full stop (۔) with the same rule; Japanese 。, ！, ？ treated as boundaries regardless of following character; and double newlines (\n\n). This improves streaming chunking for mixed-language content. --- lib/utils/tts-streaming-buffer.js | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/utils/tts-streaming-buffer.js b/lib/utils/tts-streaming-buffer.js index b397aeeb..6745121f 100644 --- a/lib/utils/tts-streaming-buffer.js +++ b/lib/utils/tts-streaming-buffer.js @@ -437,7 +437,15 @@ class TtsStreamingBuffer extends Emitter { const findSentenceBoundary = (text, limit) => { // Look for punctuation or double newline that signals sentence end. - const sentenceEndRegex = /[.!?](?=\s|$)|\n\n/g; + // Includes: + // - ASCII: . ! ? + // - Arabic: ؟ (question mark), ۔ (full stop) + // - Japanese: 。 (full stop), ！, ？ (full-width exclamation/question) + // + // For languages that use spaces between sentences, we still require + // whitespace or end-of-string after the mark. For Japanese (no spaces), + // we treat the punctuation itself as a boundary regardless of following char. + const sentenceEndRegex = /[.!?؟۔](?=\s|$)|[。！？]|\n\n/g; let lastSentenceBoundary = -1; let match; while ((match = sentenceEndRegex.exec(text)) && match.index < limit) {