From 1ad0261336978e876ab868f3d9814cac58c9efa8 Mon Sep 17 00:00:00 2001
From: Vinod Dharashive <vdharashive@gmail.com>
Date: Mon, 8 Dec 2025 21:14:20 +0530
Subject: [PATCH] Enhance TTS sentence boundary detection for Arabic and
 Japanese (#1464)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update sentenceEndRegex so that each of the following is treated as a sentence boundary: ASCII ., ! and ? when followed by whitespace or end-of-text; the Arabic question mark (؟) and full stop (۔) under that same whitespace-or-end-of-text rule; the Japanese 。, ！ and ？ regardless of the character that follows; and double newlines (\n\n). This improves streaming chunking for mixed-language content.
---
 lib/utils/tts-streaming-buffer.js | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/lib/utils/tts-streaming-buffer.js b/lib/utils/tts-streaming-buffer.js
index b397aeeb..6745121f 100644
--- a/lib/utils/tts-streaming-buffer.js
+++ b/lib/utils/tts-streaming-buffer.js
@@ -437,7 +437,15 @@ class TtsStreamingBuffer extends Emitter {
 
 const findSentenceBoundary = (text, limit) => {
   // Look for punctuation or double newline that signals sentence end.
-  const sentenceEndRegex = /[.!?](?=\s|$)|\n\n/g;
+  // Includes:
+  //   - ASCII: . ! ?
+  //   - Arabic: ؟ (question mark), ۔ (full stop)
+  //   - Japanese: 。 (full stop), ！, ？ (full-width exclamation/question)
+  //
+  // For languages that use spaces between sentences, we still require
+  // whitespace or end-of-string after the mark. For Japanese (no spaces),
+  // we treat the punctuation itself as a boundary regardless of following char.
+  const sentenceEndRegex = /[.!?؟۔](?=\s|$)|[。！？]|\n\n/g;
   let lastSentenceBoundary = -1;
   let match;
   while ((match = sentenceEndRegex.exec(text)) && match.index < limit) {