mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-19 04:17:44 +00:00
Enhance TTS sentence boundary detection for Arabic and Japanese (#1464)
Update sentenceEndRegex to treat the following as sentence boundaries: ASCII .!? followed by whitespace or end-of-text; Arabic question mark (؟) and full stop (۔) with the same rule; Japanese 。, !, ? treated as boundaries regardless of following character; and double newlines (\n\n). This improves streaming chunking for mixed-language content.
This commit is contained in:
@@ -437,7 +437,15 @@ class TtsStreamingBuffer extends Emitter {
|
||||
|
||||
const findSentenceBoundary = (text, limit) => {
|
||||
// Look for punctuation or double newline that signals sentence end.
|
||||
const sentenceEndRegex = /[.!?](?=\s|$)|\n\n/g;
|
||||
// Includes:
|
||||
// - ASCII: . ! ?
|
||||
// - Arabic: ؟ (question mark), ۔ (full stop)
|
||||
// - Japanese: 。 (full stop), !, ? (full-width exclamation/question)
|
||||
//
|
||||
// For languages that use spaces between sentences, we still require
|
||||
// whitespace or end-of-string after the mark. For Japanese (no spaces),
|
||||
// we treat the punctuation itself as a boundary regardless of following char.
|
||||
const sentenceEndRegex = /[.!?؟۔](?=\s|$)|[。!?]|\n\n/g;
|
||||
let lastSentenceBoundary = -1;
|
||||
let match;
|
||||
while ((match = sentenceEndRegex.exec(text)) && match.index < limit) {
|
||||
|
||||
Reference in New Issue
Block a user