fix: split ssml to correct chunks (#225)

* fix: split ssml to correct chunks

* fix: split ssml to correct chunks

* fixed: eslint

* fixed: eslint

* fixed: add comment to testcase

* fixed: review comments

* fixed: review comments

* fixed: review comments

* fixed: review comments

* fixed: review comments

Co-authored-by: Quan HL <quanluuhoang8@gmail.com>
This commit is contained in:
Hoan Luu Huu
2023-01-24 21:48:31 +07:00
committed by GitHub
parent 8c0044a378
commit 088316d266
6 changed files with 78 additions and 87 deletions

View File

@@ -1,96 +1,26 @@
const Task = require('./task');
const {TaskName, TaskPreconditions} = require('../utils/constants');
const pollySSMLSplit = require('polly-ssml-split');
const breakLengthyTextIfNeeded = (logger, text) => {
const chunkSize = 1000;
if (text.length <= chunkSize) return [text];
const result = [];
const isSSML = text.startsWith('<speak>');
let startPos = 0;
let charPos = isSSML ? 7 : 0; // skip <speak>
let tag;
//logger.debug({isSSML}, `breakLengthyTextIfNeeded: handling text of length ${text.length}`);
while (startPos + charPos < text.length) {
if (isSSML && !tag && text[startPos + charPos] === '<') {
const tagStartPos = ++charPos;
while (startPos + charPos < text.length) {
if (text[startPos + charPos] === '>') {
if (text[startPos + charPos - 1] === '\\') tag = null;
else if (!tag) tag = text.substring(startPos + tagStartPos, startPos + charPos - 1);
break;
}
if (!tag) {
const c = text[startPos + charPos];
if (c === ' ') {
tag = text.substring(startPos + tagStartPos, startPos + charPos);
//logger.debug(`breakLengthyTextIfNeeded: enter tag ${tag} (space)`);
break;
}
}
charPos++;
}
if (tag) {
//search for end of tag
//logger.debug(`breakLengthyTextIfNeeded: searching forward for </${tag}>`);
const e1 = text.indexOf(`</${tag}>`, startPos + charPos);
const e2 = text.indexOf('/>', startPos + charPos);
const tagEndPos = e1 === -1 ? e2 : e2 === -1 ? e1 : Math.min(e1, e2);
if (tagEndPos === -1) {
//logger.debug(`breakLengthyTextIfNeeded: exit tag ${tag} not found, exiting`);
} else {
//logger.debug(`breakLengthyTextIfNeeded: exit tag ${tag} found at ${tagEndPos}`);
charPos = tagEndPos + 1;
}
tag = null;
}
continue;
}
if (charPos < chunkSize) {
charPos++;
continue;
}
// start looking for a good break point
let chunkIt = false;
const a = text[startPos + charPos];
const b = text[startPos + charPos + 1];
if (/[\.!\?]/.test(a) && /\s/.test(b)) {
//logger.debug('breakLengthyTextIfNeeded: breaking at sentence end');
chunkIt = true;
}
if (chunkIt) {
charPos++;
const chunk = text.substr(startPos, charPos);
if (isSSML) {
result.push(0 === startPos ? `${chunk}</speak>` : `<speak>${chunk}</speak>`);
}
else result.push(chunk);
charPos = 0;
startPos += chunk.length;
//logger.debug({chunk: result[result.length - 1]},
// `breakLengthyTextIfNeeded: chunked; new starting pos ${startPos}`);
}
else charPos++;
if (text.length <= chunkSize || !isSSML) return [text];
const options = {
// MIN length
softLimit: 100,
// MAX length, exclude 15 characters <speak></speak>
hardLimit: chunkSize - 15,
// Set of extra split characters (Optional property)
extraSplitChars: ',;!?',
};
pollySSMLSplit.configure(options);
try {
return pollySSMLSplit.split(text);
} catch (err) {
logger.info({err}, 'Error spliting SSML long text');
return [text];
}
// final chunk
if (startPos < text.length) {
const chunk = text.substr(startPos);
if (isSSML) {
result.push(0 === startPos ? `${chunk}</speak>` : `<speak>${chunk}`);
}
else result.push(chunk);
//logger.debug({chunk: result[result.length - 1]},
// `breakLengthyTextIfNeeded: final chunk; starting pos ${startPos} length ${chunk.length}`);
}
return result;
};
class TaskSay extends Task {