punctuation for microsoft (#566)

* punctuation for microsoft

* wip
This commit is contained in:
Hoan Luu Huu
2023-12-18 20:38:05 +07:00
committed by GitHub
parent bcb4bf43bf
commit 30977b309c
3 changed files with 10 additions and 7 deletions

View File

@@ -590,7 +590,8 @@ class TaskGather extends SttTask {
return;
}
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language, this.shortUtterance);
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language,
this.shortUtterance, this.data.recognizer.punctuation);
if (evt.alternatives.length === 0) {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
return;

View File

@@ -305,7 +305,8 @@ class TaskTranscribe extends SttTask {
}
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - before normalization');
evt = this.normalizeTranscription(evt, this.vendor, channel, this.language);
evt = this.normalizeTranscription(evt, this.vendor, channel, this.language, undefined,
this.data.recognizer.punctuation);
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription');
if (evt.alternatives.length === 0) {
this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, continue listening');

View File

@@ -338,19 +338,20 @@ const normalizeNuance = (evt, channel, language) => {
};
};
const normalizeMicrosoft = (evt, channel, language) => {
const normalizeMicrosoft = (evt, channel, language, punctuation = true) => {
const copy = JSON.parse(JSON.stringify(evt));
const nbest = evt.NBest;
const language_code = evt.PrimaryLanguage?.Language || language;
const alternatives = nbest ? nbest.map((n) => {
return {
confidence: n.Confidence,
transcript: n.Display
// remove all punctuation if needed
transcript: punctuation ? n.Display : n.Display.replace(/\p{P}/gu, '')
};
}) :
[
{
transcript: evt.DisplayText || evt.Text
transcript: punctuation ? evt.DisplayText || evt.Text : (evt.DisplayText || evt.Text).replace(/\p{P}/gu, '')
}
];
@@ -400,14 +401,14 @@ const normalizeAssemblyAi = (evt, channel, language) => {
};
module.exports = (logger) => {
const normalizeTranscription = (evt, vendor, channel, language, shortUtterance) => {
const normalizeTranscription = (evt, vendor, channel, language, shortUtterance, punctuation) => {
//logger.debug({ evt, vendor, channel, language }, 'normalizeTranscription');
switch (vendor) {
case 'deepgram':
return normalizeDeepgram(evt, channel, language, shortUtterance);
case 'microsoft':
return normalizeMicrosoft(evt, channel, language);
return normalizeMicrosoft(evt, channel, language, punctuation);
case 'google':
return normalizeGoogle(evt, channel, language);
case 'aws':