punctuation for microsoft (#566)

* punctuation for microsoft

* wip
This commit is contained in:
Hoan Luu Huu
2023-12-18 20:38:05 +07:00
committed by GitHub
parent bcb4bf43bf
commit 30977b309c
3 changed files with 10 additions and 7 deletions

View File

@@ -590,7 +590,8 @@ class TaskGather extends SttTask {
return; return;
} }
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language, this.shortUtterance); evt = this.normalizeTranscription(evt, this.vendor, 1, this.language,
this.shortUtterance, this.data.recognizer.punctuation);
if (evt.alternatives.length === 0) { if (evt.alternatives.length === 0) {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening'); this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
return; return;

View File

@@ -305,7 +305,8 @@ class TaskTranscribe extends SttTask {
} }
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - before normalization'); this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - before normalization');
evt = this.normalizeTranscription(evt, this.vendor, channel, this.language); evt = this.normalizeTranscription(evt, this.vendor, channel, this.language, undefined,
this.data.recognizer.punctuation);
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription'); this.logger.debug({evt}, 'TaskTranscribe:_onTranscription');
if (evt.alternatives.length === 0) { if (evt.alternatives.length === 0) {
this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, continue listening'); this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, continue listening');

View File

@@ -338,19 +338,20 @@ const normalizeNuance = (evt, channel, language) => {
}; };
}; };
const normalizeMicrosoft = (evt, channel, language) => { const normalizeMicrosoft = (evt, channel, language, punctuation = true) => {
const copy = JSON.parse(JSON.stringify(evt)); const copy = JSON.parse(JSON.stringify(evt));
const nbest = evt.NBest; const nbest = evt.NBest;
const language_code = evt.PrimaryLanguage?.Language || language; const language_code = evt.PrimaryLanguage?.Language || language;
const alternatives = nbest ? nbest.map((n) => { const alternatives = nbest ? nbest.map((n) => {
return { return {
confidence: n.Confidence, confidence: n.Confidence,
transcript: n.Display // remove all punctuation if needed
transcript: punctuation ? n.Display : n.Display.replace(/\p{P}/gu, '')
}; };
}) : }) :
[ [
{ {
transcript: evt.DisplayText || evt.Text transcript: punctuation ? evt.DisplayText || evt.Text : (evt.DisplayText || evt.Text).replace(/\p{P}/gu, '')
} }
]; ];
@@ -400,14 +401,14 @@ const normalizeAssemblyAi = (evt, channel, language) => {
}; };
module.exports = (logger) => { module.exports = (logger) => {
const normalizeTranscription = (evt, vendor, channel, language, shortUtterance) => { const normalizeTranscription = (evt, vendor, channel, language, shortUtterance, punctuation) => {
//logger.debug({ evt, vendor, channel, language }, 'normalizeTranscription'); //logger.debug({ evt, vendor, channel, language }, 'normalizeTranscription');
switch (vendor) { switch (vendor) {
case 'deepgram': case 'deepgram':
return normalizeDeepgram(evt, channel, language, shortUtterance); return normalizeDeepgram(evt, channel, language, shortUtterance);
case 'microsoft': case 'microsoft':
return normalizeMicrosoft(evt, channel, language); return normalizeMicrosoft(evt, channel, language, punctuation);
case 'google': case 'google':
return normalizeGoogle(evt, channel, language); return normalizeGoogle(evt, channel, language);
case 'aws': case 'aws':