fix transcribe fixes for speechmatics (#978)

* fix transcribe fixes for speechmatics

* update to verb-specs with fixes for speechmatics

* add support for speechmatics translation

* add handlers for receiving translations

* call translation hookd

* gather: no need to restart speechmatics after a final transcript during continuous asr

* graceful shutdown

* wip

* wip

* wip

* wip

* wip
This commit is contained in:
Dave Horton
2024-11-16 10:21:04 -05:00
committed by GitHub
parent 27f3a4b520
commit c1330d4651
8 changed files with 82 additions and 18 deletions

17
app.js
View File

@@ -100,12 +100,19 @@ createHttpListener(logger, srf)
});
setInterval(async() => {
const monInterval = setInterval(async() => {
srf.locals.stats.gauge('fs.sip.calls.count', sessionTracker.count);
// Checking system log level
const systemInformation = await srf.locals.dbHelpers.lookupSystemInformation();
if (systemInformation && systemInformation.log_level) {
logger.level = systemInformation.log_level;
try {
const systemInformation = await srf.locals.dbHelpers.lookupSystemInformation();
if (systemInformation && systemInformation.log_level) {
logger.level = systemInformation.log_level;
}
} catch (err) {
if (process.env.NODE_ENV === 'test') {
clearInterval(monInterval);
logger.error('all tests complete');
}
else logger.error({err}, 'Error checking system log level in database');
}
}, 20000);

View File

@@ -846,7 +846,8 @@ class TaskGather extends SttTask {
this._startAsrTimer();
/* some STT engines will keep listening after a final response, so no need to restart */
if (!['soniox', 'aws', 'microsoft', 'deepgram'].includes(this.vendor)) this._startTranscribing(ep);
if (!['soniox', 'aws', 'microsoft', 'deepgram', 'speechmatics']
.includes(this.vendor)) this._startTranscribing(ep);
}
else {
/* this was removed to fix https://github.com/jambonz/jambonz-feature-server/issues/783 */

View File

@@ -27,6 +27,7 @@ class TaskTranscribe extends SttTask {
super(logger, opts, parentTask);
this.transcriptionHook = this.data.transcriptionHook;
this.translationHook = this.data.translationHook;
this.earlyMedia = this.data.earlyMedia === true || (parentTask && parentTask.earlyMedia);
if (this.data.recognizer) {
@@ -302,7 +303,9 @@ class TaskTranscribe extends SttTask {
case 'speechmatics':
this.bugname = `${this.bugname_prefix}speechmatics_transcribe`;
this.addCustomEventListener(
ep, SpeechmaticsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
ep, SpeechmaticsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep, channel));
this.addCustomEventListener(
ep, SpeechmaticsTranscriptionEvents.Translation, this._onTranslation.bind(this, cs, ep, channel));
this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.Info,
this._onSpeechmaticsInfo.bind(this, cs, ep));
this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.RecognitionStarted,
@@ -441,7 +444,7 @@ class TaskTranscribe extends SttTask {
this._startAsrTimer(channel);
/* some STT engines will keep listening after a final response, so no need to restart */
if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google']
if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google', 'speechmatics']
.includes(this.vendor)) this._startTranscribing(cs, ep, channel);
}
else {
@@ -466,7 +469,7 @@ class TaskTranscribe extends SttTask {
this._resolve(channel, evt);
/* some STT engines will keep listening after a final response, so no need to restart */
if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google'].includes(this.vendor) &&
if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google', 'speechmatics'].includes(this.vendor) &&
!this.vendor.startsWith('custom:')) {
this.logger.debug('TaskTranscribe:_onTranscription - restarting transcribe');
this._startTranscribing(cs, ep, channel);
@@ -492,6 +495,47 @@ class TaskTranscribe extends SttTask {
}
}
async _onTranslation(_cs, _ep, channel, evt, _fsEvent) {
this.logger.debug({evt}, 'TaskTranscribe:_onTranslation');
if (this.translationHook && evt.results?.length > 0) {
try {
const b3 = this.getTracingPropagation();
const httpHeaders = b3 && {b3};
const payload = {
...this.cs.callInfo,
...httpHeaders,
translation: {
channel,
language: evt.language,
translation: evt.results[0].content
}
};
this.logger.debug({payload}, 'sending translationHook');
const json = await this.cs.requestor.request('verb:hook', this.translationHook, payload);
this.logger.info({json}, 'completed translationHook');
if (json && Array.isArray(json) && !this.parentTask) {
const makeTask = require('./make_task');
const tasks = normalizeJambones(this.logger, json).map((tdata) => makeTask(this.logger, tdata));
if (tasks && tasks.length > 0) {
this.logger.info({tasks: tasks}, `${this.name} replacing application with ${tasks.length} tasks`);
this.cs.replaceApplication(tasks);
}
}
} catch (err) {
this.logger.info(err, 'TranscribeTask:_onTranslation error');
}
if (this.parentTask) {
this.parentTask.emit('translation', evt);
}
}
if (this.killed) {
this.logger.debug('TaskTranscribe:_onTranslation exiting after receiving final transcription');
this._clearTimer();
this.notifyTaskDone();
}
}
async _resolve(channel, evt) {
if (evt.is_final) {
/* we've got a final transcript, so end the otel child span for this channel */
@@ -671,7 +715,7 @@ class TaskTranscribe extends SttTask {
this.logger.debug({evt}, 'TaskGather:_onSpeechmaticsInfo');
}
async _onSpeechmaticsErrror(cs, _ep, evt) {
async _onSpeechmaticsError(cs, _ep, evt) {
// eslint-disable-next-line no-unused-vars
const {message, ...e} = evt;
this._onVendorError(cs, _ep, {error: JSON.stringify(e)});

View File

@@ -129,6 +129,7 @@
},
"SpeechmaticsTranscriptionEvents": {
"Transcription": "speechmatics_transcribe::transcription",
"Translation": "speechmatics_transcribe::translation",
"Info": "speechmatics_transcribe::info",
"RecognitionStarted": "speechmatics_transcribe::recognition_started",
"ConnectFailure": "speechmatics_transcribe::connect_failed",

View File

@@ -109,6 +109,8 @@ const stickyVars = {
'SPEECHMATICS_HOST',
'SPEECHMATICS_PATH',
'SPEECHMATICS_SPEECH_HINTS',
'SPEECHMATICS_TRANSLATION_LANGUAGES',
'SPEECHMATICS_TRANSLATION_PARTIALS'
]
};
@@ -937,11 +939,18 @@ module.exports = (logger) => {
};
}
else if ('speechmatics' === vendor) {
const {speechmaticsOptions = {}} = rOpts;
opts = {
...opts,
...(sttCredentials.api_key) && {SPEECHMATICS_API_KEY: sttCredentials.api_key},
...(sttCredentials.speechmatics_stt_uri) && {SPEECHMATICS_HOST: sttCredentials.speechmatics_stt_uri},
...(rOpts.hints?.length > 0 && {SPEECHMATICS_SPEECH_HINTS: rOpts.hints.join(',')}),
...(speechmaticsOptions.translation_config &&
{
SPEECHMATICS_TRANSLATION_LANGUAGES: speechmaticsOptions.translation_config.target_languages.join(','),
SPEECHMATICS_TRANSLATION_PARTIALS: speechmaticsOptions.translation_config.enable_partials ? 1 : 0
}
),
};
}
else if (vendor.startsWith('custom:')) {

14
package-lock.json generated
View File

@@ -18,7 +18,7 @@
"@jambonz/speech-utils": "^0.1.22",
"@jambonz/stats-collector": "^0.1.10",
"@jambonz/time-series": "^0.2.9",
"@jambonz/verb-specifications": "^0.0.83",
"@jambonz/verb-specifications": "^0.0.85",
"@opentelemetry/api": "^1.8.0",
"@opentelemetry/exporter-jaeger": "^1.23.0",
"@opentelemetry/exporter-trace-otlp-http": "^0.50.0",
@@ -1581,9 +1581,9 @@
}
},
"node_modules/@jambonz/verb-specifications": {
"version": "0.0.83",
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.83.tgz",
"integrity": "sha512-3m1o3gnWw1yEwNfkZ6IIyUhdUqsQ3aWBNoWJHswe4udvVbEIxPHi+8jKGTJVF2vxddzwfOWp7RmMCV9RmKWzkw==",
"version": "0.0.85",
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.85.tgz",
"integrity": "sha512-NQRXcfamrs3lRGy/1pAaAIuZcZPj4zjwUTYf7Sv+It0fuzB4KWbiqL9yeSq61qtCX2AutwRsV4WA8OZQYfLdgw==",
"dependencies": {
"debug": "^4.3.4",
"pino": "^8.8.0"
@@ -10751,9 +10751,9 @@
}
},
"@jambonz/verb-specifications": {
"version": "0.0.83",
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.83.tgz",
"integrity": "sha512-3m1o3gnWw1yEwNfkZ6IIyUhdUqsQ3aWBNoWJHswe4udvVbEIxPHi+8jKGTJVF2vxddzwfOWp7RmMCV9RmKWzkw==",
"version": "0.0.85",
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.85.tgz",
"integrity": "sha512-NQRXcfamrs3lRGy/1pAaAIuZcZPj4zjwUTYf7Sv+It0fuzB4KWbiqL9yeSq61qtCX2AutwRsV4WA8OZQYfLdgw==",
"requires": {
"debug": "^4.3.4",
"pino": "^8.8.0"

View File

@@ -34,7 +34,7 @@
"@jambonz/speech-utils": "^0.1.22",
"@jambonz/stats-collector": "^0.1.10",
"@jambonz/time-series": "^0.2.9",
"@jambonz/verb-specifications": "^0.0.83",
"@jambonz/verb-specifications": "^0.0.85",
"@opentelemetry/api": "^1.8.0",
"@opentelemetry/exporter-jaeger": "^1.23.0",
"@opentelemetry/exporter-trace-otlp-http": "^0.50.0",

View File

@@ -351,6 +351,8 @@ speech_credential_sid CHAR(36) NOT NULL,
model VARCHAR(512) NOT NULL,
reported_usage ENUM('REPORTED_USAGE_UNSPECIFIED','REALTIME','OFFLINE') DEFAULT 'REALTIME',
name VARCHAR(64) NOT NULL,
voice_cloning_key MEDIUMTEXT,
use_voice_cloning_key BOOLEAN DEFAULT false,
PRIMARY KEY (google_custom_voice_sid)
);