Feature/azure recognition (#46)

* add support for Microsoft speech recognition * update to a drachtio-fsmrf version that supports Microsoft STT * gather and transcribe now support Microsoft
2026-02-13 01:39:26 +00:00 · 2021-11-26 16:40:25 -06:00
parent fe1778e9ae
commit 1e93973419
6 changed files with 180 additions and 66 deletions
--- a/lib/tasks/gather.js
+++ b/lib/tasks/gather.js
@@ -3,7 +3,8 @@ const {
  TaskName,
  TaskPreconditions,
  GoogleTranscriptionEvents,
-  AwsTranscriptionEvents
+  AwsTranscriptionEvents,
+  AzureTranscriptionEvents
 } = require('../utils/constants');

 const makeTask = require('./make_task');
@@ -33,6 +34,12 @@ class TaskGather extends Task {
      this.vocabularyName = recognizer.vocabularyName;
      this.vocabularyFilterName = recognizer.vocabularyFilterName;
      this.filterMethod = recognizer.filterMethod;
+
+      /* microsoft options */
+      this.outputFormat = recognizer.outputFormat || 'simple';
+      this.profanityOption = recognizer.profanityOption || 'raw';
+      this.requestSnr = recognizer.requestSnr || false;
+      this.initialSpeechTimeoutMs = recognizer.initialSpeechTimeoutMs || 0;
    }

    this.digitBuffer = '';
@@ -63,7 +70,7 @@ class TaskGather extends Task {
    this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');
    if (this.needsStt && !this.sttCredentials) {
      const {writeAlerts, AlertType} = cs.srf.locals;
-      this.logger.info(`TaskGather:exec - ERROR stt using ${this.vendor} requested but not creds supplied`);
+      this.logger.info(`TaskGather:exec - ERROR stt using ${this.vendor} requested but creds not supplied`);
      writeAlerts({
        account_sid: cs.accountSid,
        alert_type: AlertType.STT_NOT_PROVISIONED,
@@ -106,6 +113,8 @@ class TaskGather extends Task {
    ep.removeCustomEventListener(GoogleTranscriptionEvents.Transcription);
    ep.removeCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance);
    ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
+    ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
+    ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
  }

  kill(cs) {
@@ -135,7 +144,9 @@ class TaskGather extends Task {
        GOOGLE_SPEECH_SINGLE_UTTERANCE: true,
        GOOGLE_SPEECH_MODEL: 'command_and_search'
      });
-      if (this.hints && this.hints.length > 1) opts.GOOGLE_SPEECH_HINTS = this.hints.join(',');
+      if (this.hints && this.hints.length > 1) {
+        opts.GOOGLE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
+      }
      if (this.altLanguages && this.altLanguages.length > 1) {
        opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
      }
@@ -145,22 +156,41 @@ class TaskGather extends Task {
      ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
      ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
    }
-    else {
+    else if (['aws', 'polly'].includes(this.vendor)) {
      if (this.vocabularyName) opts.AWS_VOCABULARY_NAME = this.vocabularyName;
      if (this.vocabularyFilterName) {
        opts.AWS_VOCABULARY_NAME = this.vocabularyFilterName;
        opts.AWS_VOCABULARY_FILTER_METHOD = this.filterMethod || 'mask';
      }
-      Object.assign(opts, {
-        AWS_ACCESS_KEY_ID: this.sttCredentials.accessKeyId,
-        AWS_SECRET_ACCESS_KEY: this.sttCredentials.secretAccessKey,
-        AWS_REGION: this.sttCredentials.region
-      });
+      if (this.sttCredentials) {
+        Object.assign(opts, {
+          AWS_ACCESS_KEY_ID: this.sttCredentials.accessKeyId,
+          AWS_SECRET_ACCESS_KEY: this.sttCredentials.secretAccessKey,
+          AWS_REGION: this.sttCredentials.region
+        });
+      }
      ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
    }
+    else if ('microsoft' === this.vendor) {
+      if (this.sttCredentials) {
+        Object.assign(opts, {
+          'AZURE_SUBSCRIPTION_KEY': this.sttCredentials.api_key,
+          'AZURE_REGION': this.sttCredentials.region
+        });
+      }
+      if (this.hints && this.hints.length > 1) {
+        opts.AZURE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
+      }
+      //if (this.requestSnr) opts.AZURE_REQUEST_SNR = 1;
+      //if (this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
+      if (this.initialSpeechTimeoutMs > 0) opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = this.initialSpeechTimeoutMs;
+      opts.AZURE_USE_OUTPUT_FORMAT_DETAILED = 1;
+
+      ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
+      ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, this._onNoSpeechDetected.bind(this, cs, ep));
+    }
    await ep.set(opts)
      .catch((err) => this.logger.info(err, 'Error setting channel variables'));
-
  }

  _startTranscribing(ep) {
@@ -208,11 +238,21 @@ class TaskGather extends Task {

  _onTranscription(cs, ep, evt) {
    if ('aws' === this.vendor && Array.isArray(evt) && evt.length > 0) evt = evt[0];
-    this.logger.debug(evt, 'TaskGather:_onTranscription');
-    const final = evt.is_final;
-    if (final) {
-      this._resolve('speech', evt);
+    if ('microsoft' === this.vendor) {
+      const nbest = evt.NBest;
+      const newEvent = {
+        is_final: evt.RecognitionStatus === 'Success',
+        alternatives: [
+          {
+            confidence: nbest[0].Confidence,
+            transcript: nbest[0].Display
+          }
+        ]
+      };
+      evt = newEvent;
    }
+    this.logger.debug(evt, 'TaskGather:_onTranscription');
+    if (evt.is_final) this._resolve('speech', evt);
    else if (this.partialResultHook) {
      this.cs.requestor.request(this.partialResultHook,  Object.assign({speech: evt}, this.cs.callInfo))
        .catch((err) => this.logger.info(err, 'GatherTask:_onTranscription error'));
@@ -225,6 +265,10 @@ class TaskGather extends Task {
    }
  }

+  _onNoSpeechDetected(cs, ep) {
+    this._resolve('timeout');
+  }
+
  async _resolve(reason, evt) {
    if (this.resolved) return;
    this.resolved = true;
--- a/lib/tasks/specs.json
+++ b/lib/tasks/specs.json
@@ -348,7 +348,7 @@
    "properties": {
      "vendor": {
        "type": "string",
-        "enum": ["google", "aws", "polly", "default"]
+        "enum": ["google", "aws", "polly", "microsoft", "default"]
      },
      "language": "string",
      "voice": "string",
@@ -365,7 +365,7 @@
    "properties": {
      "vendor": {
        "type": "string",
-        "enum": ["google", "aws", "default"]
+        "enum": ["google", "aws", "microsoft", "default"]
      },
      "language": "string",
      "hints": "array",
@@ -405,7 +405,24 @@
          "mask",
          "tag"
        ]
-      }
+      },
+      "outputFormat": {
+        "type": "string",
+        "enum": [
+          "simple",
+          "detailed"
+        ]
+      },
+      "profanityOption": {
+        "type": "string",
+        "enum": [
+          "masked",
+          "removed",
+          "raw"
+        ]
+      },
+      "requestSnr": "boolean",
+      "initialSpeechTimeoutMs": "number"
    },
    "required": [
      "vendor"
--- a/lib/tasks/transcribe.js
+++ b/lib/tasks/transcribe.js
@@ -3,6 +3,7 @@ const {
  TaskName,
  TaskPreconditions,
  GoogleTranscriptionEvents,
+  AzureTranscriptionEvents,
  AwsTranscriptionEvents
 } = require('../utils/constants');

@@ -38,6 +39,12 @@ class TaskTranscribe extends Task {
    this.vocabularyName = recognizer.vocabularyName;
    this.vocabularyFilterName = recognizer.vocabularyFilterName;
    this.filterMethod = recognizer.filterMethod;
+
+    /* microsoft options */
+    this.outputFormat = recognizer.outputFormat || 'simple';
+    this.profanityOption = recognizer.profanityOption || 'raw';
+    this.requestSnr = recognizer.requestSnr || false;
+    this.initialSpeechTimeoutMs = recognizer.initialSpeechTimeoutMs || 0;
  }

  get name() { return TaskName.Transcribe; }
@@ -53,7 +60,13 @@ class TaskTranscribe extends Task {

    try {
      if (!this.sttCredentials) {
-        // TODO: generate alert (actually should be done by cs.getSpeechCredentials)
+        const {writeAlerts, AlertType} = cs.srf.locals;
+        this.logger.info(`TaskTranscribe:exec - ERROR stt using ${this.vendor} requested but creds not supplied`);
+        writeAlerts({
+          account_sid: cs.accountSid,
+          alert_type: AlertType.STT_NOT_PROVISIONED,
+          vendor: this.vendor
+        }).catch((err) => this.logger.info({err}, 'Error generating alert for no stt'));
        throw new Error('no provisioned speech credentials for TTS');
      }
      await this._startTranscribing(cs, ep);
@@ -70,6 +83,8 @@ class TaskTranscribe extends Task {
    ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
    ep.removeCustomEventListener(AwsTranscriptionEvents.NoAudioDetected);
    ep.removeCustomEventListener(AwsTranscriptionEvents.MaxDurationExceeded);
+    ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
+    ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
  }

  async kill(cs) {
@@ -96,6 +111,8 @@ class TaskTranscribe extends Task {
    ep.addCustomEventListener(AwsTranscriptionEvents.NoAudioDetected, this._onNoAudio.bind(this, cs, ep));
    ep.addCustomEventListener(AwsTranscriptionEvents.MaxDurationExceeded,
      this._onMaxDurationExceeded.bind(this, cs, ep));
+    ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
+    ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, this._onNoAudio.bind(this, cs, ep));

    if (this.vendor === 'google') {
      if (this.sttCredentials) opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials);
@@ -164,6 +181,22 @@ class TaskTranscribe extends Task {
      await ep.set(opts)
        .catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing with aws'));
    }
+    else if (this.vendor === 'microsoft') {
+      Object.assign(opts, {
+        'AZURE_SUBSCRIPTION_KEY': this.sttCredentials.api_key,
+        'AZURE_REGION': this.sttCredentials.region
+      });
+      if (this.hints && this.hints.length > 1) {
+        opts.AZURE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
+      }
+      if (this.requestSnr) opts.AZURE_REQUEST_SNR = 1;
+      if (this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
+      if (this.initialSpeechTimeoutMs > 0) opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = this.initialSpeechTimeoutMs;
+      if (this.outputFormat !== 'simple') opts.AZURE_USE_OUTPUT_FORMAT_DETAILED = 1;
+
+      await ep.set(opts)
+        .catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing with azure'));
+    }
    await this._transcribe(ep);
  }

@@ -178,6 +211,20 @@ class TaskTranscribe extends Task {

  _onTranscription(cs, ep, evt) {
    if ('aws' === this.vendor && Array.isArray(evt) && evt.length > 0) evt = evt[0];
+    if ('microsoft' === this.vendor) {
+      const nbest = evt.NBest;
+      const alternatives = nbest.map((n) => {
+        return {
+          confidence: n.Confidence,
+          transcript: n.Display
+        };
+      });
+      const newEvent = {
+        is_final: evt.RecognitionStatus === 'Success',
+        alternatives
+      };
+      evt = newEvent;
+    }
    this.logger.debug(evt, 'TaskTranscribe:_onTranscription');

    this.cs.requestor.request(this.transcriptionHook, Object.assign({speech: evt}, this.cs.callInfo))