mirror of https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 16:50:39 +00:00
Feature/vm detection (#137)
* initial changes for amd
* wip
* fix bug where transcripts were discarded
* a bit of refactoring, and adding support for avmd in config verb
* bug fixes
324  lib/utils/amd-utils.js  Normal file
@@ -0,0 +1,324 @@
const Emitter = require('events');
const {readFile} = require('fs');
const {
  GoogleTranscriptionEvents,
  AwsTranscriptionEvents,
  AzureTranscriptionEvents,
  AmdEvents,
  AvmdEvents
} = require('./constants');
const bugname = 'amd_bug';
const {VMD_HINTS_FILE} = process.env;
let voicemailHints = [];

const updateHints = (file, callback) => {
  readFile(file, 'utf8', (err, data) => {
    if (err) return callback(err);
    try {
      callback(null, JSON.parse(data));
    } catch (err) {
      callback(err);
    }
  });
};

if (VMD_HINTS_FILE) {
  updateHints(VMD_HINTS_FILE, (err, hints) => {
    /* return early on error so we don't clobber the hints with undefined */
    if (err) return console.error(err);
    voicemailHints = hints;

    /* if successful, update the hints every hour */
    setInterval(() => {
      updateHints(VMD_HINTS_FILE, (err, hints) => {
        if (err) return console.error(err);
        voicemailHints = hints;
      });
    }, 3600000);
  });
}

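/**
 * Amd - answering machine detection over a streaming STT session.
 * Emits AmdEvents (HumanDetected, MachineDetected, MachineStoppedSpeaking,
 * NoSpeechDetected, DecisionTimeout, ToneTimeout) as transcripts arrive;
 * the helpers exported below pair it with avmd beep detection.
 */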
class Amd extends Emitter {
  constructor(logger, cs, opts) {
    super();
    this.logger = logger;
    this.vendor = opts.recognizer?.vendor || cs.speechRecognizerVendor;
    if ('default' === this.vendor) this.vendor = cs.speechRecognizerVendor;

    this.language = opts.recognizer?.language || cs.speechRecognizerLanguage;
    if ('default' === this.language) this.language = cs.speechRecognizerLanguage;

    this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');

    if (!this.sttCredentials) throw new Error(`No speech credentials found for vendor ${this.vendor}`);

    this.thresholdWordCount = opts.thresholdWordCount || 9;
    const {normalizeTranscription} = require('./transcription-utils')(logger);
    this.normalizeTranscription = normalizeTranscription;

    const {
      noSpeechTimeoutMs = 5000,
      decisionTimeoutMs = 15000,
      toneTimeoutMs = 20000,
      greetingCompletionTimeoutMs = 2000
    } = opts.timers || {};
    this.noSpeechTimeoutMs = noSpeechTimeoutMs;
    this.decisionTimeoutMs = decisionTimeoutMs;
    this.toneTimeoutMs = toneTimeoutMs;
    this.greetingCompletionTimeoutMs = greetingCompletionTimeoutMs;

    this.beepDetected = false;
  }

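  /*
   * Timers:
   *  - noSpeechTimer: no transcript received within noSpeechTimeoutMs
   *  - decisionTimer: no human/machine decision within decisionTimeoutMs
   *  - toneTimer: no beep detected within toneTimeoutMs
   *  - greetingCompletionTimer: after MachineDetected, fires once the
   *    greeting has been quiet long enough (shortened to 1s after a beep)
   */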
  startDecisionTimer() {
    this.decisionTimer = setTimeout(this._onDecisionTimeout.bind(this), this.decisionTimeoutMs);
    this.noSpeechTimer = setTimeout(this._onNoSpeechTimeout.bind(this), this.noSpeechTimeoutMs);
    this.startToneTimer();
  }
  stopDecisionTimer() {
    this.decisionTimer && clearTimeout(this.decisionTimer);
  }
  stopNoSpeechTimer() {
    this.noSpeechTimer && clearTimeout(this.noSpeechTimer);
  }
  startToneTimer() {
    this.toneTimer = setTimeout(this._onToneTimeout.bind(this), this.toneTimeoutMs);
  }
  startGreetingCompletionTimer() {
    this.greetingCompletionTimer = setTimeout(
      this._onGreetingCompletionTimeout.bind(this),
      this.beepDetected ? 1000 : this.greetingCompletionTimeoutMs);
  }
  stopGreetingCompletionTimer() {
    this.greetingCompletionTimer && clearTimeout(this.greetingCompletionTimer);
  }
  restartGreetingCompletionTimer() {
    this.stopGreetingCompletionTimer();
    this.startGreetingCompletionTimer();
  }
  stopToneTimer() {
    this.toneTimer && clearTimeout(this.toneTimer);
  }
  stopAllTimers() {
    this.stopDecisionTimer();
    this.stopNoSpeechTimer();
    this.stopToneTimer();
    this.stopGreetingCompletionTimer();
  }
  _onDecisionTimeout() {
    this.emit(this.decision = AmdEvents.DecisionTimeout);
    this.stopNoSpeechTimer();
  }
  _onToneTimeout() {
    this.emit(AmdEvents.ToneTimeout);
  }
  _onNoSpeechTimeout() {
    this.emit(this.decision = AmdEvents.NoSpeechDetected);
    this.stopDecisionTimer();
  }
  _onGreetingCompletionTimeout() {
    this.emit(AmdEvents.MachineStoppedSpeaking);
  }

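  /*
   * Decision logic, applied to each normalized transcript:
   *  - transcript matches a configured voicemail hint => machine
   *  - final transcript shorter than thresholdWordCount => human
   *  - transcript of thresholdWordCount or more words => machine
   * After MachineDetected, further transcripts only serve to extend the
   * greeting-completion timer.
   */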
  evaluateTranscription(evt) {
    if (this.decision) {
      /* at this point we are only listening for the machine to stop speaking */
      if (this.decision === AmdEvents.MachineDetected) {
        this.restartGreetingCompletionTimer();
      }
      return;
    }
    this.stopNoSpeechTimer();

    this.logger.debug({evt}, 'Amd:evaluateTranscription - raw');
    const t = this.normalizeTranscription(evt, this.vendor, this.language);
    const hints = voicemailHints[this.language] || [];

    this.logger.debug({t}, 'Amd:evaluateTranscription - normalized');

    if (Array.isArray(t.alternatives) && t.alternatives.length > 0) {
      const wordCount = t.alternatives[0].transcript.split(' ').length;
      const final = t.is_final;

      const foundHint = hints.find((h) => t.alternatives[0].transcript.includes(h));
      if (foundHint) {
        /* we detected a common voice mail greeting */
        this.logger.debug(`Amd:evaluateTranscription: found hint ${foundHint}`);
        this.emit(this.decision = AmdEvents.MachineDetected, {
          reason: 'hint',
          hint: foundHint,
          language: t.language_code
        });
      }
      else if (final && wordCount < this.thresholdWordCount) {
        /* a short greeting is typically a human */
        this.emit(this.decision = AmdEvents.HumanDetected, {
          reason: 'short greeting',
          greeting: t.alternatives[0].transcript,
          language: t.language_code
        });
      }
      else if (wordCount >= this.thresholdWordCount) {
        /* a long greeting is typically a machine */
        this.emit(this.decision = AmdEvents.MachineDetected, {
          reason: 'long greeting',
          greeting: t.alternatives[0].transcript,
          language: t.language_code
        });
      }

      if (this.decision) {
        this.stopDecisionTimer();

        if (this.decision === AmdEvents.MachineDetected) {
          /* if we detected a machine, then wait for greeting to end */
          this.startGreetingCompletionTimer();
        }
      }
      return this.decision;
    }
  }
}

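/*
 * Exported helpers: startAmd creates an Amd instance on the endpoint
 * (ep.amd), sets vendor-specific STT channel variables, starts
 * transcription and avmd beep detection, and re-emits results on the
 * task as 'amd' events; stopAmd tears all of that down.
 */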
module.exports = (logger) => {
  const startTranscribing = async(cs, ep, task) => {
    const {vendor, language} = ep.amd;
    ep.startTranscription({
      vendor,
      language,
      interim: true,
      bugname
    }).catch((err) => {
      const {writeAlerts, AlertType} = cs.srf.locals;
      ep.amd = null;
      task.emit(AmdEvents.Error, err);
      logger.error(err, 'amd:_startTranscribing error');
      /* return the promise so the trailing catch sees writeAlerts failures */
      return writeAlerts({
        account_sid: cs.accountSid,
        alert_type: AlertType.STT_FAILURE,
        vendor: vendor,
        detail: err.message
      });
    }).catch((err) => logger.info({err}, 'Error generating alert for stt failure'));
  };

  const onEndOfUtterance = (cs, ep, task) => {
    logger.debug('amd:onEndOfUtterance');
    startTranscribing(cs, ep, task);
  };
  const onNoSpeechDetected = (cs, ep, task) => {
    logger.debug('amd:onNoSpeechDetected');
    ep.amd?.stopAllTimers();
    task.emit(AmdEvents.NoSpeechDetected);
  };
  const onTranscription = (cs, ep, task, evt, fsEvent) => {
    /* ignore transcripts from other media bugs on the channel;
       evaluateTranscription itself handles the post-decision case,
       where transcripts only extend the greeting-completion timer */
    if (fsEvent.getHeader('media-bugname') !== bugname) return;
    ep.amd?.evaluateTranscription(evt);
  };
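
  /* FreeSWITCH's mod_avmd fires a beep event when it detects a steady
     voicemail tone; treat that as strong evidence of a machine */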
  const onBeep = (cs, ep, task, evt, fsEvent) => {
    logger.debug({evt, fsEvent}, 'onBeep');
    const frequency = Math.floor(fsEvent.getHeader('Frequency'));
    const variance = Math.floor(fsEvent.getHeader('Frequency-variance'));
    task.emit('amd', {type: AmdEvents.ToneDetected, frequency, variance});
    if (ep.amd) {
      ep.amd.stopToneTimer();
      ep.amd.beepDetected = true;
    }
    ep.execute('avmd_stop').catch((err) => logger.info(err, 'Error stopping avmd'));
  };

  const startAmd = async(cs, ep, task, opts) => {
    const amd = ep.amd = new Amd(logger, cs, opts);
    const {vendor, language, sttCredentials} = amd;
    const sttOpts = {};
    const hints = voicemailHints[language] || [];

    /* set stt options */
    logger.info(`starting amd for vendor ${vendor} and language ${language}`);
    if ('google' === vendor) {
      sttOpts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(sttCredentials.credentials);
      sttOpts.GOOGLE_SPEECH_USE_ENHANCED = true;
      sttOpts.GOOGLE_SPEECH_HINTS = hints.join(',');
      if (opts.recognizer?.altLanguages) {
        sttOpts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = opts.recognizer.altLanguages.join(',');
      }
      ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, onTranscription.bind(null, cs, ep, task));
      ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, onEndOfUtterance.bind(null, cs, ep, task));
    }
    else if (['aws', 'polly'].includes(vendor)) {
      Object.assign(sttOpts, {
        AWS_ACCESS_KEY_ID: sttCredentials.accessKeyId,
        AWS_SECRET_ACCESS_KEY: sttCredentials.secretAccessKey,
        AWS_REGION: sttCredentials.region
      });
      ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, onTranscription.bind(null, cs, ep, task));
    }
    else if ('microsoft' === vendor) {
      Object.assign(sttOpts, {
        'AZURE_SUBSCRIPTION_KEY': sttCredentials.api_key,
        'AZURE_REGION': sttCredentials.region
      });
      sttOpts.AZURE_SPEECH_HINTS = hints.join(',');
      if (opts.recognizer?.altLanguages) {
        sttOpts.AZURE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = opts.recognizer.altLanguages.join(',');
      }
      sttOpts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = opts.resolveTimeoutMs || 20000;

      ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, onTranscription.bind(null, cs, ep, task));
      ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, onNoSpeechDetected.bind(null, cs, ep, task));
    }
    logger.debug({sttOpts}, 'startAmd: setting channel vars');
    await ep.set(sttOpts).catch((err) => logger.info(err, 'Error setting channel variables'));

    amd
      .on(AmdEvents.NoSpeechDetected, (evt) => {
        task.emit('amd', {type: AmdEvents.NoSpeechDetected, ...evt});
        ep.stopTranscription({vendor, bugname});
      })
      .on(AmdEvents.HumanDetected, (evt) => {
        task.emit('amd', {type: AmdEvents.HumanDetected, ...evt});
        ep.stopTranscription({vendor, bugname});
      })
      .on(AmdEvents.MachineDetected, (evt) => {
        task.emit('amd', {type: AmdEvents.MachineDetected, ...evt});
      })
      .on(AmdEvents.DecisionTimeout, (evt) => {
        task.emit('amd', {type: AmdEvents.DecisionTimeout, ...evt});
        ep.stopTranscription({vendor, bugname});
      })
      .on(AmdEvents.ToneTimeout, (evt) => {
        //task.emit('amd', {type: AmdEvents.ToneTimeout, ...evt});
        ep.execute('avmd_stop').catch((err) => logger.info(err, 'Error stopping avmd'));
      })
      .on(AmdEvents.MachineStoppedSpeaking, () => {
        task.emit('amd', {type: AmdEvents.MachineStoppedSpeaking});
        ep.stopTranscription({vendor, bugname});
      });

    /* start transcribing, and also listening for beep */
    amd.startDecisionTimer();
    startTranscribing(cs, ep, task);

    ep.addCustomEventListener(AvmdEvents.Beep, onBeep.bind(null, cs, ep, task));
    ep.execute('avmd_start').catch((err) => logger.info(err, 'Error starting avmd'));
  };

  const stopAmd = (ep, task) => {
    let vendor;
    if (ep.amd) {
      vendor = ep.amd.vendor;
      ep.amd.stopAllTimers();
      ep.amd = null;
    }

    if (ep.connected) {
      ep.stopTranscription({vendor, bugname})
        .catch((err) => logger.info(err, 'stopAmd: Error stopping transcription'));
      task.emit('amd', {type: AmdEvents.Stopped});
      ep.execute('avmd_stop').catch((err) => logger.info(err, 'Error stopping avmd'));
    }
    ep.removeCustomEventListener(AvmdEvents.Beep);
  };

  return {startAmd, stopAmd};
};
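
A minimal usage sketch (editor's illustration, not part of this commit): how a caller holding the same `logger`, `cs`, `ep`, and `task` objects might drive these helpers. The function name `detectAnsweringMachine` is hypothetical; the options and AmdEvents values come from the code above and the constants hunk below.

    const {AmdEvents} = require('./constants');

    async function detectAnsweringMachine(logger, cs, ep, task) {
      const {startAmd, stopAmd} = require('./amd-utils')(logger);
      task.on('amd', (evt) => {
        if (evt.type === AmdEvents.MachineStoppedSpeaking) {
          /* greeting is over: safe to play a message, then stop AMD */
          stopAmd(ep, task);
        }
        else if (evt.type === AmdEvents.HumanDetected) {
          stopAmd(ep, task);
        }
      });
      await startAmd(cs, ep, task, {
        thresholdWordCount: 9,
        timers: {noSpeechTimeoutMs: 5000, decisionTimeoutMs: 15000}
      });
    }
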
@@ -56,6 +56,9 @@
     "StableCall": "stable-call",
     "UnansweredCall": "unanswered-call"
   },
+  "AvmdEvents": {
+    "Beep": "avmd::beep"
+  },
   "GoogleTranscriptionEvents": {
     "Transcription": "google_transcribe::transcription",
     "EndOfUtterance": "google_transcribe::end_of_utterance",
@@ -125,6 +128,17 @@
     "RecordingOff": "recording_off",
     "RecordingPaused": "recording_paused"
   },
+  "AmdEvents": {
+    "NoSpeechDetected": "amd_no_speech_detected",
+    "HumanDetected": "amd_human_detected",
+    "MachineDetected": "amd_machine_detected",
+    "MachineStoppedSpeaking": "amd_machine_stopped_speaking",
+    "Error": "amd_error",
+    "DecisionTimeout": "amd_decision_timeout",
+    "ToneDetected": "amd_tone_detected",
+    "ToneTimeout": "amd_tone_timeout",
+    "Stopped": "amd_stopped"
+  },
   "MAX_SIMRINGS": 10,
   "BONG_TONE": "tone_stream://v=-7;%(100,0,941.0,1477.0);v=-7;>=2;+=.1;%(1400,0,350,440)",
   "FS_UUID_SET_NAME": "fsUUIDs"

@@ -122,7 +122,11 @@ class HttpRequestor extends BaseRequestor {
       timeout: HTTP_TIMEOUT,
       followRedirects: false
     });
-    if (![200, 202, 204].includes(statusCode)) throw new Error({statusCode});
+    if (![200, 202, 204].includes(statusCode)) {
+      const err = new Error();
+      err.statusCode = statusCode;
+      throw err;
+    }
     if (headers['content-type'].includes('application/json')) {
       buf = await body.json();
     }

33  lib/utils/transcription-utils.js  Normal file
@@ -0,0 +1,33 @@
module.exports = (logger) => {
  const normalizeTranscription = (evt, vendor, channel) => {
    if ('aws' === vendor && Array.isArray(evt) && evt.length > 0) evt = evt[0];
    if ('microsoft' === vendor) {
      const nbest = evt.NBest;
      /* `this` is not bound in this arrow function, so the original fallback
         (this.language) was always undefined; rely on Azure's reported language */
      const language_code = evt.PrimaryLanguage?.Language;
      const alternatives = nbest ? nbest.map((n) => {
        return {
          confidence: n.Confidence,
          transcript: n.Display
        };
      }) :
        [
          {
            transcript: evt.DisplayText || evt.Text
          }
        ];

      const newEvent = {
        is_final: evt.RecognitionStatus === 'Success',
        channel,
        language_code,
        alternatives
      };
      evt = newEvent;
    }
    evt.channel_tag = channel;
    //logger.debug({evt}, 'normalized transcription');
    return evt;
  };

  return {normalizeTranscription};
};
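
As an illustration (sample values are the editor's, not from the commit), the Microsoft branch turns an Azure-style payload into the common shape the Amd class consumes:

    const {normalizeTranscription} = require('./transcription-utils')(logger);
    const t = normalizeTranscription({
      RecognitionStatus: 'Success',
      PrimaryLanguage: {Language: 'en-US'},
      NBest: [{Confidence: 0.94, Display: 'Please leave a message after the tone.'}]
    }, 'microsoft', 1);
    /* => {
      is_final: true,
      channel: 1,
      language_code: 'en-US',
      alternatives: [{confidence: 0.94, transcript: 'Please leave a message after the tone.'}],
      channel_tag: 1
    } */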