// jambonz-feature-server/lib/utils/amd-utils.js

const Emitter = require('events');
const {readFile} = require('fs');
const {
  TaskName,
  GoogleTranscriptionEvents,
  AwsTranscriptionEvents,
  AzureTranscriptionEvents,
  NuanceTranscriptionEvents,
  NvidiaTranscriptionEvents,
  IbmTranscriptionEvents,
  SonioxTranscriptionEvents,
  CobaltTranscriptionEvents,
  DeepgramTranscriptionEvents,
  JambonzTranscriptionEvents,
  AmdEvents,
  AvmdEvents
} = require('./constants');
const bugname = 'amd_bug';
const {VMD_HINTS_FILE} = require('../config');

let voicemailHints = [];
const updateHints = async(file, callback) => {
  readFile(file, 'utf8', (err, data) => {
    if (err) return callback(err);
    try {
      callback(null, JSON.parse(data));
    } catch (err) {
      callback(err);
    }
  });
};
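
/* voicemail hints: a JSON file, keyed by language code, of phrases commonly
   heard in voicemail greetings; loaded at startup and refreshed periodically */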
if (VMD_HINTS_FILE) {
  updateHints(VMD_HINTS_FILE, (err, hints) => {
    if (err) console.error(err);
    else voicemailHints = hints;
    /* refresh the hints every minute */
    setInterval(() => {
      updateHints(VMD_HINTS_FILE, (err, hints) => {
        if (err) console.error(err);
        else voicemailHints = hints;
      });
    }, 60000);
  });
}
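
/**
 * Amd: per-call answering machine detection state machine.
 * Consumes speech transcriptions (and beep detections) and emits AmdEvents
 * such as HumanDetected, MachineDetected, NoSpeechDetected, DecisionTimeout,
 * ToneTimeout, and MachineStoppedSpeaking.
 */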
class Amd extends Emitter {
  constructor(logger, cs, opts) {
    super();
    this.logger = logger;
    this.vendor = opts.recognizer?.vendor || cs.speechRecognizerVendor;
    if ('default' === this.vendor) this.vendor = cs.speechRecognizerVendor;
    this.language = opts.recognizer?.language || cs.speechRecognizerLanguage;
    if ('default' === this.language) this.language = cs.speechRecognizerLanguage;
    this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt',
      opts.recognizer?.label || cs.speechRecognizerLabel);
    if (!this.sttCredentials) throw new Error(`No speech credentials found for vendor ${this.vendor}`);
    this.thresholdWordCount = opts.thresholdWordCount || 9;

    const {normalizeTranscription, setChannelVarsForStt} = require('./transcription-utils')(logger);
    this.normalizeTranscription = normalizeTranscription;
    this.setChannelVarsForStt = setChannelVarsForStt;

    const {getNuanceAccessToken, getIbmAccessToken} = cs.srf.locals.dbHelpers;
    this.getNuanceAccessToken = getNuanceAccessToken;
    this.getIbmAccessToken = getIbmAccessToken;

    const {
      noSpeechTimeoutMs = 5000,
      decisionTimeoutMs = 15000,
      toneTimeoutMs = 20000,
      greetingCompletionTimeoutMs = 2000
    } = opts.timers || {};
    this.noSpeechTimeoutMs = noSpeechTimeoutMs;
    this.decisionTimeoutMs = decisionTimeoutMs;
    this.toneTimeoutMs = toneTimeoutMs;
    this.greetingCompletionTimeoutMs = greetingCompletionTimeoutMs;

    this.beepDetected = false;
  }
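
  /*
   * Timers:
   *  - noSpeechTimer: fires NoSpeechDetected if no transcription arrives in time
   *  - decisionTimer: fires DecisionTimeout if no human/machine decision is reached
   *  - toneTimer: fires ToneTimeout if no answering-machine beep is heard
   *  - greetingCompletionTimer: after MachineDetected, fires MachineStoppedSpeaking
   *    once the greeting has been silent long enough
   */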
  startDecisionTimer() {
    this.decisionTimer = setTimeout(this._onDecisionTimeout.bind(this), this.decisionTimeoutMs);
    this.noSpeechTimer = setTimeout(this._onNoSpeechTimeout.bind(this), this.noSpeechTimeoutMs);
    this.startToneTimer();
  }
  stopDecisionTimer() {
    this.decisionTimer && clearTimeout(this.decisionTimer);
  }
  stopNoSpeechTimer() {
    this.noSpeechTimer && clearTimeout(this.noSpeechTimer);
  }
  startToneTimer() {
    this.toneTimer = setTimeout(this._onToneTimeout.bind(this), this.toneTimeoutMs);
  }
  startGreetingCompletionTimer() {
    /* use a shorter completion window once a beep has been detected */
    this.greetingCompletionTimer = setTimeout(
      this._onGreetingCompletionTimeout.bind(this),
      this.beepDetected ? 1000 : this.greetingCompletionTimeoutMs);
  }
  stopGreetingCompletionTimer() {
    this.greetingCompletionTimer && clearTimeout(this.greetingCompletionTimer);
  }
  restartGreetingCompletionTimer() {
    this.stopGreetingCompletionTimer();
    this.startGreetingCompletionTimer();
  }
  stopToneTimer() {
    this.toneTimer && clearTimeout(this.toneTimer);
  }
  stopAllTimers() {
    this.stopDecisionTimer();
    this.stopNoSpeechTimer();
    this.stopToneTimer();
    this.stopGreetingCompletionTimer();
  }

  _onDecisionTimeout() {
    this.emit(this.decision = AmdEvents.DecisionTimeout);
    this.stopNoSpeechTimer();
  }
  _onToneTimeout() {
    this.emit(AmdEvents.ToneTimeout);
  }
  _onNoSpeechTimeout() {
    this.emit(this.decision = AmdEvents.NoSpeechDetected);
    this.stopDecisionTimer();
  }
  _onGreetingCompletionTimeout() {
    this.emit(AmdEvents.MachineStoppedSpeaking);
  }
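
  /**
   * Evaluate a transcription event and emit a decision:
   *  - transcript matches a voicemail hint -> MachineDetected
   *  - short final transcript (under thresholdWordCount words) -> HumanDetected
   *  - long transcript (thresholdWordCount words or more) -> MachineDetected
   * After MachineDetected, later transcriptions only restart the
   * greeting-completion timer so we can detect when the machine stops speaking.
   */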
  evaluateTranscription(evt) {
    if (this.decision) {
      /* at this point we are only listening for the machine to stop speaking */
      if (this.decision === AmdEvents.MachineDetected) {
        this.restartGreetingCompletionTimer();
      }
      return;
    }
    this.stopNoSpeechTimer();
    this.logger.debug({evt}, 'Amd:evaluateTranscription - raw');
    const t = this.normalizeTranscription(evt, this.vendor, this.language);
    const hints = voicemailHints[this.language] || [];
    this.logger.debug({t}, 'Amd:evaluateTranscription - normalized');

    if (Array.isArray(t.alternatives) && t.alternatives.length > 0) {
      const wordCount = t.alternatives[0].transcript.split(' ').length;
      const final = t.is_final;
      const foundHint = hints.find((h) => t.alternatives[0].transcript.includes(h));

      if (foundHint) {
        /* we detected a common voice mail greeting */
        this.logger.debug(`Amd:evaluateTranscription: found hint ${foundHint}`);
        this.emit(this.decision = AmdEvents.MachineDetected, {
          reason: 'hint',
          hint: foundHint,
          language: t.language_code
        });
      }
      else if (final && wordCount < this.thresholdWordCount) {
        /* a short greeting is typically a human */
        this.emit(this.decision = AmdEvents.HumanDetected, {
          reason: 'short greeting',
          greeting: t.alternatives[0].transcript,
          language: t.language_code
        });
      }
      else if (wordCount >= this.thresholdWordCount) {
        /* a long greeting is typically a machine */
        this.emit(this.decision = AmdEvents.MachineDetected, {
          reason: 'long greeting',
          greeting: t.alternatives[0].transcript,
          language: t.language_code
        });
      }

      if (this.decision) {
        this.stopDecisionTimer();
        if (this.decision === AmdEvents.MachineDetected) {
          /* if we detected a machine, then wait for greeting to end */
          this.startGreetingCompletionTimer();
        }
      }
      return this.decision;
    }
  }
}
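
/**
 * Module factory: returns {startAmd, stopAmd} closed over the supplied logger.
 */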
module.exports = (logger) => {
  const startTranscribing = async(cs, ep, task) => {
    const {vendor, language} = ep.amd;
    ep.startTranscription({
      vendor,
      locale: language,
      interim: true,
      bugname
    }).catch((err) => {
      const {writeAlerts, AlertType} = cs.srf.locals;
      ep.amd = null;
      task.emit(AmdEvents.Error, err);
      logger.error(err, 'amd:_startTranscribing error');
      return writeAlerts({
        account_sid: cs.accountSid,
        alert_type: AlertType.STT_FAILURE,
        vendor,
        detail: err.message,
        target_sid: cs.callSid
      });
    }).catch((err) => logger.info({err}, 'Error generating alert for stt failure'));
  };
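
  /* freeswitch event handlers, bound to (cs, ep, task) in startAmd below */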
  const onEndOfUtterance = (cs, ep, task) => {
    logger.debug('amd:onEndOfUtterance');
    startTranscribing(cs, ep, task);
  };

  const onNoSpeechDetected = (cs, ep, task) => {
    logger.debug('amd:onNoSpeechDetected');
    ep.amd.stopAllTimers();
    task.emit(AmdEvents.NoSpeechDetected);
  };

  const onTranscription = (cs, ep, task, evt, fsEvent) => {
    /* ignore transcription events from other media bugs on the endpoint */
    if (fsEvent.getHeader('media-bugname') !== bugname) return;
    ep.amd?.evaluateTranscription(evt);
  };

  const onBeep = (cs, ep, task, evt, fsEvent) => {
    logger.debug({evt, fsEvent}, 'onBeep');
    const frequency = Math.floor(fsEvent.getHeader('Frequency'));
    const variance = Math.floor(fsEvent.getHeader('Frequency-variance'));
    task.emit('amd', {type: AmdEvents.ToneDetected, frequency, variance});
    if (ep.amd) {
      ep.amd.stopToneTimer();
      ep.amd.beepDetected = true;
    }
    ep.execute('avmd_stop').catch((err) => logger.info(err, 'Error stopping avmd'));
  };
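
  /**
   * Begin answering machine detection on the endpoint: construct the Amd
   * state machine, resolve any vendor access tokens, set the stt channel
   * variables, hook up vendor-specific transcription events, and start
   * avmd beep detection alongside the transcription.
   */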
  const startAmd = async(cs, ep, task, opts) => {
    const amd = ep.amd = new Amd(logger, cs, opts);
    const {vendor, language} = amd;
    let sttCredentials = amd.sttCredentials;
    const hints = voicemailHints[language] || [];

    if (vendor === 'nuance' && sttCredentials.client_id) {
      /* get nuance access token */
      const {getNuanceAccessToken} = amd;
      const {client_id, secret} = sttCredentials;
      const {access_token, servedFromCache} = await getNuanceAccessToken(client_id, secret, 'asr tts');
      logger.debug({client_id}, `amd:startAmd - got nuance access token ${servedFromCache ? 'from cache' : ''}`);
      sttCredentials = {...sttCredentials, access_token};
    }
    else if (vendor === 'ibm' && sttCredentials.stt_api_key) {
      /* get ibm access token */
      const {getIbmAccessToken} = amd;
      const {stt_api_key, stt_region} = sttCredentials;
      const {access_token, servedFromCache} = await getIbmAccessToken(stt_api_key);
      logger.debug({stt_api_key}, `amd:startAmd - got ibm access token ${servedFromCache ? 'from cache' : ''}`);
      sttCredentials = {...sttCredentials, access_token, stt_region};
    }

    /* set stt options on the channel */
    logger.info(`starting amd for vendor ${vendor} and language ${language}`);
    const sttOpts = amd.setChannelVarsForStt({name: TaskName.Gather}, sttCredentials, language, {
      vendor,
      hints,
      enhancedModel: true,
      altLanguages: opts.recognizer?.altLanguages || [],
      initialSpeechTimeoutMs: opts.resolveTimeoutMs,
    });
    await ep.set(sttOpts).catch((err) => logger.info(err, 'Error setting channel variables'));

    amd.transcriptionHandler = onTranscription.bind(null, cs, ep, task);
    amd.EndOfUtteranceHandler = onEndOfUtterance.bind(null, cs, ep, task);
    amd.noSpeechHandler = onNoSpeechDetected.bind(null, cs, ep, task);

    switch (vendor) {
      case 'google':
        ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, amd.transcriptionHandler);
        ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, amd.EndOfUtteranceHandler);
        break;
      case 'aws':
      case 'polly':
        ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, amd.transcriptionHandler);
        break;
      case 'microsoft':
        ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, amd.transcriptionHandler);
        ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, amd.noSpeechHandler);
        break;
      case 'nuance':
        ep.addCustomEventListener(NuanceTranscriptionEvents.Transcription, amd.transcriptionHandler);
        break;
      case 'deepgram':
        ep.addCustomEventListener(DeepgramTranscriptionEvents.Transcription, amd.transcriptionHandler);
        break;
      case 'soniox':
        amd.bugname = 'soniox_amd_transcribe';
        ep.addCustomEventListener(SonioxTranscriptionEvents.Transcription, amd.transcriptionHandler);
        break;
      case 'ibm':
        ep.addCustomEventListener(IbmTranscriptionEvents.Transcription, amd.transcriptionHandler);
        break;
      case 'nvidia':
        ep.addCustomEventListener(NvidiaTranscriptionEvents.Transcription, amd.transcriptionHandler);
        break;
      case 'cobalt':
        ep.addCustomEventListener(CobaltTranscriptionEvents.Transcription, amd.transcriptionHandler);
        break;
      default:
        if (vendor.startsWith('custom:')) {
          ep.addCustomEventListener(JambonzTranscriptionEvents.Transcription, amd.transcriptionHandler);
          break;
        }
        throw new Error(`Invalid vendor ${vendor}`);
    }
    amd
      .on(AmdEvents.NoSpeechDetected, (evt) => {
        task.emit('amd', {type: AmdEvents.NoSpeechDetected, ...evt});
        try {
          stopAmd(ep, task);
        } catch (err) {
          logger.info({err}, 'Error stopping transcription');
        }
      })
      .on(AmdEvents.HumanDetected, (evt) => {
        task.emit('amd', {type: AmdEvents.HumanDetected, ...evt});
        try {
          stopAmd(ep, task);
        } catch (err) {
          logger.info({err}, 'Error stopping transcription');
        }
      })
      .on(AmdEvents.MachineDetected, (evt) => {
        /* keep listening until the machine greeting completes */
        task.emit('amd', {type: AmdEvents.MachineDetected, ...evt});
      })
      .on(AmdEvents.DecisionTimeout, (evt) => {
        task.emit('amd', {type: AmdEvents.DecisionTimeout, ...evt});
        try {
          stopAmd(ep, task);
        } catch (err) {
          logger.info({err}, 'Error stopping transcription');
        }
      })
      .on(AmdEvents.ToneTimeout, (evt) => {
        //task.emit('amd', {type: AmdEvents.ToneTimeout, ...evt});
        try {
          stopAmd(ep, task);
        } catch (err) {
          logger.info({err}, 'Error stopping avmd');
        }
      })
      .on(AmdEvents.MachineStoppedSpeaking, () => {
        task.emit('amd', {type: AmdEvents.MachineStoppedSpeaking});
        try {
          stopAmd(ep, task);
        } catch (err) {
          logger.info({err}, 'Error stopping transcription');
        }
      });

    /* start transcribing, and also listening for a beep */
    amd.startDecisionTimer();
    startTranscribing(cs, ep, task);
    ep.addCustomEventListener(AvmdEvents.Beep, onBeep.bind(null, cs, ep, task));
    ep.execute('avmd_start').catch((err) => logger.info(err, 'Error starting avmd'));
  };
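
  /**
   * Tear down answering machine detection: stop all timers, remove the
   * vendor transcription listeners, stop the transcription media bug,
   * and stop avmd beep detection.
   */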
  const stopAmd = (ep, task) => {
    let vendor;
    if (ep.amd) {
      vendor = ep.amd.vendor;
      ep.amd.stopAllTimers();
      /* remove listeners for all vendors; removing one that was never added is a no-op */
      ep.removeListener(GoogleTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
      ep.removeListener(GoogleTranscriptionEvents.EndOfUtterance, ep.amd.EndOfUtteranceHandler);
      ep.removeListener(AwsTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
      ep.removeListener(AzureTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
      ep.removeListener(AzureTranscriptionEvents.NoSpeechDetected, ep.amd.noSpeechHandler);
      ep.removeListener(NuanceTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
      ep.removeListener(DeepgramTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
      ep.removeListener(SonioxTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
      ep.removeListener(IbmTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
      ep.removeListener(NvidiaTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
      ep.removeListener(CobaltTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
      ep.removeListener(JambonzTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
      ep.amd = null;
    }
    if (ep.connected) {
      ep.stopTranscription({vendor, bugname})
        .catch((err) => logger.info(err, 'stopAmd: Error stopping transcription'));
      task.emit('amd', {type: AmdEvents.Stopped});
      ep.execute('avmd_stop').catch((err) => logger.info(err, 'Error stopping avmd'));
    }
    ep.removeCustomEventListener(AvmdEvents.Beep);
  };

  return {startAmd, stopAmd};
};