initial changes for openai stt (#1127)

* initial changes for openai stt * wip * wip * wip * wip * wip * make minBargeinWordCount work for openai * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * openai stt: support for prompt templates * lint * wip * support openai semantic_vad * wip * transcribe supports openai stt * sip * wip * wip * refactor list of stt vendors that don't need to be restarted after a final transcript * cleanup * wip * cleanup * wip * wip * wip * remove credentials from log * comment
2026-02-12 01:10:30 +00:00 · 2025-03-28 13:14:58 -04:00
parent ee846b283d
commit fcaf2e59e7
11 changed files with 382 additions and 29 deletions
--- a/lib/utils/constants.json
+++ b/lib/utils/constants.json
@@ -137,6 +137,18 @@
    "Connect": "speechmatics_transcribe::connect",
    "Error": "speechmatics_transcribe::error"
  },
+  "OpenAITranscriptionEvents": {
+    "Transcription": "openai_transcribe::transcription",
+    "Translation": "openai_transcribe::translation",
+    "SpeechStarted": "openai_transcribe::speech_started",
+    "SpeechStopped": "openai_transcribe::speech_stopped",
+    "PartialTranscript": "openai_transcribe::partial_transcript",
+    "Info": "openai_transcribe::info",
+    "RecognitionStarted": "openai_transcribe::recognition_started",
+    "ConnectFailure": "openai_transcribe::connect_failed",
+    "Connect": "openai_transcribe::connect",
+    "Error": "openai_transcribe::error"
+  },
  "JambonzTranscriptionEvents": {
    "Transcription": "jambonz_transcribe::transcription",
    "ConnectFailure": "jambonz_transcribe::connect_failed",
--- a/lib/utils/db-utils.js
+++ b/lib/utils/db-utils.js
@@ -142,6 +142,11 @@ const speechMapper = (cred) => {
      obj.api_key = o.api_key;
      obj.speechmatics_stt_uri = o.speechmatics_stt_uri;
    }
+    else if ('openai' === obj.vendor) {
+      const o = JSON.parse(decrypt(credential));
+      obj.api_key = o.api_key;
+      obj.model_id = o.model_id;
+    }
    else if (obj.vendor.startsWith('custom:')) {
      const o = JSON.parse(decrypt(credential));
      obj.auth_token = o.auth_token;
--- a/lib/utils/transcription-utils.js
+++ b/lib/utils/transcription-utils.js
@@ -117,7 +117,16 @@ const stickyVars = {
    'SPEECHMATICS_SPEECH_HINTS',
    'SPEECHMATICS_TRANSLATION_LANGUAGES',
    'SPEECHMATICS_TRANSLATION_PARTIALS'
-  ]
+  ],
+  openai: [
+    'OPENAI_API_KEY',
+    'OPENAI_MODEL',
+    'OPENAI_INPUT_AUDIO_NOISE_REDUCTION',
+    'OPENAI_TURN_DETECTION_TYPE',
+    'OPENAI_TURN_DETECTION_THRESHOLD',
+    'OPENAI_TURN_DETECTION_PREFIX_PADDING_MS',
+    'OPENAI_TURN_DETECTION_SILENCE_DURATION_MS',
+  ],
 };

 /**
@@ -571,6 +580,35 @@ const normalizeSpeechmatics = (evt, channel, language) => {
  return obj;
 };

+const calculateConfidence = (logprobsArray) => {
+  // The confidence is exp of the summed per-token log probabilities, i.e. the
+  // product of the individual token probabilities.
+  const addLogProb = (runningTotal, {logprob}) => runningTotal + logprob;
+  const summedLogProb = logprobsArray.reduce(addLogProb, 0);
+
+  return Math.exp(summedLogProb);
+};
+
+const normalizeOpenAI = (evt, channel, language) => {
+  // Wrap an OpenAI transcript event in the normalized transcription shape.
+  // Every result is flagged final; the vendor section holds a JSON deep copy.
+  const vendorEvt = JSON.parse(JSON.stringify(evt));
+  const {transcript, logprobs} = evt;
+
+  // Without logprobs (falsy) the confidence defaults to 1.0.
+  let confidence = 1.0;
+  if (logprobs) confidence = calculateConfidence(logprobs);
+  return {
+    language_code: language,
+    channel_tag: channel,
+    is_final: true,
+    alternatives: [
+      {transcript, confidence}
+    ],
+    vendor: {name: 'openai', evt: vendorEvt}
+  };
+};
+
 module.exports = (logger) => {
  const normalizeTranscription = (evt, vendor, channel, language, shortUtterance, punctuation) => {

@@ -602,6 +640,8 @@ module.exports = (logger) => {
        return normalizeVerbio(evt, channel, language);
      case 'speechmatics':
        return normalizeSpeechmatics(evt, channel, language);
+      case 'openai':
+        return normalizeOpenAI(evt, channel, language);
      default:
        if (vendor.startsWith('custom:')) {
          return normalizeCustom(evt, channel, language, vendor);
@@ -968,6 +1008,36 @@ module.exports = (logger) => {
          {VOXIST_API_KEY: sttCredentials.api_key},
      };
    }
+    else if ('openai' === vendor) {
+      const {openaiOptions = {}} = rOpts;
+      const model = openaiOptions.model || rOpts.model || sttCredentials.model_id || 'whisper-1';
+      const apiKey = openaiOptions.apiKey || sttCredentials.api_key;
+
+      opts = {
+        OPENAI_MODEL: model,
+        OPENAI_API_KEY: apiKey,
+        ...opts,
+        ...(openaiOptions.prompt && {OPENAI_PROMPT: openaiOptions.prompt}),
+        ...(openaiOptions.input_audio_noise_reduction &&
+          {OPENAI_INPUT_AUDIO_NOISE_REDUCTION: openaiOptions.input_audio_noise_reduction}),
+      };
+
+      if (openaiOptions.turn_detection) {
+        opts = {
+          ...opts,
+          OPENAI_TURN_DETECTION_TYPE: openaiOptions.turn_detection.type,
+          ...(openaiOptions.turn_detection.threshold && {
+            OPENAI_TURN_DETECTION_THRESHOLD: openaiOptions.turn_detection.threshold
+          }),
+          ...(openaiOptions.turn_detection.prefix_padding_ms && {
+            OPENAI_TURN_DETECTION_PREFIX_PADDING_MS: openaiOptions.turn_detection.prefix_padding_ms
+          }),
+          ...(openaiOptions.turn_detection.silence_duration_ms && {
+            OPENAI_TURN_DETECTION_SILENCE_DURATION_MS: openaiOptions.turn_detection.silence_duration_ms
+          }),
+        };
+      }
+    }
    else if ('verbio' === vendor) {
      const {verbioOptions = {}} = rOpts;
      opts = {