updating for transcription schema

surajshivakumar
2024-06-26 19:51:18 -04:00
parent 4a6cc284a7
commit 90ecd734b4
5 changed files with 189 additions and 156 deletions

View File

@@ -1,13 +1,15 @@
 const transcriptionOptions = {
   model: 'nova-2',
   smart_format: true,
-  detect_entities: true
+  detect_entities: true,
+  multichannel: true
 };
 const redactionOptions = {
   model: 'nova-2',
   smart_format: true,
-  redact: 'pii'
+  redact: 'pii',
+  multichannel: true
 };
 const analysisOptions = {
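
With multichannel: true, Deepgram transcribes each audio channel separately and returns one entry per channel under results.channels, which is what the reworked extractTranscript below walks over. A minimal sketch of reading the per-channel text out of a prerecorded response (the response variable is a placeholder for the object returned by transcribeFile):

  // List each channel's top transcript from a multichannel response.
  const perChannel = response.results.channels.map((channel, i) => ({
    channel: i,
    text: channel.alternatives[0].transcript
  }));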

View File

@@ -2,6 +2,39 @@ const fs = require('fs');
 const { createClient } = require('@deepgram/sdk');
 const { transcriptionOptions, redactionOptions, analysisOptions } = require('./config');
+function extractTranscript(data) {
+  // eslint-disable-next-line max-len
+  const paragraphs = data.results.channels.flatMap((channel) => channel.alternatives.flatMap((alt) => alt.paragraphs.paragraphs));
+  let ctr = 0;
+  // Use reduce to process each paragraph and sentence, consolidating transcripts by speaker
+  return paragraphs.reduce((acc, paragraph) => {
+    paragraph.sentences.forEach((sentence) => {
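+      // re-locate this paragraph's channel, then keep only the words inside the sentence's time window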
+      const wordsDetails = data.results.channels
+        .find((channel) => channel.alternatives.some((alt) => alt.paragraphs.paragraphs.includes(paragraph)))
+        .alternatives[0].words
+        .filter((word) => word.start >= sentence.start && word.end <= sentence.end)
+        .map((word) => ({
+          word: word.word,
+          start: word.start,
+          end: word.end,
+          confidence: word.confidence
+        }));
+      acc.push({
+        timestamp: sentence.start,
+        duration: Math.round(1000 * (sentence.end - sentence.start)),
+        startTime: sentence.start,
+        endTime: sentence.end,
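+        // crude diarization: the speaker index just alternates 0/1 per sentence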
+        speaker: ctr++ % 2,
+        transcript: sentence.text,
+        words: wordsDetails
+      });
+    });
+    return acc;
+  }, []);
+}
 const transcribe = async(logger, apiKey, filePath) => {
   logger.info(`Transcribing audio file: ${filePath}`);
   //creating a deepgram client
@@ -9,32 +42,25 @@ const transcribe = async(logger, apiKey, filePath) => {
   //audio file buffer
   const fileBuffer = fs.readFileSync(filePath);
   //transcription
-  const { result:transcriptResult } = await client.listen.prerecorded.transcribeFile(fileBuffer, transcriptionOptions);
-  const transcript = transcriptResult.results.channels[0].alternatives[0].transcript;
-  const timestamps = transcriptResult.results.channels[0].alternatives[0].words;
-  const entities = transcriptResult.results.channels[0].alternatives[0].entities;
-  const confidence = transcriptResult.results.channels[0].alternatives[0].confidence;
+  // const { result:transcriptResult } = await client.listen.prerecorded.transcribeFile(fileBuffer, transcriptionOptions);
   //redaction
   const { result:redactionResult } = await client.listen.prerecorded.transcribeFile(fileBuffer, redactionOptions);
-  const redactionTimestamps = redactionResult.results.channels[0].alternatives[0].words;
-  const redacted = redactionResult.results.channels[0].alternatives[0].transcript;
-  //analysis and sentiment
-  const { result:analysisResult } = await client.read.analyzeText({ text:transcript }, analysisOptions);
-  const sentimentSegment = analysisResult.results.sentiments.segments[0];
-  const sentiment = sentimentSegment.sentiment;
-  const sentimentScore = sentimentSegment.sentiment_score;
-  const vendor = 'deepgram';
-  return {
-    vendor,
-    transcript,
-    timestamps,
-    redactionTimestamps,
-    redacted,
-    sentiment,
-    sentimentScore,
-    entities,
-    confidence
+  const data = {
+    'vendor': 'deepgram',
+    'model': redactionResult.metadata.model_info[redactionResult.metadata.models[0]].arch,
+    'channels': redactionResult.metadata.channels,
+    'createdAt': redactionResult.metadata.created
   };
+  data.speechEvents = extractTranscript(redactionResult);
+  const combinedTranscript = data.speechEvents.map(event => event.transcript).join(" ");
+  data.redactionTimestamps = data.speechEvents.flatMap(event => event.words);
+  //analysis and sentiment
+  const { result:analysisResult } = await client.read.analyzeText({ text:combinedTranscript }, analysisOptions);
+  const sentimentSegment = analysisResult.results.sentiments.segments[0];
+  data.sentiment = sentimentSegment.sentiment;
+  data.sentimentScore = sentimentSegment.sentiment_score;
+  data.totalDuration = Math.round(1000 * redactionResult.metadata.duration);
+  return data;
 };
 module.exports = transcribe;
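
For context, a minimal sketch of calling the reworked module (the require path, the console-as-logger shortcut, the DEEPGRAM_API_KEY variable, and call.wav are placeholders, not part of the commit):

  const transcribe = require('./transcribe'); // hypothetical path

  (async () => {
    const logger = console; // anything exposing .info() works
    const data = await transcribe(logger, process.env.DEEPGRAM_API_KEY, './call.wav');
    console.log(`${data.speechEvents.length} speech events, sentiment: ${data.sentiment}`);
  })();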

View File

@@ -2,7 +2,7 @@ const ffmpeg = require('fluent-ffmpeg');
 const redact = async(logger, { transcriptionData, audioPath, audioOutputPath, delta = 0.05 }) => {
   logger.info(`Redacting audio file: ${audioPath}`);
-  console.log(transcriptionData);
+  // console.log(transcriptionData);
   return new Promise((resolve, reject) => {
     const command = ffmpeg(audioPath)
       .outputFormat('wav'); // Ensure output format is WAV
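
The hunk stops before the filter logic, so the committed redaction approach isn't visible here. For orientation only: one common way to silence PII spans with fluent-ffmpeg is a chain of ffmpeg volume filters gated by between(t,start,end); a sketch under that assumption (buildMuteFilter and piiSegments are hypothetical names, not necessarily the committed implementation):

  // Hypothetical: mute each [start, end] window (in seconds), padded by delta.
  const buildMuteFilter = (segments, delta) => segments
    .map(({ start, end }) => `volume=0:enable='between(t,${Math.max(0, start - delta)},${end + delta})'`)
    .join(',');
  // e.g. command.audioFilters(buildMuteFilter(piiSegments, delta));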

View File

@@ -1,24 +1,41 @@
 const audioRecordingTemplate = {
-  'parties': {
-    'N': 0,
-    'from': '',
-    'to': ''
-  },
-  'duration': 0,
-  'url': '',
-  'conversation': {
-    'as heard': {
-      'full transcript': '',
-      'confidence': '',
-      'transcription vendor': '',
-      'timestamps': []
-    },
-    'after the fact': {
-      'full transcript': '',
-      'confidence': '',
-      'transcription vendor': '',
-      'timestamps': []
-    }
+  'participants': [
+    {
+      'type': '',
+      'initiatedConversation': false,
+      'id': {
+        'name': null,
+        'phone': ''
+      }
+    }
+  ],
+  'duration': 0,
+  'transcript': {
+    'vendor': '',
+    'model': '',
+    'channels': 0,
+    'createdAt': '',
+    'speechEvents': [
+      {
+        'timestamp': 0,
+        'duration': 0,
+        'startTime': 0,
+        'endTime': 0,
+        'speaker': 0,
+        'transcript': '',
+        'words': [
+          {
+            'word': '',
+            'start': 0,
+            'end': 0,
+            'confidence': 0
+          }
+        ]
+      }
+    ],
+    'sentiment': '',
+    'sentimentScore': 0,
+    'totalDuration': 0
   }
 };
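
A note on intended use, inferred rather than shown in the commit: the template's transcript branch mirrors the object that transcribe() now returns, so a recording document could be assembled along these lines (structuredClone and the mapping are illustrative assumptions; redactionTimestamps is returned by transcribe() but has no slot in the template):

  // Hypothetical: merge the transcribe() result into a fresh copy of the template.
  const record = structuredClone(audioRecordingTemplate); // Node 17+
  const { redactionTimestamps, ...transcriptFields } = data; // data from transcribe()
  record.transcript = transcriptFields;
  record.duration = data.totalDuration;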

View File

@@ -2,127 +2,115 @@
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"parties": {
"type": "object",
"properties": {
"N": {
"type": "integer",
"description": "Number of parties"
"participants": {
"type": "array",
"items": {
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": ["human", "machine"]
},
"initiatedConversation": {
"type": "boolean"
},
"id": {
"type": "object",
"properties": {
"name": {
"type": ["string", "null"]
},
"phone": {
"type": "string",
"pattern": "^\\+\\d{11}$"
}
},
"required": ["phone"]
}
},
"from": {
"type": "string",
"description": "Identifier for the initiating party"
},
"to": {
"type": "string",
"description": "Identifier for the receiving party"
}
},
"required": ["type", "initiatedConversation", "id"]
}
},
"duration": {
"type": "integer",
"description": "Duration of recording in milliseconds"
"type": "integer"
},
"url": {
"type": "string",
"description": "Where recording is located",
"format": "uri"
},
"conversation": {
"transcript": {
"type": "object",
"properties": {
"as heard": {
"type": "object",
"properties": {
"full transcript": {
"type": "string",
"description": "Transcript as heard during the conversation"
},
"confidence": {
"type": "string",
"description": "confidence score for transcription as heard"
},
"transcription vendor": {
"type": "string",
"description": "transcription vendor realtime"
},
"timestamps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"word": {
"type": "string",
"description": "Word in the as heard transcript"
},
"startTime": {
"type": "string",
"description": "Start time of the word",
"format": "date-time"
},
"endTime": {
"type": "string",
"description": "End time of the word",
"format": "date-time"
},
"confidence": {
"type": "number",
"description": "Confidence level of the word"
}
},
"required": ["word", "startTime", "endTime", "confidence"]
}
}
},
"required": ["full transcript", "timestamps"]
"vendor": {
"type": "string"
},
"after the fact": {
"type": "object",
"properties": {
"full transcript": {
"type": "string",
"description": "Transcript generated after analyzing the conversation"
},
"confidence": {
"type": "string",
"description": "confidence score for transcription after the fact"
},
"transcription vendor": {
"type": "string",
"description": "transcription vendor used for after the fact processing"
},
"timestamps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"word": {
"type": "string",
"description": "Word in the after the fact transcript"
"model": {
"type": "string"
},
"channels": {
"type": "integer"
},
"createdAt": {
"type": "string",
"format": "date-time"
},
"speechEvents": {
"type": "array",
"items": {
"type": "object",
"properties": {
"timestamp": {
"type": "number"
},
"duration": {
"type": "number"
},
"startTime": {
"type": "number"
},
"endTime": {
"type": "number"
},
"speaker": {
"type": "integer"
},
"transcript": {
"type": "string"
},
"words": {
"type": "array",
"items": {
"type": "object",
"properties": {
"word": {
"type": "string"
},
"start": {
"type": "number"
},
"end": {
"type": "number"
},
"confidence": {
"type": "number"
}
},
"startTime": {
"type": "string",
"description": "Start time of the word",
"format": "date-time"
},
"endTime": {
"type": "string",
"description": "End time of the word",
"format": "date-time"
},
"confidence": {
"type": "number",
"description": "Confidence level of the word"
}
},
"required": ["word", "startTime", "endTime", "confidence"]
"required": ["word", "start", "end", "confidence"]
}
}
}
},
"required": ["full transcript", "timestamps"]
},
"required": ["timestamp", "duration", "startTime", "endTime", "speaker", "transcript", "words"]
}
},
"sentiment": {
"type": "string",
"enum": ["positive", "negative", "neutral"]
},
"sentimentScore": {
"type": "number"
},
"totalDuration": {
"type": "integer"
}
},
"required": ["as heard", "after the fact"]
"required": ["vendor", "model", "channels", "createdAt", "speechEvents", "sentiment", "sentimentScore", "totalDuration"]
}
},
"required": ["body", "parties", "duration", "url", "conversation"]
"required": ["participants", "duration", "transcript"]
}
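
Since the new schema targets draft-07 and leans on the date-time and uri formats, validating a document needs a formats plugin; a minimal sketch with Ajv (the ajv and ajv-formats packages and the schema path are assumptions about the toolchain, not part of this commit):

  const Ajv = require('ajv');
  const addFormats = require('ajv-formats');
  const schema = require('./audioRecording.schema.json'); // hypothetical path

  const ajv = new Ajv({ allErrors: true });
  addFormats(ajv); // enables the "date-time" and "uri" format checks
  const validate = ajv.compile(schema);

  const record = { participants: [], duration: 0, transcript: {} }; // deliberately incomplete
  if (!validate(record)) {
    console.error(validate.errors); // reports the transcript fields the schema requires
  }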