diff --git a/lib/deepgram/config.js b/lib/deepgram/config.js
index b8a505f..3184dab 100644
--- a/lib/deepgram/config.js
+++ b/lib/deepgram/config.js
@@ -1,13 +1,15 @@
 const transcriptionOptions = {
   model: 'nova-2',
   smart_format: true,
-  detect_entities: true
+  detect_entities: true,
+  multichannel: true
 };
 
 const redactionOptions = {
   model: 'nova-2',
   smart_format: true,
-  redact: 'pii'
+  redact: 'pii',
+  multichannel: true
 };
 
 const analysisOptions = {
diff --git a/lib/deepgram/transcribe.js b/lib/deepgram/transcribe.js
index 2191b2e..65f72ad 100644
--- a/lib/deepgram/transcribe.js
+++ b/lib/deepgram/transcribe.js
@@ -2,6 +2,43 @@ const fs = require('fs');
 const { createClient } = require('@deepgram/sdk');
-const { transcriptionOptions, redactionOptions, analysisOptions } = require('./config');
+const { redactionOptions, analysisOptions } = require('./config');
 
+function extractTranscript(data) {
+  // Build one speech event per sentence, walking each channel's
+  // paragraphs so every event keeps word-level timing and confidence.
+  const speechEvents = [];
+  data.results.channels.forEach((channel, channelIndex) => {
+    const alternative = channel.alternatives[0];
+    // a silent channel has no paragraphs object
+    const paragraphs = alternative.paragraphs ? alternative.paragraphs.paragraphs : [];
+    paragraphs.forEach((paragraph) => {
+      paragraph.sentences.forEach((sentence) => {
+        const wordsDetails = alternative.words
+          .filter((word) => word.start >= sentence.start && word.end <= sentence.end)
+          .map((word) => ({
+            word: word.word,
+            start: word.start,
+            end: word.end,
+            confidence: word.confidence
+          }));
+
+        speechEvents.push({
+          timestamp: sentence.start,
+          duration: Math.round(1000 * (sentence.end - sentence.start)),
+          startTime: sentence.start,
+          endTime: sentence.end,
+          // with multichannel audio each channel carries one party,
+          // so the channel index identifies the speaker
+          speaker: channelIndex,
+          transcript: sentence.text,
+          words: wordsDetails
+        });
+      });
+    });
+  });
+  // interleave the per-channel events chronologically
+  return speechEvents.sort((a, b) => a.startTime - b.startTime);
+}
+
 const transcribe = async(logger, apiKey, filePath) => {
   logger.info(`Transcribing audio file: ${filePath}`);
   //creating a deepgram client
@@ -9,32 +46,25 @@ const transcribe = async(logger, apiKey, filePath) => {
   //audio file buffer
   const fileBuffer = fs.readFileSync(filePath);
-  //transcription
-  const { result:transcriptResult } = await client.listen.prerecorded.transcribeFile(fileBuffer, transcriptionOptions);
-  const transcript = transcriptResult.results.channels[0].alternatives[0].transcript;
-  const timestamps = transcriptResult.results.channels[0].alternatives[0].words;
-  const entities = transcriptResult.results.channels[0].alternatives[0].entities;
-  const confidence = transcriptResult.results.channels[0].alternatives[0].confidence;
   //redaction
   const { result:redactionResult } = await client.listen.prerecorded.transcribeFile(fileBuffer, redactionOptions);
-  const redactionTimestamps = redactionResult.results.channels[0].alternatives[0].words;
-  const redacted = redactionResult.results.channels[0].alternatives[0].transcript;
-  //analysis and sentiment
-  const { result:analysisResult } = await client.read.analyzeText({ text:transcript }, analysisOptions);
-  const sentimentSegment = analysisResult.results.sentiments.segments[0];
-  const sentiment = sentimentSegment.sentiment;
-  const sentimentScore = sentimentSegment.sentiment_score;
-  const vendor = 'deepgram';
-  return {
-    vendor,
-    transcript,
-    timestamps,
-    redactionTimestamps,
-    redacted,
-    sentiment,
-    sentimentScore,
-    entities,
-    confidence
+  const data = {
+    vendor: 'deepgram',
+    model: redactionResult.metadata.model_info[redactionResult.metadata.models[0]].arch,
+    channels: redactionResult.metadata.channels,
+    createdAt: redactionResult.metadata.created
   };
+  data.speechEvents = extractTranscript(redactionResult);
+  // stitch the redacted sentences back together for text analysis
+  const combinedTranscript = data.speechEvents.map((event) => event.transcript).join(' ');
+  data.redactionTimestamps = data.speechEvents.flatMap((event) => event.words);
+  //analysis and sentiment
+  const { result:analysisResult } = await client.read.analyzeText({ text: combinedTranscript }, analysisOptions);
+  // use the call-level average rather than just the first segment
+  const averageSentiment = analysisResult.results.sentiments.average;
+  data.sentiment = averageSentiment.sentiment;
+  data.sentimentScore = averageSentiment.sentiment_score;
+  data.totalDuration = Math.round(1000 * redactionResult.metadata.duration);
+  return data;
 };
 
 module.exports = transcribe;
diff --git a/lib/redact.js b/lib/redact.js
index 86cd30b..7cb0d52 100644
--- a/lib/redact.js
+++ b/lib/redact.js
@@ -2,7 +2,6 @@ const ffmpeg = require('fluent-ffmpeg');
 
 const redact = async(logger, { transcriptionData, audioPath, audioOutputPath, delta = 0.05 }) => {
   logger.info(`Redacting audio file: ${audioPath}`);
-  console.log(transcriptionData);
   return new Promise((resolve, reject) => {
     const command = ffmpeg(audioPath)
       .outputFormat('wav'); // Ensure output format is WAV
diff --git a/lib/schema/transciption-template.js b/lib/schema/transciption-template.js
index e11a9ef..d1b5dc3 100644
--- a/lib/schema/transciption-template.js
+++ b/lib/schema/transciption-template.js
@@ -1,24 +1,41 @@
 const audioRecordingTemplate = {
-  'parties': {
-    'N': 0,
-    'from': '',
-    'to': ''
-  },
-  'duration': 0,
-  'url': '',
-  'conversation': {
-    'as heard': {
-      'full transcript': '',
-      'confidence': '',
-      'transcription vendor': '',
-      'timestamps': []
-    },
-    'after the fact': {
-      'full transcript': '',
-      'confidence': '',
-      'transcription vendor': '',
-      'timestamps': []
+  'participants': [
+    {
+      'type': '',
+      'initiatedConversation': false,
+      'id': {
+        'name': null,
+        'phone': ''
+      }
     }
+  ],
+  'duration': 0,
+  'transcript': {
+    'vendor': '',
+    'model': '',
+    'channels': 0,
+    'createdAt': '',
+    'speechEvents': [
+      {
+        'timestamp': 0,
+        'duration': 0,
+        'startTime': 0,
+        'endTime': 0,
+        'speaker': 0,
+        'transcript': '',
+        'words': [
+          {
+            'word': '',
+            'start': 0,
+            'end': 0,
+            'confidence': 0
+          }
+        ]
+      }
+    ],
+    'sentiment': '',
+    'sentimentScore': 0,
+    'totalDuration': 0
   }
 };
 
diff --git a/lib/schema/transcription_schema.json b/lib/schema/transcription_schema.json
index 75b4f48..ed6c1e5 100644
--- a/lib/schema/transcription_schema.json
+++ b/lib/schema/transcription_schema.json
@@ -2,127 +2,115 @@
   "$schema": "http://json-schema.org/draft-07/schema#",
   "type": "object",
   "properties": {
-    "parties": {
-      "type": "object",
-      "properties": {
-        "N": {
-          "type": "integer",
-          "description": "Number of parties"
+    "participants": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "type": {
+            "type": "string",
+            "enum": ["human", "machine"]
+          },
+          "initiatedConversation": {
+            "type": "boolean"
+          },
+          "id": {
+            "type": "object",
+            "properties": {
+              "name": {
+                "type": ["string", "null"]
+              },
+              "phone": {
+                "type": "string",
+                "pattern": "^\\+\\d{11}$"
+              }
+            },
+            "required": ["phone"]
+          }
         },
-        "from": {
-          "type": "string",
-          "description": "Identifier for the initiating party"
-        },
-        "to": {
-          "type": "string",
-          "description": "Identifier for the receiving party"
-        }
-      },
+        "required": ["type", "initiatedConversation", "id"]
"id"] + } + }, "duration": { - "type": "integer", - "description": "Duration of recording in milliseconds" + "type": "integer" }, - "url": { - "type": "string", - "description": "Where recording is located", - "format": "uri" - }, - "conversation": { + "transcript": { "type": "object", "properties": { - "as heard": { - "type": "object", - "properties": { - "full transcript": { - "type": "string", - "description": "Transcript as heard during the conversation" - }, - "confidence": { - "type": "string", - "description": "confidence score for transcription as heard" - }, - "transcription vendor": { - "type": "string", - "description": "transcription vendor realtime" - }, - "timestamps": { - "type": "array", - "items": { - "type": "object", - "properties": { - "word": { - "type": "string", - "description": "Word in the as heard transcript" - }, - "startTime": { - "type": "string", - "description": "Start time of the word", - "format": "date-time" - }, - "endTime": { - "type": "string", - "description": "End time of the word", - "format": "date-time" - }, - "confidence": { - "type": "number", - "description": "Confidence level of the word" - } - }, - "required": ["word", "startTime", "endTime", "confidence"] - } - } - }, - "required": ["full transcript", "timestamps"] + "vendor": { + "type": "string" }, - "after the fact": { - "type": "object", - "properties": { - "full transcript": { - "type": "string", - "description": "Transcript generated after analyzing the conversation" - }, - "confidence": { - "type": "string", - "description": "confidence score for transcription after the fact" - }, - "transcription vendor": { - "type": "string", - "description": "transcription vendor used for after the fact processing" - }, - "timestamps": { - "type": "array", - "items": { - "type": "object", - "properties": { - "word": { - "type": "string", - "description": "Word in the after the fact transcript" + "model": { + "type": "string" + }, + "channels": { + "type": "integer" + }, + "createdAt": { + "type": "string", + "format": "date-time" + }, + "speechEvents": { + "type": "array", + "items": { + "type": "object", + "properties": { + "timestamp": { + "type": "number" + }, + "duration": { + "type": "number" + }, + "startTime": { + "type": "number" + }, + "endTime": { + "type": "number" + }, + "speaker": { + "type": "integer" + }, + "transcript": { + "type": "string" + }, + "words": { + "type": "array", + "items": { + "type": "object", + "properties": { + "word": { + "type": "string" + }, + "start": { + "type": "number" + }, + "end": { + "type": "number" + }, + "confidence": { + "type": "number" + } }, - "startTime": { - "type": "string", - "description": "Start time of the word", - "format": "date-time" - }, - "endTime": { - "type": "string", - "description": "End time of the word", - "format": "date-time" - }, - "confidence": { - "type": "number", - "description": "Confidence level of the word" - } - }, - "required": ["word", "startTime", "endTime", "confidence"] + "required": ["word", "start", "end", "confidence"] + } } - } - }, - "required": ["full transcript", "timestamps"] + }, + "required": ["timestamp", "duration", "startTime", "endTime", "speaker", "transcript", "words"] + } + }, + "sentiment": { + "type": "string", + "enum": ["positive", "negative", "neutral"] + }, + "sentimentScore": { + "type": "number" + }, + "totalDuration": { + "type": "integer" } }, - "required": ["as heard", "after the fact"] + "required": ["vendor", "model", "channels", "createdAt", "speechEvents", "sentiment", 
"sentimentScore", "totalDuration"] } }, - "required": ["body", "parties", "duration", "url", "conversation"] + "required": ["participants", "duration", "transcript"] }