diff --git a/lib/routes/api/tts-cache.js b/lib/routes/api/tts-cache.js index 9a0ff99..fb46559 100644 --- a/lib/routes/api/tts-cache.js +++ b/lib/routes/api/tts-cache.js @@ -9,6 +9,7 @@ const {DbErrorBadRequest} = require('../../utils/errors'); const Account = require('../../models/account'); const sysError = require('../error'); const { getSpeechCredential, decryptCredential } = require('../../utils/speech-utils'); +const PCMToMP3Encoder = require('../../record/encoder'); router.delete('/', async(req, res) => { const {purgeTtsCache} = req.app.locals; @@ -38,6 +39,7 @@ router.post('/Synthesize', async(req, res) => { try { const accountSid = parseAccountSid(req); const body = req.body; + const encodingMp3 = req.body.encodingMp3 || false; if (!body.speech_credential_sid || !body.text || !body.language || !body.voice) { throw new DbErrorBadRequest('speech_credential_sid, text, language, voice are all required'); } @@ -85,18 +87,31 @@ router.post('/Synthesize', async(req, res) => { disableTtsCache: false }); - const stat = fs.statSync(filePath); + let contentType = 'audio/mpeg'; + + let readStream = fs.createReadStream(filePath); + if (['nuance', 'nvidia'].includes(cred.vendor) || + ( + process.env.JAMBONES_TTS_TRIM_SILENCE && + ['microsoft', 'azure'].includes(cred.vendor) + ) + ) { + if (encodingMp3) { + readStream = readStream + .pipe(new PCMToMP3Encoder({ + channels: 1, + sampleRate: 8000, + bitRate: 128 + }, logger)); + } else { + contentType = 'application/octet-stream'; + } + } res.writeHead(200, { - 'Content-Type': 'audio/mpeg', - 'Content-Length': stat.size, + 'Content-Type': contentType, }); - - const readStream = fs.createReadStream(filePath); - // We replaced all the event handlers with a simple call to readStream.pipe() readStream.pipe(res); - readStream.on('end', () => { - // Delete the file after it's been read fs.unlink(filePath, (err) => { if (err) throw err; logger.info(`${filePath} was deleted`); diff --git a/lib/swagger/swagger.yaml b/lib/swagger/swagger.yaml index 15c1321..0c3588d 100644 --- a/lib/swagger/swagger.yaml +++ b/lib/swagger/swagger.yaml @@ -4320,6 +4320,10 @@ paths: type: string description: voice ID example: en-US-Standard-C + encodingMp3: + type: boolean + description: convert audio to mp3. + example: true required: - speech_credential_sid - text