Merge pull request #13 from jambonz/feat/aws-v3

fix: custom tts support multiple audio types
2026-01-25 02:08:26 +00:00 · 2023-03-15 08:51:25 -04:00
parent 2f3a766713 0e883cf82a
commit 17305325ff
2 changed files with 35 additions and 9 deletions
--- a/lib/synth-audio.js
+++ b/lib/synth-audio.js
@@ -155,7 +155,8 @@ async function synthAudio(client, logger, stats, { account_sid,
        audioBuffer = await synthWellSaid(logger, {credentials, stats, language, voice, text, filePath});
        break;
      case vendor.startsWith('custom') ? vendor : 'cant_match_value':
-        audioBuffer = await synthCustomVendor(logger, {credentials, stats, language, voice, text});
+        ({ audioBuffer, filePath } = await synthCustomVendor(logger,
+          {credentials, stats, language, voice, text, filePath}));
        break;
      default:
        assert(`synthAudio: unsupported speech vendor ${vendor}`);
@@ -440,30 +441,56 @@ const synthNvidia = async(client, logger, {credentials, stats, language,  voice,
 };


-// CustomVendor accept only mp3
-const synthCustomVendor = async(logger, {credentials, stats, language, voice, text}) => {
+const synthCustomVendor = async(logger, {credentials, stats, language, voice, text, filePath}) => {
  const {vendor, auth_token, custom_tts_url} = credentials;

  try {
-    const post = bent('POST', 'buffer', {
+    const post = bent('POST', {
      'Authorization': `Bearer ${auth_token}`,
-      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });

-    const mp3 = await post(custom_tts_url, {
+    const response = await post(custom_tts_url, {
      language,
-      format: 'audio/mpeg',
      voice,
      type: text.startsWith('<speak>') ? 'ssml' : 'text',
      text
    });

-    return mp3;
+    const regex = /\.[^\.]*$/g;
+    const mime = response.headers['content-type'];
+    const buffer = await response.arrayBuffer();
+    return {
+      audioBuffer: buffer,
+      filePath: filePath.replace(regex, getFileExtFromMime(mime))
+    };
  } catch (err) {
    logger.info({err}, `Vendor ${vendor} returned error`);
    throw err;
  }
 };

+const getFileExtFromMime = (mime) => {
+  switch (mime) {
+    case 'audio/wav':
+    case 'audio/x-wav':
+      return '.wav';
+    case /audio\/l16.*rate=8000/.test(mime) ? mime : 'cant match value':
+      return '.r8';
+    case /audio\/l16.*rate=16000/.test(mime) ? mime : 'cant match value':
+      return '.r16';
+    case /audio\/l16.*rate=24000/.test(mime) ? mime : 'cant match value':
+      return '.r24';
+    case /audio\/l16.*rate=32000/.test(mime) ? mime : 'cant match value':
+      return '.r32';
+    case /audio\/l16.*rate=48000/.test(mime) ? mime : 'cant match value':
+      return '.r48';
+    case 'audio/mpeg':
+    case 'audio/mp3':
+      return '.mp3';
+    default:
+      return '.wav';
+  }
+};
+
 module.exports = synthAudio;
--- a/test/synth.js
+++ b/test/synth.js
@@ -348,7 +348,6 @@ test('Custom Vendor speech synth tests', async(t) => {
    let obj = await getJSON(`http://127.0.0.1:3100/lastRequest/somethingnew`);
    t.ok(obj.headers.Authorization == 'Bearer some_jwt_token', 'Custom Vendor Authentication Header is correct');
    t.ok(obj.body.language == 'en-US', 'Custom Vendor Language is correct');
-    t.ok(obj.body.format == 'audio/mpeg', 'Custom Vendor format is correct');
    t.ok(obj.body.voice == 'English-US.Female-1', 'Custom Vendor voice is correct');
    t.ok(obj.body.type == 'text', 'Custom Vendor type is correct');
    t.ok(obj.body.text == 'This is a test.  This is only a test', 'Custom Vendor text is correct');