Merge pull request #7 from jambonz/feat/custom-vendor

custom tts vendor
2026-07-04 19:31:49 +00:00 · 2023-03-06 10:13:27 -05:00
parent 3e503996e6 95b055113d
commit 73e4122c85
7 changed files with 1317 additions and 3 deletions
@@ -60,8 +60,9 @@ async function synthAudio(client, logger, stats, {
  let rtt;
  logger = logger || noopLogger;

-  assert.ok(['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'ibm'].includes(vendor),
-    `synthAudio supported vendors are google, aws, microsoft, nuance, nvidia and wellsaid, not ${vendor}`);
+  assert.ok(['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'ibm'].includes(vendor) ||
+  vendor.startsWith('custom'),
+  `synthAudio supported vendors are google, aws, microsoft, nuance, nvidia and wellsaid, not ${vendor}`);
  if ('google' === vendor) {
    assert.ok(language, 'synthAudio requires language when google is used');
  }
@@ -91,6 +92,8 @@ async function synthAudio(client, logger, stats, {
    language = 'en-US'; // WellSaid only supports English atm
    assert.ok(voice, 'synthAudio requires voice when wellsaid is used');
    assert.ok(!text.startsWith('<speak'), 'wellsaid does not support SSML tags');
+  } else  if (vendor.startsWith('custom')) {
+    assert.ok(credentials.custom_tts_url, `synthAudio requires custom_tts_url in credentials when ${vendor} is used`);
  }

  const key = makeSynthKey({
@@ -151,6 +154,9 @@ async function synthAudio(client, logger, stats, {
      case 'wellsaid':
        audioBuffer = await synthWellSaid(logger, {credentials, stats, language, voice, text, filePath});
        break;
+      case vendor.startsWith('custom') ? vendor : 'cant_match_value':
+        audioBuffer = await synthCustomVendor(logger, {credentials, stats, language, voice, text});
+        break;
      default:
        assert(`synthAudio: unsupported speech vendor ${vendor}`);
    }
@@ -433,4 +439,31 @@ const synthNvidia = async(client, logger, {credentials, stats, language,  voice,
  });
 };

+
+/**
+ * Synthesize speech by POSTing to a customer-supplied TTS webhook.
+ *
+ * The request body is JSON ({language, format, voice, type, text}). The request
+ * asks for audio/mpeg, so the endpoint is expected to answer with mp3 audio; the
+ * response body is handed back to the caller as a buffer.
+ *
+ * @param {object} logger - logger exposing info()
+ * @param {object} opts
+ * @param {object} opts.credentials - {vendor, auth_token, custom_tts_url}
+ * @param {object} opts.stats - accepted but not used by this function
+ * @param {string} opts.language
+ * @param {string} opts.voice
+ * @param {string} opts.text - plain text, or SSML when it starts with a <speak element
+ * @returns {Promise<Buffer>} raw response body
+ * @throws rethrows any error raised by the HTTP request after logging it
+ */
+const synthCustomVendor = async(logger, {credentials, stats, language, voice, text}) => {
+  const {vendor, auth_token, custom_tts_url} = credentials;
+
+  try {
+    const headers = {
+      'Accept': 'audio/mpeg',
+      'Content-Type': 'application/json'
+    };
+    // only send a bearer token when one is configured, rather than "Bearer undefined"
+    if (auth_token) headers.Authorization = `Bearer ${auth_token}`;
+
+    const post = bent('POST', 'buffer', headers);
+
+    const mp3 = await post(custom_tts_url, {
+      language,
+      format: 'audio/mpeg',
+      voice,
+      // match any <speak ...> root element, including ones that carry attributes
+      type: text.startsWith('<speak') ? 'ssml' : 'text',
+      text
+    });
+
+    return mp3;
+  } catch (err) {
+    // credentials may not carry a vendor label, so fall back to a generic one
+    logger.info({err}, `Vendor ${vendor || 'custom'} returned error`);
+    throw err;
+  }
+};
+
 module.exports = synthAudio;
@@ -1,10 +1,20 @@
-version: '3'
+version: '3.9'
+
+networks:
+  speech-utils:
+    driver: bridge
+    ipam:
+      config:
+        - subnet: 172.41.0.0/16

 services:
  redis:
    image: redis:alpine
    ports:
      - "3379:6379"
+    networks:
+      speech-utils:
+        ipv4_address: 172.41.0.5

  redis-auth:
    image: redis:alpine
@@ -13,3 +23,17 @@ services:
      - "3380:6379"
    volumes:
      - ./tmp:/tmp
+    networks:
+      speech-utils:
+        ipv4_address: 172.41.0.6
+
+  webhook-tts-scaffold:
+    image: jambonz/webhook-tts-test-scaffold:latest
+    ports:
+      - "3100:3000/tcp"
+    volumes:
+      - ./test-apps:/tmp
+    networks:
+      speech-utils:
+        ipv4_address: 172.41.0.10
+    
@@ -4,6 +4,8 @@ const opts = config.get('redis');
 const fs = require('fs');
 const {makeSynthKey} = require('../lib/utils');
 const logger = require('pino')();
+const bent = require('bent');
+const getJSON = bent('json')

 process.on('unhandledRejection', (reason, p) => {
  console.log('Unhandled Rejection at: Promise', p, 'reason:', reason);
@@ -326,6 +328,53 @@ test('IBM watson speech synth tests', async(t) => {
  client.quit();
 });

+// Exercises the custom vendor path against the webhook scaffold published on
+// 127.0.0.1:3100 and checks the request the scaffold recorded for each call.
+test('Custom Vendor speech synth tests', async(t) => {
+  const fn = require('..');
+  const {synthAudio, client} = fn(opts, logger);
+
+  try {
+    // named "result" so the module-level redis "opts" is not shadowed
+    let result = await synthAudio(stats, {
+      vendor: 'custom:somethingnew',
+      credentials: {
+        use_for_tts: 1,
+        custom_tts_url: 'http://127.0.0.1:3100/somethingnew',
+        auth_token: 'some_jwt_token'
+      },
+      language: 'en-US',
+      voice: 'English-US.Female-1',
+      text: 'This is a test.  This is only a test',
+    });
+    t.ok(!result.servedFromCache, `successfully synthesized custom vendor audio to ${result.filePath}`);
+
+    // plain text request: every recorded field must match exactly
+    let lastRequest = await getJSON('http://127.0.0.1:3100/lastRequest/somethingnew');
+    t.ok(lastRequest.headers.Authorization === 'Bearer some_jwt_token',
+      'Custom Vendor Authentication Header is correct');
+    t.ok(lastRequest.body.language === 'en-US', 'Custom Vendor Language is correct');
+    t.ok(lastRequest.body.format === 'audio/mpeg', 'Custom Vendor format is correct');
+    t.ok(lastRequest.body.voice === 'English-US.Female-1', 'Custom Vendor voice is correct');
+    t.ok(lastRequest.body.type === 'text', 'Custom Vendor type is correct');
+    t.ok(lastRequest.body.text === 'This is a test.  This is only a test', 'Custom Vendor text is correct');
+
+    result = await synthAudio(stats, {
+      vendor: 'custom:somethingnew2',
+      credentials: {
+        use_for_tts: 1,
+        custom_tts_url: 'http://127.0.0.1:3100/somethingnew2',
+        auth_token: 'some_jwt_token'
+      },
+      language: 'en-US',
+      voice: 'English-US.Female-1',
+      text: '<speak>This is a test.  This is only a test</speak>',
+    });
+    t.ok(!result.servedFromCache, `successfully synthesized Custom Vendor audio to ${result.filePath}`);
+
+    // SSML request: the type flag must flip and the markup must pass through untouched
+    lastRequest = await getJSON('http://127.0.0.1:3100/lastRequest/somethingnew2');
+    t.ok(lastRequest.body.type === 'ssml', 'Custom Vendor type is correct');
+    t.ok(lastRequest.body.text === '<speak>This is a test.  This is only a test</speak>',
+      'Custom Vendor text is correct');
+  } catch (err) {
+    console.error(err);
+    t.end(err);
+  }
+  client.quit();
+});
+
 test('TTS Cache tests', async(t) => {
  const fn = require('..');
  const {purgeTtsCache, client} = fn(opts, logger);
@@ -0,0 +1,23 @@
+# Shared base stage: Node on Alpine plus build-base and python3
+# (presumably needed to compile native addons during npm ci - confirm)
+FROM --platform=linux/amd64 node:18.6.0-alpine AS base
+
+RUN apk --update --no-cache add --virtual .builds-deps build-base python3
+
+WORKDIR /opt/app/
+
+# Build stage: install the locked dependencies before copying the sources
+FROM base AS build
+
+COPY package.json package-lock.json ./
+
+RUN npm ci
+
+COPY . .
+
+# Final stage: take the installed app from the build stage
+FROM base
+
+COPY --from=build /opt/app /opt/app/
+
+# NODE_ENV is supplied at build time and persisted into the image environment
+ARG NODE_ENV
+
+ENV NODE_ENV=$NODE_ENV
+
+CMD [ "node", "app.js" ]
@@ -0,0 +1,125 @@
+const express = require('express');
+const app = express();
+const Websocket = require('ws');
+const listenPort = process.env.HTTP_PORT || 3000;
+// key (first path segment) -> array of recorded {url, headers, body} requests
+const hook_mapping = new Map();
+// websocket request path -> total length of binary data received, stored on close
+const ws_packet_count = new Map();
+// websocket request path -> most recent JSON message parsed from a non-binary frame
+const ws_metadata = new Map();
+
+/**
+ * Connection handler for the websocket server.
+ *
+ * Non-binary frames are parsed as JSON and stored in ws_metadata under the
+ * request path; binary frames only have their length accumulated. The running
+ * total is written to ws_packet_count when the socket closes.
+ *
+ * @param {object} socket - the accepted websocket
+ * @param {object} req - the HTTP upgrade request; its url is used as the map key
+ */
+const recvAudio = (socket, req) => {
+  const path = req.url;
+  let received = 0;
+  console.log('received websocket connection');
+
+  // a text frame is expected to carry JSON; parse failures are only logged
+  const storeMetadata = (data) => {
+    try {
+      const msg = JSON.parse(data);
+      console.log({msg}, 'received websocket message');
+      ws_metadata.set(path, msg);
+    } catch (err) {
+      console.log({err}, 'error parsing websocket message');
+    }
+  };
+
+  socket.on('message', (data, isBinary) => {
+    if (isBinary) {
+      received += data.length;
+      return;
+    }
+    storeMetadata(data);
+  });
+
+  socket.on('error', (err) => {
+    console.log({err}, 'listen websocket: error');
+  });
+
+  socket.on('close', () => {
+    ws_packet_count.set(path, received);
+  });
+};
+
+// The websocket server is created with noServer and is handed upgrade
+// requests manually from the HTTP server's 'upgrade' event below.
+const wsServer = new Websocket.Server({noServer: true});
+wsServer.setMaxListeners(0);
+wsServer.on('connection', (socket, req) => recvAudio(socket, req));
+
+const onListening = () => {
+  console.log(`sample jambones app server listening on ${listenPort}`);
+};
+const server = app.listen(listenPort, onListening);
+
+server.on('upgrade', (request, socket, head) => {
+  console.log('received upgrade request');
+  wsServer.handleUpgrade(request, socket, head, (ws) => {
+    wsServer.emit('connection', ws, request);
+  });
+});
+
+// body parsers for form-encoded and JSON payloads
+app.use(express.urlencoded({extended: true}));
+app.use(express.json());
+
+/*
+ * Catch-all webhook: record the request under :key so it can be fetched
+ * later through the /requests and /lastRequest routes, and answer with a
+ * fixed JSON payload.
+ */
+app.all('/:key', (req, res) => {
+  const {key} = req.params;
+  // app.all matches every HTTP method, so log the one actually used
+  console.log(req.body, `${req.method} /${key}`);
+  addRequestToMap(key, req, hook_mapping);
+  return res.json({audio: 'content'});
+});
+
+// Fetch Requests: every request recorded under :key, or 404 when none exist
+app.get('/requests/:key', (req, res) => {
+  const {key} = req.params;
+  if (!hook_mapping.has(key)) {
+    return res.sendStatus(404);
+  }
+  return res.json(hook_mapping.get(key));
+});
+
+// Most recent request recorded under :key, or 404 when none exist
+app.get('/lastRequest/:key', (req, res) => {
+  const {key} = req.params;
+  if (!hook_mapping.has(key)) {
+    return res.sendStatus(404);
+  }
+  const recorded = hook_mapping.get(key);
+  const newest = recorded[recorded.length - 1];
+  return res.json(newest);
+});
+
+// WS Fetch: total stored for the websocket path "/:key", or 404 when unknown
+app.get('/ws_packet_count/:key', (req, res) => {
+  const path = `/${req.params.key}`;
+  console.log(path, ws_packet_count);
+  return ws_packet_count.has(path)
+    ? res.json({count: ws_packet_count.get(path)})
+    : res.sendStatus(404);
+});
+
+// Last JSON message stored for the websocket path "/:key", or 404 when unknown
+app.get('/ws_metadata/:key', (req, res) => {
+  const key = `/${req.params.key}`;
+  // log the map this route actually reads
+  console.log(key, ws_metadata);
+  if (ws_metadata.has(key)) {
+    return res.json({ metadata: ws_metadata.get(key) });
+  } else {
+    return res.sendStatus(404);
+  }
+});
+
+/**
+ * Append a snapshot of an incoming request to the list stored under key.
+ *
+ * @param {string} key - map key the request is filed under
+ * @param {object} req - request providing rawHeaders, url and body
+ * @param {Map} map - key -> array of {url, headers, body} snapshots
+ */
+function addRequestToMap(key, req, map) {
+  // rawHeaders alternates name, value, name, value...; a repeated name keeps its last value
+  const headerPairs = new Map();
+  for (let i = 0; i < req.rawHeaders.length; i += 2) {
+    headerPairs.set(req.rawHeaders[i], req.rawHeaders[i + 1]);
+  }
+
+  const snapshot = {
+    url: req.url,
+    headers: Object.fromEntries(headerPairs),
+    body: req.body
+  };
+
+  if (!map.has(key)) {
+    map.set(key, [snapshot]);
+    return;
+  }
+  map.get(key).push(snapshot);
+}
@@ -0,0 +1,15 @@
+{
+  "name": "webhook_tts",
+  "version": "1.0.0",
+  "description": "simple webhook tts for test purposes",
+  "main": "app.js",
+  "scripts": {
+    "start": "node app"
+  },
+  "author": "Dave Horton",
+  "license": "MIT",
+  "dependencies": {
+    "express": "^4.18.2",
+    "ws": "^8.12.0"
+  }
+}