mirror of
https://github.com/jambonz/speech-utils.git
synced 2026-07-04 19:31:49 +00:00
chore: deprecate + remove verbio, nuance, playht speech vendor support (#144)
* chore: deprecate and remove verbio, nuance speech vendor support Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * chore: also deprecate and remove PlayHT speech vendor PlayHT was acquired and no longer provides the service. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+4
-257
@@ -16,26 +16,10 @@ const {
|
||||
} = sdk;
|
||||
const {
|
||||
makeSynthKey,
|
||||
createNuanceClient,
|
||||
createKryptonClient,
|
||||
createRivaClient,
|
||||
noopLogger,
|
||||
makeFilePath,
|
||||
makePlayhtKey
|
||||
makeFilePath
|
||||
} = require('./utils');
|
||||
const getNuanceAccessToken = require('./get-nuance-access-token');
|
||||
const getVerbioAccessToken = require('./get-verbio-token');
|
||||
const {
|
||||
SynthesisRequest,
|
||||
Voice,
|
||||
AudioFormat,
|
||||
AudioParameters,
|
||||
PCM,
|
||||
Input,
|
||||
Text,
|
||||
SSML,
|
||||
EventParameters
|
||||
} = require('../stubs/nuance/synthesizer_pb');
|
||||
const {SynthesizeSpeechRequest} = require('../stubs/riva/proto/riva_tts_pb');
|
||||
const {AudioEncoding} = require('../stubs/riva/proto/riva_audio_pb');
|
||||
const debug = require('debug')('jambonz:realtimedb-helpers');
|
||||
@@ -95,10 +79,10 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
let rtt;
|
||||
logger = logger || noopLogger;
|
||||
|
||||
assert.ok(['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'elevenlabs',
|
||||
'whisper', 'deepgram', 'playht', 'rimelabs', 'verbio', 'cartesia', 'inworld', 'resemble'].includes(vendor) ||
|
||||
assert.ok(['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nvidia', 'elevenlabs',
|
||||
'whisper', 'deepgram', 'rimelabs', 'cartesia', 'inworld', 'resemble'].includes(vendor) ||
|
||||
vendor.startsWith('custom'),
|
||||
`synthAudio supported vendors are google, aws, microsoft, nuance, nvidia and wellsaid ..etc, not ${vendor}`);
|
||||
`synthAudio supported vendors are google, aws, microsoft, nvidia and wellsaid ..etc, not ${vendor}`);
|
||||
if ('google' === vendor) {
|
||||
assert.ok(language, 'synthAudio requires language when google is used');
|
||||
}
|
||||
@@ -109,13 +93,6 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
assert.ok(language || deploymentId, 'synthAudio requires language when microsoft is used');
|
||||
assert.ok(voice || deploymentId, 'synthAudio requires voice when microsoft is used');
|
||||
}
|
||||
else if ('nuance' === vendor) {
|
||||
assert.ok(voice, 'synthAudio requires voice when nuance is used');
|
||||
if (!credentials.nuance_tts_uri) {
|
||||
assert.ok(credentials.client_id, 'synthAudio requires client_id in credentials when nuance is used');
|
||||
assert.ok(credentials.secret, 'synthAudio requires client_id in credentials when nuance is used');
|
||||
}
|
||||
}
|
||||
else if ('nvidia' === vendor) {
|
||||
assert.ok(voice, 'synthAudio requires voice when nvidia is used');
|
||||
assert.ok(language, 'synthAudio requires language when nvidia is used');
|
||||
@@ -129,11 +106,6 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
assert.ok(voice, 'synthAudio requires voice when elevenlabs is used');
|
||||
assert.ok(credentials.api_key, 'synthAudio requires api_key when elevenlabs is used');
|
||||
assert.ok(credentials.model_id, 'synthAudio requires model_id when elevenlabs is used');
|
||||
} else if ('playht' === vendor) {
|
||||
assert.ok(voice, 'synthAudio requires voice when playht is used');
|
||||
assert.ok(credentials.api_key, 'synthAudio requires api_key when playht is used');
|
||||
assert.ok(credentials.user_id, 'synthAudio requires user_id when playht is used');
|
||||
assert.ok(credentials.voice_engine, 'synthAudio requires voice_engine when playht is used');
|
||||
} else if ('inworld' === vendor) {
|
||||
assert.ok(voice, 'synthAudio requires voice when inworld is used');
|
||||
assert.ok(credentials.api_key, 'synthAudio requires api_key when inworld is used');
|
||||
@@ -148,10 +120,6 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
assert.ok(credentials.api_key, 'synthAudio requires api_key when whisper is used');
|
||||
} else if (vendor.startsWith('custom')) {
|
||||
assert.ok(credentials.custom_tts_url, `synthAudio requires custom_tts_url in credentials when ${vendor} is used`);
|
||||
} else if ('verbio' === vendor) {
|
||||
assert.ok(voice, 'synthAudio requires voice when verbio is used');
|
||||
assert.ok(credentials.client_id, 'synthAudio requires client_id when verbio is used');
|
||||
assert.ok(credentials.client_secret, 'synthAudio requires client_secret when verbio is used');
|
||||
} else if ('deepgram' === vendor) {
|
||||
if (!credentials.deepgram_tts_uri) {
|
||||
assert.ok(credentials.api_key, 'synthAudio requires api_key when deepgram is used');
|
||||
@@ -216,10 +184,6 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
audioData = await synthMicrosoft(logger, {credentials, stats, language, voice, key, text, deploymentId,
|
||||
renderForCaching, disableTtsStreaming, disableTtsCache});
|
||||
break;
|
||||
case 'nuance':
|
||||
model = model || 'enhanced';
|
||||
audioData = await synthNuance(client, logger, {credentials, stats, voice, model, key, text});
|
||||
break;
|
||||
case 'nvidia':
|
||||
audioData = await synthNvidia(client, logger, {credentials, stats, language, voice, model, key, text,
|
||||
renderForCaching, disableTtsStreaming, disableTtsCache});
|
||||
@@ -232,11 +196,6 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
|
||||
disableTtsCache});
|
||||
break;
|
||||
case 'playht':
|
||||
audioData = await synthPlayHT(client, logger, {
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
|
||||
disableTtsCache});
|
||||
break;
|
||||
case 'cartesia':
|
||||
audioData = await synthCartesia(logger, {
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
|
||||
@@ -257,11 +216,6 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
credentials, stats, voice, key, text, instructions, renderForCaching, disableTtsStreaming,
|
||||
disableTtsCache});
|
||||
break;
|
||||
case 'verbio':
|
||||
audioData = await synthVerbio(client, logger, {
|
||||
credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache});
|
||||
if (audioData?.filePath) return audioData;
|
||||
break;
|
||||
case 'deepgram':
|
||||
audioData = await synthDeepgram(logger, {credentials, stats, model, key, text,
|
||||
renderForCaching, disableTtsStreaming, disableTtsCache});
|
||||
@@ -725,70 +679,6 @@ const synthWellSaid = async(logger, {credentials, stats, language, voice, gender
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Synthesize speech with Nuance using a single unary gRPC call.
 *
 * When `credentials.nuance_tts_uri` is set, a client is created for that URI via
 * `createKryptonClient`; otherwise an access token is obtained with `client_id` / `secret`
 * and a client is created via `createNuanceClient`.
 *
 * @param {object} client - passed through to getNuanceAccessToken
 * @param {object} logger - logger exposing `info`
 * @param {object} params
 * @param {object} params.credentials - {client_id, secret, nuance_tts_uri}
 * @param {object} params.stats - accepted for signature parity with the other vendors; not used here
 * @param {string} params.voice - voice name set on the request
 * @param {string} params.model - voice model set on the request
 * @param {string} params.text - plain text, or SSML when it starts with '<speak'
 * @returns {Promise<{audioContent: Buffer, extension: string, sampleRate: number}>}
 * @throws rejects with the transport error, or with an Error carrying `code`, `message`
 *   and `details` when the response status code is not 200
 */
const synthNuance = async(client, logger, {credentials, stats, voice, model, text}) => {
  const {client_id, secret, nuance_tts_uri} = credentials;

  let nuanceClient;
  if (nuance_tts_uri) {
    nuanceClient = await createKryptonClient(nuance_tts_uri);
  } else {
    /* get a nuance access token */
    const {access_token} = await getNuanceAccessToken(client, logger, client_id, secret, 'tts');
    nuanceClient = await createNuanceClient(access_token);
  }

  /* input is SSML only when the text opens with a <speak element */
  const input = new Input();
  if (text.startsWith('<speak')) {
    const ssml = new SSML();
    ssml.setText(text);
    input.setSsml(ssml);
  } else {
    const plain = new Text();
    plain.setText(text);
    input.setText(plain);
  }

  const sampleRate = 8000;
  const pcm = new PCM();
  pcm.setSampleRateHz(sampleRate);
  const format = new AudioFormat();
  format.setPcm(pcm);
  const audioParams = new AudioParameters();
  audioParams.setAudioFormat(format);

  const voiceSpec = new Voice();
  voiceSpec.setName(voice);
  voiceSpec.setModel(model);

  const request = new SynthesisRequest();
  request.setVoice(voiceSpec);
  request.setAudioParams(audioParams);
  request.setInput(input);
  request.setEventParams(new EventParameters());
  request.setUserId('jambonz');

  return new Promise((resolve, reject) => {
    nuanceClient.unarySynthesize(request, (err, response) => {
      if (err) {
        /* report through the injected logger rather than writing to stderr */
        logger.info({err}, 'synth Nuance returned error');
        return reject(err);
      }
      const status = response.getStatus();
      const code = status.getCode();
      if (code !== 200) {
        /* reject with a real Error (stack trace) while keeping code/message/details for callers */
        const failure = new Error(status.getMessage());
        failure.code = code;
        failure.details = status.getDetails();
        return reject(failure);
      }
      resolve({
        audioContent: Buffer.from(response.getAudio()),
        extension: 'r8',
        sampleRate
      });
    });
  });
};
|
||||
|
||||
const synthNvidia = async(client, logger, {
|
||||
credentials, stats, language, voice, model, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
|
||||
}) => {
|
||||
@@ -954,101 +844,6 @@ const synthElevenlabs = async(logger, {
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Synthesize speech with PlayHT.
 *
 * For the 'Play3.0' voice engine the synthesis URL is obtained from the PlayHT auth endpoint
 * and cached through `client` until shortly before it expires. Unless streaming is disabled
 * (JAMBONES_DISABLE_TTS_STREAMING, `renderForCaching` or `disableTtsStreaming`), no audio is
 * fetched here: a `say:{...}` descriptor carrying the request parameters is returned instead.
 *
 * @param {object} client - cache client; `get(key)` and `set(key, value, 'EX', seconds)` are used
 *   (presumably a Redis client - verify against caller)
 * @param {object} logger - logger exposing `info`
 * @param {object} params
 * @param {object} params.credentials - {api_key, user_id, voice_engine, playht_tts_uri, options}
 *   where `options` is a JSON string used when no per-request options are given
 * @param {object} [params.options] - per-request options; when non-empty they replace the
 *   options stored with the credentials
 * @param {object} params.stats - stats client exposing `increment`
 * @param {string} params.voice
 * @param {string} params.language - sent to the REST API only for the 'Play3.0' engine
 * @param {string} params.key - playback id placed in the streaming descriptor
 * @param {string} params.text - text to synthesize
 * @param {boolean} params.renderForCaching - when truthy, fetch audio instead of streaming
 * @param {boolean} params.disableTtsStreaming - when truthy, fetch audio instead of streaming
 * @param {boolean} params.disableTtsCache - when truthy, descriptor carries write_cache_file=0
 * @returns {Promise<object>} `{filePath, servedFromCache, rtt}` when streaming, otherwise
 *   `{audioContent, extension, sampleRate}`
 * @throws rethrows any error from the auth or synthesis request after logging it and
 *   incrementing the `tts.count` stat with `accepted:no`
 */
const synthPlayHT = async(client, logger, {
  credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, user_id, voice_engine, playht_tts_uri, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');

  let synthesizeUrl = playht_tts_uri ? `${playht_tts_uri}/api/v2/tts/stream` : 'https://api.play.ht/api/v2/tts/stream';
  // For the Play3.0 engine the synthesize URL is obtained from the authentication endpoint
  if (voice_engine === 'Play3.0') {
    try {
      const cacheKey = makePlayhtKey(api_key);
      const cachedUrl = await client.get(cacheKey);
      if (cachedUrl) {
        // Use cached URL
        synthesizeUrl = cachedUrl;
      } else {
        const authPost = bent('https://api.play.ht', 'POST', 'json', 201, {
          'AUTHORIZATION': api_key,
          'X-USER-ID': user_id,
          'Accept': 'application/json'
        });
        const {inference_address, expires_at_ms} = await authPost('/api/v3/auth');
        synthesizeUrl = inference_address;
        /* cache until 30 seconds before expiry; skip caching when the remaining lifetime is not a
           positive number, so an unusable TTL cannot turn a successful auth into a failure */
        const expiry = Math.floor((expires_at_ms - Date.now()) / 1000 - 30);
        if (expiry > 0) {
          await client.set(cacheKey, inference_address, 'EX', expiry);
        }
      }
    } catch (err) {
      logger.info({err}, 'synth PlayHT returned error for authentication version 3.0');
      stats.increment('tts.count', ['vendor:playht', 'accepted:no']);
      throw err;
    }
  }

  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    const pairs = [
      `api_key=${api_key}`,
      `playback_id=${key}`,
      `user_id=${user_id}`,
      'vendor=playht',
      `voice=${voice}`,
      `voice_engine=${voice_engine}`,
      `synthesize_url=${synthesizeUrl}`,
      `write_cache_file=${disableTtsCache ? 0 : 1}`,
      `language=${language}`
    ];
    /* optional tuning parameters, forwarded only when set to a truthy value.
       NOTE(review): `seed` was previously emitted under the key `style`, which looks like a
       copy/paste slip - confirm the consumer of this descriptor expects `seed` */
    const optionalNames = ['quality', 'speed', 'seed', 'temperature', 'emotion', 'voice_guidance',
      'style_guidance', 'text_guidance', 'top_p', 'repetition_penalty'];
    for (const name of optionalNames) {
      if (opts[name]) pairs.push(`${name}=${opts[name]}`);
    }
    const params = `{${pairs.join(',')}}`;

    return {
      filePath: `say:${params}${text.replace(/[\r\n]/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    /* auth headers are sent only for engines other than Play3.0 */
    const post = bent('POST', 'buffer', {
      ...(voice_engine !== 'Play3.0' && {
        'AUTHORIZATION': api_key,
        'X-USER-ID': user_id,
      }),
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });

    const audioContent = await post(synthesizeUrl, {
      text,
      ...(voice_engine === 'Play3.0' && { language }),
      voice,
      voice_engine,
      output_format: 'mp3',
      sample_rate: 8000,
      ...opts
    });
    return {
      audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synth PlayHT returned error');
    stats.increment('tts.count', ['vendor:playht', 'accepted:no']);
    throw err;
  }
};
|
||||
|
||||
const synthInworld = async(logger, {
|
||||
credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
|
||||
}) => {
|
||||
@@ -1174,54 +969,6 @@ const synthRimelabs = async(logger, {
|
||||
throw err;
|
||||
}
|
||||
};
|
||||
/**
 * Synthesize speech with Verbio.
 *
 * Unless streaming is disabled (env var JAMBONES_DISABLE_TTS_STREAMING, `renderForCaching`
 * or `disableTtsStreaming`), no audio is fetched here: a `say:{...}` descriptor carrying the
 * access token and request parameters is returned instead. Otherwise the audio is requested
 * from the Verbio REST API as 8k pcm16.
 *
 * @param {object} client - passed through to getVerbioAccessToken
 * @param {object} logger - logger exposing `info`
 * @param {object} params
 * @param {object} params.credentials - passed through to getVerbioAccessToken
 * @param {object} params.stats - stats client exposing `increment`
 * @param {string} params.voice - Verbio voice id
 * @param {string} params.key - playback id placed in the streaming descriptor
 * @param {string} params.text - text to synthesize, at most 2000 characters
 * @param {boolean} params.renderForCaching - when truthy, fetch audio instead of streaming
 * @param {boolean} params.disableTtsStreaming - when truthy, fetch audio instead of streaming
 * @param {boolean} params.disableTtsCache - when truthy, descriptor carries write_cache_file=0
 * @returns {Promise<object>} `{filePath, servedFromCache, rtt}` when streaming, otherwise
 *   `{audioContent, extension, sampleRate}`
 * @throws {Error} when text is longer than 2000 characters; rethrows any error from the
 *   synthesis request after logging it and incrementing the `tts.count` stat with `accepted:no`
 */
const synthVerbio = async(client, logger, {
  credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  //https://doc.speechcenter.verbio.com/#tag/Text-To-Speech-REST-API
  if (text.length > 2000) {
    throw new Error('Verbio cannot synthesize for the text length larger than 2000 characters');
  }
  const token = await getVerbioAccessToken(client, logger, credentials);

  const useStreaming = !process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming;
  if (useStreaming) {
    const params = '{' + [
      `access_token=${token.access_token}`,
      `playback_id=${key}`,
      'vendor=verbio',
      `voice=${voice}`,
      `write_cache_file=${disableTtsCache ? 0 : 1}`
    ].join(',') + '}';

    return {
      /* replace carriage returns as well as newlines, as synthPlayHT does for its descriptor */
      filePath: `say:${params}${text.replace(/[\r\n]/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const post = bent('https://us.rest.speechcenter.verbio.com', 'POST', 'buffer', {
      'Authorization': `Bearer ${token.access_token}`,
      'User-Agent': 'jambonz',
      'Content-Type': 'application/json'
    });
    const audioContent = await post('/api/v1/synthesize', {
      voice_id: voice,
      output_sample_rate: '8k',
      output_encoding: 'pcm16',
      text
    });
    return {
      audioContent,
      extension: 'r8',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synth Verbio returned error');
    stats.increment('tts.count', ['vendor:verbio', 'accepted:no']);
    throw err;
  }
};
|
||||
|
||||
const synthWhisper = async(logger, {credentials, stats, voice, key, text, instructions,
|
||||
renderForCaching, disableTtsStreaming, disableTtsCache}) => {
|
||||
const {api_key, model_id, baseURL, timeout, speed} = credentials;
|
||||
|
||||
Reference in New Issue
Block a user