mirror of
https://github.com/jambonz/jambonz-webapp.git
synced 2026-07-04 19:21:58 +00:00
Support whisper TTS (#346)
* support tts whisper * support tts whisper * wip * wip * fix wrong language and voice
This commit is contained in:
@@ -200,11 +200,7 @@ export const AUDIO_FORMAT_OPTIONS = [
|
||||
|
||||
export const DEFAULT_ELEVENLABS_MODEL = "eleven_multilingual_v2";
|
||||
|
||||
export const ELEVENLABS_MODEL_OPTIONS = [
|
||||
{ name: "Multilingual v2", value: "eleven_multilingual_v2" },
|
||||
{ name: "Multilingual v1", value: "eleven_multilingual_v1" },
|
||||
{ name: "English v1", value: "eleven_monolingual_v1" },
|
||||
];
|
||||
export const DEFAULT_WHISPER_MODEL = "tts-1";
|
||||
|
||||
// Google Custom Voice reported usage options
|
||||
|
||||
|
||||
@@ -416,6 +416,7 @@ export interface SpeechCredential {
|
||||
label: null | string;
|
||||
cobalt_server_uri: null | string;
|
||||
model_id: null | string;
|
||||
model: null | string;
|
||||
}
|
||||
|
||||
export interface Alert {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import React, { useEffect, useState } from "react";
|
||||
import React, { useEffect, useRef, useState } from "react";
|
||||
import {
|
||||
getGoogleCustomVoices,
|
||||
postSpeechServiceLanguages,
|
||||
@@ -24,6 +24,7 @@ import {
|
||||
VENDOR_MICROSOFT,
|
||||
VENDOR_SONIOX,
|
||||
VENDOR_WELLSAID,
|
||||
VENDOR_WHISPER,
|
||||
} from "src/vendor";
|
||||
import {
|
||||
LabelOptions,
|
||||
@@ -89,14 +90,18 @@ export const SpeechProviderSelection = ({
|
||||
|
||||
const currentServiceProvider = useSelectState("currentServiceProvider");
|
||||
|
||||
const currentVendor = useRef(synthVendor);
|
||||
|
||||
useEffect(() => {
|
||||
currentVendor.current = synthVendor;
|
||||
if (!synthesis) {
|
||||
return;
|
||||
}
|
||||
let options = synthesis[synthVendor as keyof SynthesisVendors]
|
||||
const voiceOpts = synthesis[synthVendor as keyof SynthesisVendors]
|
||||
.filter((lang: VoiceLanguage) => {
|
||||
// ElevenLabs has the same voices for all languages, so take the voices from the first language
|
||||
if (synthVendor === VENDOR_ELEVENLABS) {
|
||||
// Only the first language has voices; the rest have empty voice lists
|
||||
if (synthVendor === VENDOR_ELEVENLABS && lang.voices.length > 0) {
|
||||
return true;
|
||||
}
|
||||
return lang.code === synthLang;
|
||||
@@ -107,15 +112,15 @@ export const SpeechProviderSelection = ({
|
||||
value: voice.value,
|
||||
}))
|
||||
) as Voice[];
|
||||
setSynthesisVoiceOptions(options);
|
||||
setSynthesisVoiceOptions(voiceOpts);
|
||||
|
||||
options = synthesis[synthVendor as keyof SynthesisVendors].map(
|
||||
const langOpts = synthesis[synthVendor as keyof SynthesisVendors].map(
|
||||
(lang: VoiceLanguage) => ({
|
||||
name: lang.name,
|
||||
value: lang.code,
|
||||
})
|
||||
);
|
||||
setSynthesisLanguageOptions(options);
|
||||
setSynthesisLanguageOptions(langOpts);
|
||||
|
||||
if (synthVendor === VENDOR_ELEVENLABS) {
|
||||
postSpeechServiceVoices(
|
||||
@@ -127,6 +132,10 @@ export const SpeechProviderSelection = ({
|
||||
label: synthLabel,
|
||||
}
|
||||
).then(({ json }) => {
|
||||
// Once the data has been fetched, apply it only if the selected vendor is still ElevenLabs
|
||||
if (currentVendor.current !== VENDOR_ELEVENLABS) {
|
||||
return;
|
||||
}
|
||||
if (json.length > 0) {
|
||||
setSynthesisVoiceOptions(json);
|
||||
}
|
||||
@@ -151,11 +160,15 @@ export const SpeechProviderSelection = ({
|
||||
account_sid: accountSid,
|
||||
service_provider_sid: serviceProviderSid,
|
||||
}).then(({ json }) => {
|
||||
// Once the data has been fetched, apply it only if the selected vendor is still Google
|
||||
if (currentVendor.current !== VENDOR_GOOGLE) {
|
||||
return;
|
||||
}
|
||||
const customVOices = json.map((v) => ({
|
||||
name: `${v.name} (Custom)`,
|
||||
value: `custom_${v.google_custom_voice_sid}`,
|
||||
}));
|
||||
options = synthesis[synthVendor as keyof SynthesisVendors]
|
||||
const options = synthesis[synthVendor as keyof SynthesisVendors]
|
||||
.filter((lang: VoiceLanguage) => {
|
||||
return lang.code === synthLang;
|
||||
})
|
||||
@@ -228,6 +241,15 @@ export const SpeechProviderSelection = ({
|
||||
return;
|
||||
}
|
||||
|
||||
if (vendor === VENDOR_WHISPER) {
|
||||
const newLang = synthesis[vendor].find(
|
||||
(lang) => lang.code === LANG_EN_US
|
||||
);
|
||||
setSynthLang(LANG_EN_US);
|
||||
setSynthVoice(newLang!.voices[0].value);
|
||||
return;
|
||||
}
|
||||
|
||||
/** Google and AWS have different language lists */
|
||||
/** If the new language doesn't map then default to "en-US" */
|
||||
let newLang = synthesis[vendor].find(
|
||||
@@ -359,6 +381,7 @@ export const SpeechProviderSelection = ({
|
||||
(vendor) =>
|
||||
vendor.value != VENDOR_WELLSAID &&
|
||||
vendor.value != VENDOR_ELEVENLABS &&
|
||||
vendor.value != VENDOR_WHISPER &&
|
||||
vendor.value !== VENDOR_CUSTOM
|
||||
)}
|
||||
onChange={(e) => {
|
||||
|
||||
@@ -38,6 +38,8 @@ import {
|
||||
VENDOR_COBALT,
|
||||
VENDOR_ELEVENLABS,
|
||||
VENDOR_ASSEMBLYAI,
|
||||
VENDOR_WHISPER,
|
||||
useTtsModels,
|
||||
} from "src/vendor";
|
||||
import { MSG_REQUIRED_FIELDS } from "src/constants";
|
||||
import {
|
||||
@@ -50,7 +52,12 @@ import {
|
||||
import { getObscuredGoogleServiceKey } from "./utils";
|
||||
import { CredentialStatus } from "./status";
|
||||
|
||||
import type { RegionVendors, GoogleServiceKey, Vendor } from "src/vendor/types";
|
||||
import type {
|
||||
RegionVendors,
|
||||
GoogleServiceKey,
|
||||
Vendor,
|
||||
TtsModels,
|
||||
} from "src/vendor/types";
|
||||
import type {
|
||||
Account,
|
||||
GoogleCustomVoice,
|
||||
@@ -59,10 +66,8 @@ import type {
|
||||
} from "src/api/types";
|
||||
import { setAccountFilter, setLocation } from "src/store/localStore";
|
||||
import {
|
||||
DEFAULT_ELEVENLABS_MODEL,
|
||||
DEFAULT_GOOGLE_CUSTOM_VOICES_REPORTED_USAGE,
|
||||
DISABLE_CUSTOM_SPEECH,
|
||||
ELEVENLABS_MODEL_OPTIONS,
|
||||
GOOGLE_CUSTOM_VOICES_REPORTED_USAGE,
|
||||
} from "src/api/constants";
|
||||
|
||||
@@ -96,7 +101,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
|
||||
const [sttApiKey, setSttApiKey] = useState("");
|
||||
const [ttsRegion, setTtsRegion] = useState("");
|
||||
const [ttsApiKey, setTtsApiKey] = useState("");
|
||||
const [ttsModelId, setTtsModelId] = useState(DEFAULT_ELEVENLABS_MODEL);
|
||||
const [ttsModelId, setTtsModelId] = useState("");
|
||||
const [instanceId, setInstanceId] = useState("");
|
||||
const [initialCheckCustomTts, setInitialCheckCustomTts] = useState(false);
|
||||
const [initialCheckCustomStt, setInitialCheckCustomStt] = useState(false);
|
||||
@@ -134,6 +139,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
|
||||
const [useCustomVoicesCheck, setUseCustomVoicesCheck] = useState(false);
|
||||
const [customVoices, setCustomVoices] = useState<GoogleCustomVoice[]>([]);
|
||||
const [customVoicesMessage, setCustomVoicesMessage] = useState("");
|
||||
const ttsModels = useTtsModels();
|
||||
|
||||
const handleFile = (file: File) => {
|
||||
const handleError = () => {
|
||||
@@ -273,7 +279,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
|
||||
...(vendor === VENDOR_COBALT && {
|
||||
cobalt_server_uri: cobaltServerUri || null,
|
||||
}),
|
||||
...(vendor === VENDOR_ELEVENLABS && {
|
||||
...((vendor === VENDOR_ELEVENLABS || vendor === VENDOR_WHISPER) && {
|
||||
model_id: ttsModelId || null,
|
||||
}),
|
||||
};
|
||||
@@ -316,7 +322,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
|
||||
vendor === VENDOR_DEEPGRAM ||
|
||||
vendor === VENDOR_ASSEMBLYAI ||
|
||||
vendor === VENDOR_SONIOX ||
|
||||
vendor === VENDOR_ELEVENLABS
|
||||
vendor === VENDOR_ELEVENLABS ||
|
||||
vendor === VENDOR_WHISPER
|
||||
? apiKey
|
||||
: null,
|
||||
}),
|
||||
@@ -560,6 +567,15 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
|
||||
setRegion("");
|
||||
setApiKey("");
|
||||
setGoogleServiceKey(null);
|
||||
if (
|
||||
ttsModels &&
|
||||
(e.target.value === VENDOR_ELEVENLABS ||
|
||||
e.target.value === VENDOR_WHISPER)
|
||||
) {
|
||||
setTtsModelId(
|
||||
ttsModels[e.target.value as keyof TtsModels][0].value
|
||||
);
|
||||
}
|
||||
}}
|
||||
disabled={credential ? true : false}
|
||||
required
|
||||
@@ -627,6 +643,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
|
||||
)}
|
||||
{vendor !== VENDOR_WELLSAID &&
|
||||
vendor !== VENDOR_CUSTOM &&
|
||||
vendor !== VENDOR_WHISPER &&
|
||||
vendor !== VENDOR_ELEVENLABS && (
|
||||
<label htmlFor="use_for_stt" className="chk">
|
||||
<input
|
||||
@@ -1072,6 +1089,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
|
||||
vendor === VENDOR_DEEPGRAM ||
|
||||
vendor === VENDOR_ASSEMBLYAI ||
|
||||
vendor == VENDOR_ELEVENLABS ||
|
||||
vendor === VENDOR_WHISPER ||
|
||||
vendor === VENDOR_SONIOX) && (
|
||||
<fieldset>
|
||||
<label htmlFor={`${vendor}_apikey`}>
|
||||
@@ -1088,20 +1106,21 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
|
||||
/>
|
||||
</fieldset>
|
||||
)}
|
||||
{vendor == VENDOR_ELEVENLABS && (
|
||||
<fieldset>
|
||||
<label htmlFor={`${vendor}_apikey`}>Model</label>
|
||||
<Selector
|
||||
id={"audio_format"}
|
||||
name={"audio_format"}
|
||||
value={ttsModelId}
|
||||
options={ELEVENLABS_MODEL_OPTIONS}
|
||||
onChange={(e) => {
|
||||
setTtsModelId(e.target.value);
|
||||
}}
|
||||
/>
|
||||
</fieldset>
|
||||
)}
|
||||
{(vendor == VENDOR_ELEVENLABS || vendor == VENDOR_WHISPER) &&
|
||||
ttsModels && (
|
||||
<fieldset>
|
||||
<label htmlFor={`${vendor}_tts_model_id`}>Model</label>
|
||||
<Selector
|
||||
id={"tts_model_id"}
|
||||
name={"tts_model_id"}
|
||||
value={ttsModelId}
|
||||
options={ttsModels[vendor as keyof TtsModels]}
|
||||
onChange={(e) => {
|
||||
setTtsModelId(e.target.value);
|
||||
}}
|
||||
/>
|
||||
</fieldset>
|
||||
)}
|
||||
{regions &&
|
||||
regions[vendor as keyof RegionVendors] &&
|
||||
vendor !== VENDOR_IBM &&
|
||||
|
||||
Vendored
+33
@@ -5,6 +5,7 @@ import type {
|
||||
SynthesisVendors,
|
||||
RecognizerVendors,
|
||||
RegionVendors,
|
||||
TtsModels,
|
||||
} from "./types";
|
||||
|
||||
export const LANG_EN_US = "en-US";
|
||||
@@ -24,6 +25,7 @@ export const VENDOR_CUSTOM = "custom";
|
||||
export const VENDOR_COBALT = "cobalt";
|
||||
export const VENDOR_ELEVENLABS = "elevenlabs";
|
||||
export const VENDOR_ASSEMBLYAI = "assemblyai";
|
||||
export const VENDOR_WHISPER = "whisper";
|
||||
|
||||
export const vendors: VendorOptions[] = [
|
||||
{
|
||||
@@ -78,8 +80,36 @@ export const vendors: VendorOptions[] = [
|
||||
name: "AssemblyAI",
|
||||
value: VENDOR_ASSEMBLYAI,
|
||||
},
|
||||
{
|
||||
name: "Whisper",
|
||||
value: VENDOR_WHISPER,
|
||||
},
|
||||
].sort((a, b) => a.name.localeCompare(b.name)) as VendorOptions[];
|
||||
|
||||
/**
 * React hook that lazily loads the TTS model lists for ElevenLabs and Whisper.
 *
 * Both model modules are pulled in through dynamic imports from an effect with
 * an empty dependency list, so the hook yields `undefined` until both imports
 * have resolved.
 *
 * @returns The loaded models keyed by vendor, or `undefined` while loading.
 */
export const useTtsModels = () => {
  const [models, setModels] = useState<TtsModels>();

  useEffect(() => {
    // Flipped by the cleanup below so imports that resolve afterwards are dropped.
    let cancelled = false;

    const loadModels = async () => {
      const [elevenlabsModule, whisperModule] = await Promise.all([
        import("./speech-synthsis-models/elevenlabs-models"),
        import("./speech-synthsis-models/whisper-models"),
      ]);

      if (cancelled) {
        return;
      }

      setModels({
        elevenlabs: elevenlabsModule.default,
        whisper: whisperModule.default,
      });
    };

    void loadModels();

    return () => {
      cancelled = true;
    };
  }, []);

  return models;
};
|
||||
|
||||
export const useRegionVendors = () => {
|
||||
const [regions, setRegions] = useState<RegionVendors>();
|
||||
|
||||
@@ -142,6 +172,7 @@ export const useSpeechVendors = () => {
|
||||
import("./speech-synthesis/ibm-speech-synthesis-lang"),
|
||||
import("./speech-synthesis/nvidia-speech-synthesis-lang"),
|
||||
import("./speech-synthesis/elevellabs-speech-synthesis-lang"),
|
||||
import("./speech-synthesis/whisper-speech-synthesis-lang"),
|
||||
]).then(
|
||||
([
|
||||
{ default: awsRecognizer },
|
||||
@@ -162,6 +193,7 @@ export const useSpeechVendors = () => {
|
||||
{ default: ibmSynthesis },
|
||||
{ default: nvidiaynthesis },
|
||||
{ default: elevenLabsSynthesis },
|
||||
{ default: whisperSynthesis },
|
||||
]) => {
|
||||
if (!ignore) {
|
||||
setSpeech({
|
||||
@@ -174,6 +206,7 @@ export const useSpeechVendors = () => {
|
||||
ibm: ibmSynthesis,
|
||||
nvidia: nvidiaynthesis,
|
||||
elevenlabs: elevenLabsSynthesis,
|
||||
whisper: whisperSynthesis,
|
||||
},
|
||||
recognizers: {
|
||||
aws: awsRecognizer,
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
import type { VoiceLanguage } from "../types";
|
||||
|
||||
/**
 * Whisper TTS voices as `[value, name]` pairs: `value` is the voice
 * identifier and `name` is its display label.
 */
const WHISPER_VOICES: ReadonlyArray<readonly [string, string]> = [
  ["alloy", "Alloy"],
  ["echo", "Echo"],
  ["fable", "Fable"],
  ["onyx", "Onyx"],
  ["nova", "Nova"],
  ["shimmer", "Shimmer"],
];

/**
 * Speech-synthesis languages for the Whisper vendor. A single "en-US" entry
 * is defined, and it carries every Whisper voice.
 */
export const languages: VoiceLanguage[] = [
  {
    code: "en-US",
    name: "English",
    voices: WHISPER_VOICES.map(([value, name]) => ({ value, name })),
  },
];

export default languages;
|
||||
@@ -0,0 +1,9 @@
|
||||
import type { Model } from "../types";
|
||||
|
||||
/** ElevenLabs TTS models as `[name, value]` pairs; list order is preserved. */
const ELEVENLABS_MODEL_TABLE: ReadonlyArray<readonly [string, string]> = [
  ["Multilingual v2", "eleven_multilingual_v2"],
  ["Multilingual v1", "eleven_multilingual_v1"],
  ["English v1", "eleven_monolingual_v1"],
];

/**
 * Models selectable for the ElevenLabs vendor: `name` is the display label and
 * `value` is the model identifier.
 */
export const models: Model[] = ELEVENLABS_MODEL_TABLE.map(([name, value]) => ({
  name,
  value,
}));

export default models;
|
||||
@@ -0,0 +1,8 @@
|
||||
import type { Model } from "../types";
|
||||
|
||||
/** Whisper TTS models as `[name, value]` pairs; list order is preserved. */
const WHISPER_MODEL_TABLE: ReadonlyArray<readonly [string, string]> = [
  ["TTS-1", "tts-1"],
  ["TTS-1-HD", "tts-1-hd"],
];

/**
 * Models selectable for the Whisper vendor: `name` is the display label and
 * `value` is the model identifier.
 */
export const models: Model[] = WHISPER_MODEL_TABLE.map(([name, value]) => ({
  name,
  value,
}));

export default models;
|
||||
Vendored
+13
-1
@@ -11,7 +11,8 @@ export type Vendor =
|
||||
| "Cobalt"
|
||||
| "Custom"
|
||||
| "ElevenLabs"
|
||||
| "assemblyai";
|
||||
| "assemblyai"
|
||||
| "whisper";
|
||||
|
||||
export interface VendorOptions {
|
||||
name: Vendor;
|
||||
@@ -28,6 +29,11 @@ export interface Region {
|
||||
value: string;
|
||||
}
|
||||
|
||||
/**
 * A selectable TTS model entry.
 */
export interface Model {
  /** Display label for the model, e.g. "TTS-1". */
  name: string;
  /** Model identifier, e.g. "tts-1". */
  value: string;
}
|
||||
|
||||
export interface Voice {
|
||||
name: string;
|
||||
value: string;
|
||||
@@ -64,6 +70,11 @@ export interface RegionVendors {
|
||||
ibm: Region[];
|
||||
}
|
||||
|
||||
/**
 * TTS model lists grouped by the vendor they belong to.
 */
export interface TtsModels {
  /** Models available for the ElevenLabs vendor. */
  elevenlabs: Model[];
  /** Models available for the Whisper vendor. */
  whisper: Model[];
}
|
||||
|
||||
export interface RecognizerVendors {
|
||||
aws: Language[];
|
||||
google: Language[];
|
||||
@@ -86,6 +97,7 @@ export interface SynthesisVendors {
|
||||
ibm: VoiceLanguage[];
|
||||
nvidia: VoiceLanguage[];
|
||||
elevenlabs: VoiceLanguage[];
|
||||
whisper: VoiceLanguage[];
|
||||
}
|
||||
|
||||
export interface MSRawSpeech {
|
||||
|
||||
Reference in New Issue
Block a user