Support whisper TTS (#346)

* support tts whisper

* support tts whisper

* wip

* wip

* fix wrong language and voice
This commit is contained in:
Hoan Luu Huu
2023-11-09 21:50:38 +07:00
committed by GitHub
parent adafff7ec3
commit 87b3ca7e94
9 changed files with 152 additions and 33 deletions
+1 -5
View File
@@ -200,11 +200,7 @@ export const AUDIO_FORMAT_OPTIONS = [
export const DEFAULT_ELEVENLABS_MODEL = "eleven_multilingual_v2";
export const ELEVENLABS_MODEL_OPTIONS = [
{ name: "Multilingual v2", value: "eleven_multilingual_v2" },
{ name: "Multilingual v1", value: "eleven_multilingual_v1" },
{ name: "English v1", value: "eleven_monolingual_v1" },
];
export const DEFAULT_WHISPER_MODEL = "tts-1";
// Google Custom Voice reported usage options
+1
View File
@@ -416,6 +416,7 @@ export interface SpeechCredential {
label: null | string;
cobalt_server_uri: null | string;
model_id: null | string;
model: null | string;
}
export interface Alert {
@@ -1,4 +1,4 @@
import React, { useEffect, useState } from "react";
import React, { useEffect, useRef, useState } from "react";
import {
getGoogleCustomVoices,
postSpeechServiceLanguages,
@@ -24,6 +24,7 @@ import {
VENDOR_MICROSOFT,
VENDOR_SONIOX,
VENDOR_WELLSAID,
VENDOR_WHISPER,
} from "src/vendor";
import {
LabelOptions,
@@ -89,14 +90,18 @@ export const SpeechProviderSelection = ({
const currentServiceProvider = useSelectState("currentServiceProvider");
const currentVendor = useRef(synthVendor);
useEffect(() => {
currentVendor.current = synthVendor;
if (!synthesis) {
return;
}
let options = synthesis[synthVendor as keyof SynthesisVendors]
const voiceOpts = synthesis[synthVendor as keyof SynthesisVendors]
.filter((lang: VoiceLanguage) => {
// ElevenLabs has the same voices for all languages, so take the voices from the 1st language
if (synthVendor === VENDOR_ELEVENLABS) {
// Only the first language has voices; the rest have empty voice lists
if (synthVendor === VENDOR_ELEVENLABS && lang.voices.length > 0) {
return true;
}
return lang.code === synthLang;
@@ -107,15 +112,15 @@ export const SpeechProviderSelection = ({
value: voice.value,
}))
) as Voice[];
setSynthesisVoiceOptions(options);
setSynthesisVoiceOptions(voiceOpts);
options = synthesis[synthVendor as keyof SynthesisVendors].map(
const langOpts = synthesis[synthVendor as keyof SynthesisVendors].map(
(lang: VoiceLanguage) => ({
name: lang.name,
value: lang.code,
})
);
setSynthesisLanguageOptions(options);
setSynthesisLanguageOptions(langOpts);
if (synthVendor === VENDOR_ELEVENLABS) {
postSpeechServiceVoices(
@@ -127,6 +132,10 @@ export const SpeechProviderSelection = ({
label: synthLabel,
}
).then(({ json }) => {
// Apply the fetched voices only if the vendor is still ElevenLabs once the request resolves
if (currentVendor.current !== VENDOR_ELEVENLABS) {
return;
}
if (json.length > 0) {
setSynthesisVoiceOptions(json);
}
@@ -151,11 +160,15 @@ export const SpeechProviderSelection = ({
account_sid: accountSid,
service_provider_sid: serviceProviderSid,
}).then(({ json }) => {
// Apply the fetched custom voices only if the vendor is still Google once the request resolves
if (currentVendor.current !== VENDOR_GOOGLE) {
return;
}
const customVOices = json.map((v) => ({
name: `${v.name} (Custom)`,
value: `custom_${v.google_custom_voice_sid}`,
}));
options = synthesis[synthVendor as keyof SynthesisVendors]
const options = synthesis[synthVendor as keyof SynthesisVendors]
.filter((lang: VoiceLanguage) => {
return lang.code === synthLang;
})
@@ -228,6 +241,15 @@ export const SpeechProviderSelection = ({
return;
}
if (vendor === VENDOR_WHISPER) {
const newLang = synthesis[vendor].find(
(lang) => lang.code === LANG_EN_US
);
setSynthLang(LANG_EN_US);
setSynthVoice(newLang!.voices[0].value);
return;
}
/** Google and AWS have different language lists */
/** If the new language doesn't map then default to "en-US" */
let newLang = synthesis[vendor].find(
@@ -359,6 +381,7 @@ export const SpeechProviderSelection = ({
(vendor) =>
vendor.value != VENDOR_WELLSAID &&
vendor.value != VENDOR_ELEVENLABS &&
vendor.value != VENDOR_WHISPER &&
vendor.value !== VENDOR_CUSTOM
)}
onChange={(e) => {
@@ -38,6 +38,8 @@ import {
VENDOR_COBALT,
VENDOR_ELEVENLABS,
VENDOR_ASSEMBLYAI,
VENDOR_WHISPER,
useTtsModels,
} from "src/vendor";
import { MSG_REQUIRED_FIELDS } from "src/constants";
import {
@@ -50,7 +52,12 @@ import {
import { getObscuredGoogleServiceKey } from "./utils";
import { CredentialStatus } from "./status";
import type { RegionVendors, GoogleServiceKey, Vendor } from "src/vendor/types";
import type {
RegionVendors,
GoogleServiceKey,
Vendor,
TtsModels,
} from "src/vendor/types";
import type {
Account,
GoogleCustomVoice,
@@ -59,10 +66,8 @@ import type {
} from "src/api/types";
import { setAccountFilter, setLocation } from "src/store/localStore";
import {
DEFAULT_ELEVENLABS_MODEL,
DEFAULT_GOOGLE_CUSTOM_VOICES_REPORTED_USAGE,
DISABLE_CUSTOM_SPEECH,
ELEVENLABS_MODEL_OPTIONS,
GOOGLE_CUSTOM_VOICES_REPORTED_USAGE,
} from "src/api/constants";
@@ -96,7 +101,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
const [sttApiKey, setSttApiKey] = useState("");
const [ttsRegion, setTtsRegion] = useState("");
const [ttsApiKey, setTtsApiKey] = useState("");
const [ttsModelId, setTtsModelId] = useState(DEFAULT_ELEVENLABS_MODEL);
const [ttsModelId, setTtsModelId] = useState("");
const [instanceId, setInstanceId] = useState("");
const [initialCheckCustomTts, setInitialCheckCustomTts] = useState(false);
const [initialCheckCustomStt, setInitialCheckCustomStt] = useState(false);
@@ -134,6 +139,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
const [useCustomVoicesCheck, setUseCustomVoicesCheck] = useState(false);
const [customVoices, setCustomVoices] = useState<GoogleCustomVoice[]>([]);
const [customVoicesMessage, setCustomVoicesMessage] = useState("");
const ttsModels = useTtsModels();
const handleFile = (file: File) => {
const handleError = () => {
@@ -273,7 +279,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
...(vendor === VENDOR_COBALT && {
cobalt_server_uri: cobaltServerUri || null,
}),
...(vendor === VENDOR_ELEVENLABS && {
...((vendor === VENDOR_ELEVENLABS || vendor === VENDOR_WHISPER) && {
model_id: ttsModelId || null,
}),
};
@@ -316,7 +322,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
vendor === VENDOR_DEEPGRAM ||
vendor === VENDOR_ASSEMBLYAI ||
vendor === VENDOR_SONIOX ||
vendor === VENDOR_ELEVENLABS
vendor === VENDOR_ELEVENLABS ||
vendor === VENDOR_WHISPER
? apiKey
: null,
}),
@@ -560,6 +567,15 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
setRegion("");
setApiKey("");
setGoogleServiceKey(null);
if (
ttsModels &&
(e.target.value === VENDOR_ELEVENLABS ||
e.target.value === VENDOR_WHISPER)
) {
setTtsModelId(
ttsModels[e.target.value as keyof TtsModels][0].value
);
}
}}
disabled={credential ? true : false}
required
@@ -627,6 +643,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
)}
{vendor !== VENDOR_WELLSAID &&
vendor !== VENDOR_CUSTOM &&
vendor !== VENDOR_WHISPER &&
vendor !== VENDOR_ELEVENLABS && (
<label htmlFor="use_for_stt" className="chk">
<input
@@ -1072,6 +1089,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
vendor === VENDOR_DEEPGRAM ||
vendor === VENDOR_ASSEMBLYAI ||
vendor == VENDOR_ELEVENLABS ||
vendor === VENDOR_WHISPER ||
vendor === VENDOR_SONIOX) && (
<fieldset>
<label htmlFor={`${vendor}_apikey`}>
@@ -1088,20 +1106,21 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
/>
</fieldset>
)}
{vendor == VENDOR_ELEVENLABS && (
<fieldset>
<label htmlFor={`${vendor}_apikey`}>Model</label>
<Selector
id={"audio_format"}
name={"audio_format"}
value={ttsModelId}
options={ELEVENLABS_MODEL_OPTIONS}
onChange={(e) => {
setTtsModelId(e.target.value);
}}
/>
</fieldset>
)}
{(vendor == VENDOR_ELEVENLABS || vendor == VENDOR_WHISPER) &&
ttsModels && (
<fieldset>
<label htmlFor={`${vendor}_tts_model_id`}>Model</label>
<Selector
id={"tts_model_id"}
name={"tts_model_id"}
value={ttsModelId}
options={ttsModels[vendor as keyof TtsModels]}
onChange={(e) => {
setTtsModelId(e.target.value);
}}
/>
</fieldset>
)}
{regions &&
regions[vendor as keyof RegionVendors] &&
vendor !== VENDOR_IBM &&
+33
View File
@@ -5,6 +5,7 @@ import type {
SynthesisVendors,
RecognizerVendors,
RegionVendors,
TtsModels,
} from "./types";
export const LANG_EN_US = "en-US";
@@ -24,6 +25,7 @@ export const VENDOR_CUSTOM = "custom";
export const VENDOR_COBALT = "cobalt";
export const VENDOR_ELEVENLABS = "elevenlabs";
export const VENDOR_ASSEMBLYAI = "assemblyai";
export const VENDOR_WHISPER = "whisper";
export const vendors: VendorOptions[] = [
{
@@ -78,8 +80,36 @@ export const vendors: VendorOptions[] = [
name: "AssemblyAI",
value: VENDOR_ASSEMBLYAI,
},
{
name: "Whisper",
value: VENDOR_WHISPER,
},
].sort((a, b) => a.name.localeCompare(b.name)) as VendorOptions[];
/**
 * React hook that lazily loads the TTS model lists for the vendors that
 * expose a selectable model (ElevenLabs and Whisper).
 *
 * The lists are pulled in with dynamic imports when the component mounts.
 *
 * @returns The loaded {@link TtsModels}, or `undefined` while the imports
 * are still pending.
 */
export const useTtsModels = () => {
  const [models, setModels] = useState<TtsModels>();

  useEffect(() => {
    // Set by the effect cleanup so a late-resolving import does not update state.
    let cancelled = false;

    const loadModels = async () => {
      const [elevenlabsModule, whisperModule] = await Promise.all([
        import("./speech-synthsis-models/elevenlabs-models"),
        import("./speech-synthsis-models/whisper-models"),
      ]);

      if (cancelled) {
        return;
      }

      setModels({
        elevenlabs: elevenlabsModule.default,
        whisper: whisperModule.default,
      });
    };

    loadModels();

    return () => {
      cancelled = true;
    };
  }, []);

  return models;
};
export const useRegionVendors = () => {
const [regions, setRegions] = useState<RegionVendors>();
@@ -142,6 +172,7 @@ export const useSpeechVendors = () => {
import("./speech-synthesis/ibm-speech-synthesis-lang"),
import("./speech-synthesis/nvidia-speech-synthesis-lang"),
import("./speech-synthesis/elevellabs-speech-synthesis-lang"),
import("./speech-synthesis/whisper-speech-synthesis-lang"),
]).then(
([
{ default: awsRecognizer },
@@ -162,6 +193,7 @@ export const useSpeechVendors = () => {
{ default: ibmSynthesis },
{ default: nvidiaynthesis },
{ default: elevenLabsSynthesis },
{ default: whisperSynthesis },
]) => {
if (!ignore) {
setSpeech({
@@ -174,6 +206,7 @@ export const useSpeechVendors = () => {
ibm: ibmSynthesis,
nvidia: nvidiaynthesis,
elevenlabs: elevenLabsSynthesis,
whisper: whisperSynthesis,
},
recognizers: {
aws: awsRecognizer,
@@ -0,0 +1,18 @@
import type { VoiceLanguage } from "../types";
/**
 * Speech synthesis languages and voices offered for the Whisper vendor.
 *
 * There is a single "en-US" entry; each voice's `value` is the lower-cased
 * form of its display name.
 */
export const languages: VoiceLanguage[] = [
  {
    code: "en-US",
    name: "English",
    voices: ["Alloy", "Echo", "Fable", "Onyx", "Nova", "Shimmer"].map(
      (label) => ({
        value: label.toLowerCase(),
        name: label,
      })
    ),
  },
];
export default languages;
@@ -0,0 +1,9 @@
import type { Model } from "../types";
/**
 * Selectable ElevenLabs TTS models, in display order.
 *
 * Built from a label-to-model-id map; `name` is the label shown to the user
 * and `value` is the model id.
 */
export const models: Model[] = Object.entries({
  "Multilingual v2": "eleven_multilingual_v2",
  "Multilingual v1": "eleven_multilingual_v1",
  "English v1": "eleven_monolingual_v1",
}).map(([name, value]) => ({ name, value }));
export default models;
+8
View File
@@ -0,0 +1,8 @@
import type { Model } from "../types";
/**
 * Selectable Whisper TTS models, in display order.
 *
 * Each model's `value` is the lower-cased form of its display name.
 */
export const models: Model[] = ["TTS-1", "TTS-1-HD"].map((name) => ({
  name,
  value: name.toLowerCase(),
}));
export default models;
+13 -1
View File
@@ -11,7 +11,8 @@ export type Vendor =
| "Cobalt"
| "Custom"
| "ElevenLabs"
| "assemblyai";
| "assemblyai"
| "whisper";
export interface VendorOptions {
name: Vendor;
@@ -28,6 +29,11 @@ export interface Region {
value: string;
}
/** A single selectable TTS model option (label plus identifier). */
export interface Model {
/** Human-readable label for the model. */
name: string;
/** Model identifier used as the option's value. */
value: string;
}
export interface Voice {
name: string;
value: string;
@@ -64,6 +70,11 @@ export interface RegionVendors {
ibm: Region[];
}
/** TTS model lists keyed by vendor, for the vendors that offer a model choice. */
export interface TtsModels {
/** Models available for the ElevenLabs vendor. */
elevenlabs: Model[];
/** Models available for the Whisper vendor. */
whisper: Model[];
}
export interface RecognizerVendors {
aws: Language[];
google: Language[];
@@ -86,6 +97,7 @@ export interface SynthesisVendors {
ibm: VoiceLanguage[];
nvidia: VoiceLanguage[];
elevenlabs: VoiceLanguage[];
whisper: VoiceLanguage[];
}
export interface MSRawSpeech {