diff --git a/src/api/constants.ts b/src/api/constants.ts index a5086d0..3061e7a 100644 --- a/src/api/constants.ts +++ b/src/api/constants.ts @@ -200,11 +200,7 @@ export const AUDIO_FORMAT_OPTIONS = [ export const DEFAULT_ELEVENLABS_MODEL = "eleven_multilingual_v2"; -export const ELEVENLABS_MODEL_OPTIONS = [ - { name: "Multilingual v2", value: "eleven_multilingual_v2" }, - { name: "Multilingual v1", value: "eleven_multilingual_v1" }, - { name: "English v1", value: "eleven_monolingual_v1" }, -]; +export const DEFAULT_WHISPER_MODEL = "tts-1"; // Google Custom Voice reported usage options diff --git a/src/api/types.ts b/src/api/types.ts index d17ca17..0518482 100644 --- a/src/api/types.ts +++ b/src/api/types.ts @@ -416,6 +416,7 @@ export interface SpeechCredential { label: null | string; cobalt_server_uri: null | string; model_id: null | string; + model: null | string; } export interface Alert { diff --git a/src/containers/internal/views/applications/speech-selection.tsx b/src/containers/internal/views/applications/speech-selection.tsx index 0e7b0df..ca0bfd5 100644 --- a/src/containers/internal/views/applications/speech-selection.tsx +++ b/src/containers/internal/views/applications/speech-selection.tsx @@ -1,4 +1,4 @@ -import React, { useEffect, useState } from "react"; +import React, { useEffect, useRef, useState } from "react"; import { getGoogleCustomVoices, postSpeechServiceLanguages, @@ -24,6 +24,7 @@ import { VENDOR_MICROSOFT, VENDOR_SONIOX, VENDOR_WELLSAID, + VENDOR_WHISPER, } from "src/vendor"; import { LabelOptions, @@ -89,14 +90,18 @@ export const SpeechProviderSelection = ({ const currentServiceProvider = useSelectState("currentServiceProvider"); + const currentVendor = useRef(synthVendor); + useEffect(() => { + currentVendor.current = synthVendor; if (!synthesis) { return; } - let options = synthesis[synthVendor as keyof SynthesisVendors] + const voiceOpts = synthesis[synthVendor as keyof SynthesisVendors] .filter((lang: VoiceLanguage) => { // ELEVENLABS has same voice for all lange, take voices from the 1st language - if (synthVendor === VENDOR_ELEVENLABS) { + // Only first language has voices, the rest has empty voices + if (synthVendor === VENDOR_ELEVENLABS && lang.voices.length > 0) { return true; } return lang.code === synthLang; @@ -107,15 +112,15 @@ export const SpeechProviderSelection = ({ value: voice.value, })) ) as Voice[]; - setSynthesisVoiceOptions(options); + setSynthesisVoiceOptions(voiceOpts); - options = synthesis[synthVendor as keyof SynthesisVendors].map( + const langOpts = synthesis[synthVendor as keyof SynthesisVendors].map( (lang: VoiceLanguage) => ({ name: lang.name, value: lang.code, }) ); - setSynthesisLanguageOptions(options); + setSynthesisLanguageOptions(langOpts); if (synthVendor === VENDOR_ELEVENLABS) { postSpeechServiceVoices( @@ -127,6 +132,10 @@ export const SpeechProviderSelection = ({ label: synthLabel, } ).then(({ json }) => { + // If after successfully fetching data, vendor is still good, then apply value + if (currentVendor.current !== VENDOR_ELEVENLABS) { + return; + } if (json.length > 0) { setSynthesisVoiceOptions(json); } @@ -151,11 +160,15 @@ export const SpeechProviderSelection = ({ account_sid: accountSid, service_provider_sid: serviceProviderSid, }).then(({ json }) => { + // If after successfully fetching data, vendor is still good, then apply value + if (currentVendor.current !== VENDOR_GOOGLE) { + return; + } const customVOices = json.map((v) => ({ name: `${v.name} (Custom)`, value: `custom_${v.google_custom_voice_sid}`, })); - options = synthesis[synthVendor as keyof SynthesisVendors] + const options = synthesis[synthVendor as keyof SynthesisVendors] .filter((lang: VoiceLanguage) => { return lang.code === synthLang; }) @@ -228,6 +241,15 @@ export const SpeechProviderSelection = ({ return; } + if (vendor === VENDOR_WHISPER) { + const newLang = synthesis[vendor].find( + (lang) => lang.code === LANG_EN_US + ); + setSynthLang(LANG_EN_US); + setSynthVoice(newLang!.voices[0].value); + return; + } + /** Google and AWS have different language lists */ /** If the new language doesn't map then default to "en-US" */ let newLang = synthesis[vendor].find( @@ -359,6 +381,7 @@ export const SpeechProviderSelection = ({ (vendor) => vendor.value != VENDOR_WELLSAID && vendor.value != VENDOR_ELEVENLABS && + vendor.value != VENDOR_WHISPER && vendor.value !== VENDOR_CUSTOM )} onChange={(e) => { diff --git a/src/containers/internal/views/speech-services/form.tsx b/src/containers/internal/views/speech-services/form.tsx index d673177..24845b6 100644 --- a/src/containers/internal/views/speech-services/form.tsx +++ b/src/containers/internal/views/speech-services/form.tsx @@ -38,6 +38,8 @@ import { VENDOR_COBALT, VENDOR_ELEVENLABS, VENDOR_ASSEMBLYAI, + VENDOR_WHISPER, + useTtsModels, } from "src/vendor"; import { MSG_REQUIRED_FIELDS } from "src/constants"; import { @@ -50,7 +52,12 @@ import { import { getObscuredGoogleServiceKey } from "./utils"; import { CredentialStatus } from "./status"; -import type { RegionVendors, GoogleServiceKey, Vendor } from "src/vendor/types"; +import type { + RegionVendors, + GoogleServiceKey, + Vendor, + TtsModels, +} from "src/vendor/types"; import type { Account, GoogleCustomVoice, @@ -59,10 +66,8 @@ import type { } from "src/api/types"; import { setAccountFilter, setLocation } from "src/store/localStore"; import { - DEFAULT_ELEVENLABS_MODEL, DEFAULT_GOOGLE_CUSTOM_VOICES_REPORTED_USAGE, DISABLE_CUSTOM_SPEECH, - ELEVENLABS_MODEL_OPTIONS, GOOGLE_CUSTOM_VOICES_REPORTED_USAGE, } from "src/api/constants"; @@ -96,7 +101,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => { const [sttApiKey, setSttApiKey] = useState(""); const [ttsRegion, setTtsRegion] = useState(""); const [ttsApiKey, setTtsApiKey] = useState(""); - const [ttsModelId, setTtsModelId] = useState(DEFAULT_ELEVENLABS_MODEL); + const [ttsModelId, setTtsModelId] = useState(""); const [instanceId, setInstanceId] = useState(""); const [initialCheckCustomTts, setInitialCheckCustomTts] = useState(false); const [initialCheckCustomStt, setInitialCheckCustomStt] = useState(false); @@ -134,6 +139,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => { const [useCustomVoicesCheck, setUseCustomVoicesCheck] = useState(false); const [customVoices, setCustomVoices] = useState([]); const [customVoicesMessage, setCustomVoicesMessage] = useState(""); + const ttsModels = useTtsModels(); const handleFile = (file: File) => { const handleError = () => { @@ -273,7 +279,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => { ...(vendor === VENDOR_COBALT && { cobalt_server_uri: cobaltServerUri || null, }), - ...(vendor === VENDOR_ELEVENLABS && { + ...((vendor === VENDOR_ELEVENLABS || vendor === VENDOR_WHISPER) && { model_id: ttsModelId || null, }), }; @@ -316,7 +322,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => { vendor === VENDOR_DEEPGRAM || vendor === VENDOR_ASSEMBLYAI || vendor === VENDOR_SONIOX || - vendor === VENDOR_ELEVENLABS + vendor === VENDOR_ELEVENLABS || + vendor === VENDOR_WHISPER ? apiKey : null, }), @@ -560,6 +567,15 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => { setRegion(""); setApiKey(""); setGoogleServiceKey(null); + if ( + ttsModels && + (e.target.value === VENDOR_ELEVENLABS || + e.target.value === VENDOR_WHISPER) + ) { + setTtsModelId( + ttsModels[e.target.value as keyof TtsModels][0].value + ); + } }} disabled={credential ? true : false} required @@ -627,6 +643,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => { )} {vendor !== VENDOR_WELLSAID && vendor !== VENDOR_CUSTOM && + vendor !== VENDOR_WHISPER && vendor !== VENDOR_ELEVENLABS && (