Support whisper TTS (#346)

* support tts whisper * support tts whisper * wip * wip * fix wrong language and voice
2026-07-04 19:21:58 +00:00 · 2023-11-09 21:50:38 +07:00
parent adafff7ec3
commit 87b3ca7e94
9 changed files with 152 additions and 33 deletions
@@ -200,11 +200,7 @@ export const AUDIO_FORMAT_OPTIONS = [

 export const DEFAULT_ELEVENLABS_MODEL = "eleven_multilingual_v2";

-export const ELEVENLABS_MODEL_OPTIONS = [
-  { name: "Multilingual v2", value: "eleven_multilingual_v2" },
-  { name: "Multilingual v1", value: "eleven_multilingual_v1" },
-  { name: "English v1", value: "eleven_monolingual_v1" },
-];
+export const DEFAULT_WHISPER_MODEL = "tts-1"; // Default Whisper TTS model id. NOTE(review): no use of this constant is visible in this change — confirm it is referenced.

 // Google Custom Voice reported usage options

@@ -416,6 +416,7 @@ export interface SpeechCredential {
  label: null | string;
  cobalt_server_uri: null | string;
  model_id: null | string;
+  model: null | string;
 }

 export interface Alert {
@@ -1,4 +1,4 @@
-import React, { useEffect, useState } from "react";
+import React, { useEffect, useRef, useState } from "react";
 import {
  getGoogleCustomVoices,
  postSpeechServiceLanguages,
@@ -24,6 +24,7 @@ import {
  VENDOR_MICROSOFT,
  VENDOR_SONIOX,
  VENDOR_WELLSAID,
+  VENDOR_WHISPER,
 } from "src/vendor";
 import {
  LabelOptions,
@@ -89,14 +90,18 @@ export const SpeechProviderSelection = ({

  const currentServiceProvider = useSelectState("currentServiceProvider");

+  const currentVendor = useRef(synthVendor);
+
  useEffect(() => {
+    currentVendor.current = synthVendor;
    if (!synthesis) {
      return;
    }
-    let options = synthesis[synthVendor as keyof SynthesisVendors]
+    const voiceOpts = synthesis[synthVendor as keyof SynthesisVendors]
      .filter((lang: VoiceLanguage) => {
        // ELEVENLABS has same voice for all lange, take voices from the 1st language
-        if (synthVendor === VENDOR_ELEVENLABS) {
+        // Only the first language carries voices; the remaining languages have empty voice lists
+        if (synthVendor === VENDOR_ELEVENLABS && lang.voices.length > 0) {
          return true;
        }
        return lang.code === synthLang;
@@ -107,15 +112,15 @@ export const SpeechProviderSelection = ({
          value: voice.value,
        }))
      ) as Voice[];
-    setSynthesisVoiceOptions(options);
+    setSynthesisVoiceOptions(voiceOpts);

-    options = synthesis[synthVendor as keyof SynthesisVendors].map(
+    const langOpts = synthesis[synthVendor as keyof SynthesisVendors].map(
      (lang: VoiceLanguage) => ({
        name: lang.name,
        value: lang.code,
      })
    );
-    setSynthesisLanguageOptions(options);
+    setSynthesisLanguageOptions(langOpts);

    if (synthVendor === VENDOR_ELEVENLABS) {
      postSpeechServiceVoices(
@@ -127,6 +132,10 @@ export const SpeechProviderSelection = ({
          label: synthLabel,
        }
      ).then(({ json }) => {
+        // Ignore this response if the vendor changed away from ElevenLabs while the request was in flight
+        if (currentVendor.current !== VENDOR_ELEVENLABS) {
+          return;
+        }
        if (json.length > 0) {
          setSynthesisVoiceOptions(json);
        }
@@ -151,11 +160,15 @@ export const SpeechProviderSelection = ({
        account_sid: accountSid,
        service_provider_sid: serviceProviderSid,
      }).then(({ json }) => {
+        // Ignore this response if the vendor changed away from Google while the request was in flight
+        if (currentVendor.current !== VENDOR_GOOGLE) {
+          return;
+        }
        const customVOices = json.map((v) => ({
          name: `${v.name} (Custom)`,
          value: `custom_${v.google_custom_voice_sid}`,
        }));
-        options = synthesis[synthVendor as keyof SynthesisVendors]
+        const options = synthesis[synthVendor as keyof SynthesisVendors]
          .filter((lang: VoiceLanguage) => {
            return lang.code === synthLang;
          })
@@ -228,6 +241,15 @@ export const SpeechProviderSelection = ({
                return;
              }

+              if (vendor === VENDOR_WHISPER) {
+                const newLang = synthesis[vendor].find(
+                  (lang) => lang.code === LANG_EN_US
+                );
+                setSynthLang(LANG_EN_US);
+                setSynthVoice(newLang!.voices[0].value);
+                return;
+              }
+
              /** Google and AWS have different language lists */
              /** If the new language doesn't map then default to "en-US" */
              let newLang = synthesis[vendor].find(
@@ -359,6 +381,7 @@ export const SpeechProviderSelection = ({
              (vendor) =>
                vendor.value != VENDOR_WELLSAID &&
                vendor.value != VENDOR_ELEVENLABS &&
+                vendor.value != VENDOR_WHISPER &&
                vendor.value !== VENDOR_CUSTOM
            )}
            onChange={(e) => {
@@ -38,6 +38,8 @@ import {
  VENDOR_COBALT,
  VENDOR_ELEVENLABS,
  VENDOR_ASSEMBLYAI,
+  VENDOR_WHISPER,
+  useTtsModels,
 } from "src/vendor";
 import { MSG_REQUIRED_FIELDS } from "src/constants";
 import {
@@ -50,7 +52,12 @@ import {
 import { getObscuredGoogleServiceKey } from "./utils";
 import { CredentialStatus } from "./status";

-import type { RegionVendors, GoogleServiceKey, Vendor } from "src/vendor/types";
+import type {
+  RegionVendors,
+  GoogleServiceKey,
+  Vendor,
+  TtsModels,
+} from "src/vendor/types";
 import type {
  Account,
  GoogleCustomVoice,
@@ -59,10 +66,8 @@ import type {
 } from "src/api/types";
 import { setAccountFilter, setLocation } from "src/store/localStore";
 import {
-  DEFAULT_ELEVENLABS_MODEL,
  DEFAULT_GOOGLE_CUSTOM_VOICES_REPORTED_USAGE,
  DISABLE_CUSTOM_SPEECH,
-  ELEVENLABS_MODEL_OPTIONS,
  GOOGLE_CUSTOM_VOICES_REPORTED_USAGE,
 } from "src/api/constants";

@@ -96,7 +101,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
  const [sttApiKey, setSttApiKey] = useState("");
  const [ttsRegion, setTtsRegion] = useState("");
  const [ttsApiKey, setTtsApiKey] = useState("");
-  const [ttsModelId, setTtsModelId] = useState(DEFAULT_ELEVENLABS_MODEL);
+  const [ttsModelId, setTtsModelId] = useState("");
  const [instanceId, setInstanceId] = useState("");
  const [initialCheckCustomTts, setInitialCheckCustomTts] = useState(false);
  const [initialCheckCustomStt, setInitialCheckCustomStt] = useState(false);
@@ -134,6 +139,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
  const [useCustomVoicesCheck, setUseCustomVoicesCheck] = useState(false);
  const [customVoices, setCustomVoices] = useState<GoogleCustomVoice[]>([]);
  const [customVoicesMessage, setCustomVoicesMessage] = useState("");
+  const ttsModels = useTtsModels();

  const handleFile = (file: File) => {
    const handleError = () => {
@@ -273,7 +279,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
        ...(vendor === VENDOR_COBALT && {
          cobalt_server_uri: cobaltServerUri || null,
        }),
-        ...(vendor === VENDOR_ELEVENLABS && {
+        ...((vendor === VENDOR_ELEVENLABS || vendor === VENDOR_WHISPER) && {
          model_id: ttsModelId || null,
        }),
      };
@@ -316,7 +322,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
              vendor === VENDOR_DEEPGRAM ||
              vendor === VENDOR_ASSEMBLYAI ||
              vendor === VENDOR_SONIOX ||
-              vendor === VENDOR_ELEVENLABS
+              vendor === VENDOR_ELEVENLABS ||
+              vendor === VENDOR_WHISPER
                ? apiKey
                : null,
          }),
@@ -560,6 +567,15 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
              setRegion("");
              setApiKey("");
              setGoogleServiceKey(null);
+              if (
+                ttsModels &&
+                (e.target.value === VENDOR_ELEVENLABS ||
+                  e.target.value === VENDOR_WHISPER)
+              ) {
+                setTtsModelId(
+                  ttsModels[e.target.value as keyof TtsModels][0].value
+                );
+              }
            }}
            disabled={credential ? true : false}
            required
@@ -627,6 +643,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
              )}
            {vendor !== VENDOR_WELLSAID &&
              vendor !== VENDOR_CUSTOM &&
+              vendor !== VENDOR_WHISPER &&
              vendor !== VENDOR_ELEVENLABS && (
                <label htmlFor="use_for_stt" className="chk">
                  <input
@@ -1072,6 +1089,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
          vendor === VENDOR_DEEPGRAM ||
          vendor === VENDOR_ASSEMBLYAI ||
          vendor == VENDOR_ELEVENLABS ||
+          vendor === VENDOR_WHISPER ||
          vendor === VENDOR_SONIOX) && (
          <fieldset>
            <label htmlFor={`${vendor}_apikey`}>
@@ -1088,20 +1106,21 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
            />
          </fieldset>
        )}
-        {vendor == VENDOR_ELEVENLABS && (
-          <fieldset>
-            <label htmlFor={`${vendor}_apikey`}>Model</label>
-            <Selector
-              id={"audio_format"}
-              name={"audio_format"}
-              value={ttsModelId}
-              options={ELEVENLABS_MODEL_OPTIONS}
-              onChange={(e) => {
-                setTtsModelId(e.target.value);
-              }}
-            />
-          </fieldset>
-        )}
+        {(vendor === VENDOR_ELEVENLABS || vendor === VENDOR_WHISPER) &&
+          ttsModels && ( // ttsModels stays undefined until the lazy model import resolves
+            <fieldset>
+              <label htmlFor="tts_model_id">Model</label>
+              <Selector
+                id={"tts_model_id"} // must equal the label's htmlFor so the label targets this control
+                name={"tts_model_id"}
+                value={ttsModelId}
+                options={ttsModels[vendor as keyof TtsModels]}
+                onChange={(e) => {
+                  setTtsModelId(e.target.value);
+                }}
+              />
+            </fieldset>
+          )}
        {regions &&
          regions[vendor as keyof RegionVendors] &&
          vendor !== VENDOR_IBM &&
@@ -5,6 +5,7 @@ import type {
  SynthesisVendors,
  RecognizerVendors,
  RegionVendors,
+  TtsModels,
 } from "./types";

 export const LANG_EN_US = "en-US";
@@ -24,6 +25,7 @@ export const VENDOR_CUSTOM = "custom";
 export const VENDOR_COBALT = "cobalt";
 export const VENDOR_ELEVENLABS = "elevenlabs";
 export const VENDOR_ASSEMBLYAI = "assemblyai";
+export const VENDOR_WHISPER = "whisper";

 export const vendors: VendorOptions[] = [
  {
@@ -78,8 +80,36 @@ export const vendors: VendorOptions[] = [
    name: "AssemblyAI",
    value: VENDOR_ASSEMBLYAI,
  },
+  {
+    name: "Whisper",
+    value: VENDOR_WHISPER,
+  },
 ].sort((a, b) => a.name.localeCompare(b.name)) as VendorOptions[];

+export const useTtsModels = () => { // Hook: lazily loads the ElevenLabs and Whisper model lists; returns undefined until both resolve
+  const [models, setModels] = useState<TtsModels>(); // no initial value, so callers must handle undefined
+
+  useEffect(() => {
+    let ignore = false; // set by cleanup so a late-resolving import cannot call setModels
+    Promise.all([ // both model modules are requested together via dynamic import
+      import("./speech-synthsis-models/elevenlabs-models"),
+      import("./speech-synthsis-models/whisper-models"),
+    ]).then(([{ default: elevenlabs }, { default: whisper }]) => {
+      if (!ignore) {
+        setModels({
+          elevenlabs,
+          whisper,
+        });
+      }
+    }); // NOTE(review): no rejection handler — if either import fails, models stays undefined; confirm that is acceptable
+    return function cleanup() {
+      ignore = true;
+    };
+  }, []); // empty dependency list: the effect is not re-run on re-render
+
+  return models;
+};
+
 export const useRegionVendors = () => {
  const [regions, setRegions] = useState<RegionVendors>();

@@ -142,6 +172,7 @@ export const useSpeechVendors = () => {
      import("./speech-synthesis/ibm-speech-synthesis-lang"),
      import("./speech-synthesis/nvidia-speech-synthesis-lang"),
      import("./speech-synthesis/elevellabs-speech-synthesis-lang"),
+      import("./speech-synthesis/whisper-speech-synthesis-lang"),
    ]).then(
      ([
        { default: awsRecognizer },
@@ -162,6 +193,7 @@ export const useSpeechVendors = () => {
        { default: ibmSynthesis },
        { default: nvidiaynthesis },
        { default: elevenLabsSynthesis },
+        { default: whisperSynthesis },
      ]) => {
        if (!ignore) {
          setSpeech({
@@ -174,6 +206,7 @@ export const useSpeechVendors = () => {
              ibm: ibmSynthesis,
              nvidia: nvidiaynthesis,
              elevenlabs: elevenLabsSynthesis,
+              whisper: whisperSynthesis,
            },
            recognizers: {
              aws: awsRecognizer,
@@ -0,0 +1,18 @@
+import type { VoiceLanguage } from "../types";
+
+export const languages: VoiceLanguage[] = [ // Voice table with a single "en-US" entry; presumably the Whisper table imported by useSpeechVendors (file name not shown) — TODO confirm
+  {
+    code: "en-US",
+    name: "English",
+    voices: [ // order matters if this is the Whisper table: SpeechProviderSelection picks voices[0] of the "en-US" entry on vendor switch
+      { value: "alloy", name: "Alloy" },
+      { value: "echo", name: "Echo" },
+      { value: "fable", name: "Fable" },
+      { value: "onyx", name: "Onyx" },
+      { value: "nova", name: "Nova" },
+      { value: "shimmer", name: "Shimmer" },
+    ],
+  },
+];
+
+export default languages;
@@ -0,0 +1,9 @@
+import type { Model } from "../types";
+
+export const models: Model[] = [ // ElevenLabs model options; presumably the elevenlabs-models module loaded by useTtsModels — TODO confirm
+  { name: "Multilingual v2", value: "eleven_multilingual_v2" }, // first entry: SpeechServiceForm preselects index 0 of the vendor's list on vendor change
+  { name: "Multilingual v1", value: "eleven_multilingual_v1" },
+  { name: "English v1", value: "eleven_monolingual_v1" },
+];
+
+export default models;
@@ -0,0 +1,8 @@
+import type { Model } from "../types";
+
+export const models: Model[] = [ // Two-entry model list; presumably the whisper-models module loaded by useTtsModels — TODO confirm
+  { name: "TTS-1", value: "tts-1" }, // first entry: SpeechServiceForm preselects index 0 of the vendor's list on vendor change
+  { name: "TTS-1-HD", value: "tts-1-hd" },
+];
+
+export default models;
@@ -11,7 +11,8 @@ export type Vendor =
  | "Cobalt"
  | "Custom"
  | "ElevenLabs"
-  | "assemblyai";
+  | "assemblyai"
+  | "whisper";

 export interface VendorOptions {
  name: Vendor;
@@ -28,6 +29,11 @@ export interface Region {
  value: string;
 }

+export interface Model { // One selectable TTS model option, used as an entry in the model Selector's options
+  name: string; // human-readable label, e.g. "TTS-1"
+  value: string; // model identifier; SpeechServiceForm stores it in ttsModelId and submits it as model_id
+}
+
 export interface Voice {
  name: string;
  value: string;
@@ -64,6 +70,11 @@ export interface RegionVendors {
  ibm: Region[];
 }

+export interface TtsModels { // Model lists keyed by vendor value ("elevenlabs" | "whisper"); the shape returned by useTtsModels once loaded
+  elevenlabs: Model[];
+  whisper: Model[];
+}
+
 export interface RecognizerVendors {
  aws: Language[];
  google: Language[];
@@ -86,6 +97,7 @@ export interface SynthesisVendors {
  ibm: VoiceLanguage[];
  nvidia: VoiceLanguage[];
  elevenlabs: VoiceLanguage[];
+  whisper: VoiceLanguage[];
 }

 export interface MSRawSpeech {