support inworld tts (#537)

* support inworld tts
* wip
2026-01-25 02:08:19 +00:00 · 2025-06-27 18:13:51 +07:00
parent c648afcb1a
commit 24d646f705
6 changed files with 54 additions and 4 deletions
--- a/src/api/constants.ts
+++ b/src/api/constants.ts
@@ -3,6 +3,7 @@ import type {
  Currency,
  ElevenLabsOptions,
  GoogleCustomVoice,
+  InworldOptions,
  LimitField,
  LimitUnitOption,
  PasswordSettings,
@@ -277,6 +278,14 @@ export const DEFAULT_RIMELABS_OPTIONS: Partial<RimelabsOptions> = {
  reduceLatency: true,
 };

+export const DEFAULT_INWORLD_OPTIONS: Partial<InworldOptions> = { // Default option values for the Inworld TTS vendor
+  audioConfig: { // bitRate and sampleRateHertz are intentionally left unset here
+    pitch: 0.0, // NOTE(review): presumably "no pitch shift" — confirm against the Inworld API docs
+    speakingRate: 1.0, // NOTE(review): presumably normal speed — confirm against the Inworld API docs
+  },
+  temperature: 0.8,
+};
+
 // PlayHT options
 export const DEFAULT_PLAYHT_OPTIONS: Partial<PlayHTOptions> = {
  quality: "medium",
--- a/src/api/types.ts
+++ b/src/api/types.ts
@@ -781,6 +781,16 @@ export interface RimelabsOptions {
  reduceLatency: boolean;
 }

+export interface InworldOptions { // Vendor-specific options for Inworld TTS
+  audioConfig: { // required group; every field inside it is optional
+    bitRate?: number;
+    sampleRateHertz?: number; // sample rate in Hz, per the field name
+    pitch?: number;
+    speakingRate?: number;
+  };
+  temperature?: number; // NOTE(review): valid range is not defined here — confirm against the Inworld API docs
+}
+
 export type CartesiaEmotions =
  | "anger:lowest"
  | "anger:low"
--- a/src/containers/internal/views/applications/speech-selection.tsx
+++ b/src/containers/internal/views/applications/speech-selection.tsx
@@ -35,6 +35,7 @@ import {
  VENDOR_VOXIST,
  VENDOR_RIMELABS,
  VENDOR_OPENAI,
+  VENDOR_INWORLD,
 } from "src/vendor";
 import {
  LabelOptions,
@@ -312,6 +313,15 @@ export const SpeechProviderSelection = ({
            updateTtsVoice(newLang!.value, newLang!.voices[0].value);
            return;
          }
+          if (synthVendor === VENDOR_INWORLD) { // Inworld: look up the fixed "en" entry instead of matching the current synthLang
+            let newLang = json.tts.find((lang) => lang.value === "en");
+            // If there is no "en" entry then default to the first language in the list
+            if (!newLang) {
+              newLang = json.tts[0];
+            }
+            updateTtsVoice(newLang!.value, newLang!.voices[0].value); // NOTE(review): throws if json.tts is empty or the language has no voices
+            return;
+          }
          /** Google and AWS have different language lists */
          /** If the new language doesn't map then default to "en-US" */
          let newLang = json.tts.find((lang) => lang.value === synthLang);
--- a/src/containers/internal/views/speech-services/form.tsx
+++ b/src/containers/internal/views/speech-services/form.tsx
@@ -52,6 +52,7 @@ import {
  VENDOR_CARTESIA,
  VENDOR_VOXIST,
  VENDOR_OPENAI,
+  VENDOR_INWORLD,
 } from "src/vendor";
 import { MSG_REQUIRED_FIELDS } from "src/constants";
 import {
@@ -83,6 +84,7 @@ import {
  DEFAULT_CARTESIA_OPTIONS,
  DEFAULT_ELEVENLABS_OPTIONS,
  DEFAULT_GOOGLE_CUSTOM_VOICE,
+  DEFAULT_INWORLD_OPTIONS,
  DEFAULT_PLAYHT_OPTIONS,
  DEFAULT_RIMELABS_OPTIONS,
  DEFAULT_VERBIO_MODEL,
@@ -233,6 +235,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
          return DEFAULT_PLAYHT_OPTIONS;
        case VENDOR_RIMELABS:
          return DEFAULT_RIMELABS_OPTIONS;
+        case VENDOR_INWORLD:
+          return DEFAULT_INWORLD_OPTIONS;
        case VENDOR_CARTESIA:
          return DEFAULT_CARTESIA_OPTIONS;
      }
@@ -249,6 +253,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
          return "https://docs.play.ht/reference/api-generate-tts-audio-stream";
        case VENDOR_RIMELABS:
          return "https://rimelabs.mintlify.app/api-reference/endpoint/streaming-mp3#variable-parameters";
+        case VENDOR_INWORLD:
+          return "https://docs.inworld.ai/api-reference/ttsAPI/texttospeech/synthesize-speech-stream";
        case VENDOR_CARTESIA:
          return "https://docs.cartesia.ai/api-reference/tts/bytes";
      }
@@ -432,11 +438,13 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
        }),
        ...((vendor === VENDOR_ELEVENLABS ||
          vendor === VENDOR_WHISPER ||
+          vendor === VENDOR_INWORLD ||
          vendor === VENDOR_RIMELABS) && {
          model_id: ttsModelId || null,
        }),
        ...((vendor === VENDOR_ELEVENLABS ||
          vendor === VENDOR_PLAYHT ||
+          vendor === VENDOR_INWORLD ||
          vendor === VENDOR_RIMELABS) && {
          options: options || null,
        }),
@@ -509,6 +517,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
              vendor === VENDOR_ELEVENLABS ||
              vendor === VENDOR_PLAYHT ||
              vendor === VENDOR_RIMELABS ||
+              vendor === VENDOR_INWORLD ||
              vendor === VENDOR_WHISPER ||
              vendor === VENDOR_CARTESIA ||
              vendor === VENDOR_OPENAI
@@ -578,6 +587,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
      vendor === VENDOR_WHISPER ||
      vendor === VENDOR_PLAYHT ||
      vendor === VENDOR_RIMELABS ||
+      vendor === VENDOR_INWORLD ||
      vendor === VENDOR_CARTESIA ||
      vendor === VENDOR_OPENAI ||
      vendor === VENDOR_DEEPGRAM
@@ -976,6 +986,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
              vendor !== VENDOR_WHISPER &&
              vendor !== VENDOR_PLAYHT &&
              vendor !== VENDOR_RIMELABS &&
+              vendor !== VENDOR_INWORLD &&
              vendor !== VENDOR_ELEVENLABS && (
                <label htmlFor="use_for_stt" className="chk">
                  <input
@@ -1705,6 +1716,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
          vendor == VENDOR_ELEVENLABS ||
          vendor === VENDOR_WHISPER ||
          vendor === VENDOR_RIMELABS ||
+          vendor === VENDOR_INWORLD ||
          vendor === VENDOR_SONIOX ||
          vendor === VENDOR_CARTESIA ||
          vendor === VENDOR_OPENAI ||
@@ -1724,10 +1736,11 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
            />
          </fieldset>
        )}
-        {(vendor == VENDOR_ELEVENLABS ||
-          vendor == VENDOR_WHISPER ||
+        {(vendor === VENDOR_ELEVENLABS ||
+          vendor === VENDOR_WHISPER ||
          vendor === VENDOR_PLAYHT ||
-          vendor == VENDOR_RIMELABS ||
+          vendor === VENDOR_RIMELABS ||
+          vendor === VENDOR_INWORLD ||
          (ttsCheck && vendor === VENDOR_CARTESIA)) &&
          ttsModels.length > 0 && (
            <fieldset>
@@ -1767,7 +1780,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
        {(vendor === VENDOR_ELEVENLABS ||
          vendor === VENDOR_PLAYHT ||
          vendor === VENDOR_CARTESIA ||
-          vendor === VENDOR_RIMELABS) && (
+          vendor === VENDOR_RIMELABS ||
+          vendor === VENDOR_INWORLD) && (
          <fieldset>
            <Checkzone
              hidden
--- a/src/vendor/index.tsx
+++ b/src/vendor/index.tsx
@@ -24,6 +24,7 @@ export const VENDOR_VOXIST = "voxist";
 export const VENDOR_WHISPER = "whisper";
 export const VENDOR_PLAYHT = "playht";
 export const VENDOR_RIMELABS = "rimelabs";
+export const VENDOR_INWORLD = "inworld";
 export const VENDOR_VERBIO = "verbio";
 export const VENDOR_CARTESIA = "cartesia";
 export const VENDOR_OPENAI = "openai";
@@ -101,6 +102,10 @@ export const vendors: VendorOptions[] = [
    name: "RimeLabs",
    value: VENDOR_RIMELABS,
  },
+  {
+    name: "Inworld",
+    value: VENDOR_INWORLD,
+  },
  {
    name: "Verbio",
    value: VENDOR_VERBIO,
--- a/src/vendor/types.ts
+++ b/src/vendor/types.ts
@@ -17,6 +17,7 @@ export type Vendor =
  | "whisper"
  | "playht"
  | "rimelabs"
+  | "inworld"
  | "verbio"
  | "openai"
  | "Cartesia";
@@ -112,6 +113,7 @@ export interface SynthesisVendors {
  playht: VoiceLanguage[];
  cartesia: VoiceLanguage[];
  rimelabs: VoiceLanguage[];
+  inworld: VoiceLanguage[];
 }

 export interface MSRawSpeech {