From 24d646f705f55b8a033c9dabf1531ea512ba636b Mon Sep 17 00:00:00 2001 From: Hoan Luu Huu <110280845+xquanluu@users.noreply.github.com> Date: Fri, 27 Jun 2025 18:13:51 +0700 Subject: [PATCH] support inworld tts (#537) * support inworld tts * wip --- src/api/constants.ts | 9 ++++++++ src/api/types.ts | 10 +++++++++ .../views/applications/speech-selection.tsx | 10 +++++++++ .../internal/views/speech-services/form.tsx | 22 +++++++++++++++---- src/vendor/index.tsx | 5 +++++ src/vendor/types.ts | 2 ++ 6 files changed, 54 insertions(+), 4 deletions(-) diff --git a/src/api/constants.ts b/src/api/constants.ts index 73321c9..9b3b9e4 100644 --- a/src/api/constants.ts +++ b/src/api/constants.ts @@ -3,6 +3,7 @@ import type { Currency, ElevenLabsOptions, GoogleCustomVoice, + InworldOptions, LimitField, LimitUnitOption, PasswordSettings, @@ -277,6 +278,14 @@ export const DEFAULT_RIMELABS_OPTIONS: Partial = { reduceLatency: true, }; +export const DEFAULT_INWORLD_OPTIONS: Partial = { + audioConfig: { + pitch: 0.0, + speakingRate: 1.0, + }, + temperature: 0.8, +}; + // PlayHT options export const DEFAULT_PLAYHT_OPTIONS: Partial = { quality: "medium", diff --git a/src/api/types.ts b/src/api/types.ts index 93547f4..5da0783 100644 --- a/src/api/types.ts +++ b/src/api/types.ts @@ -781,6 +781,16 @@ export interface RimelabsOptions { reduceLatency: boolean; } +export interface InworldOptions { + audioConfig: { + bitRate?: number; + sampleRateHertz?: number; + pitch?: number; + speakingRate?: number; + }; + temperature?: number; +} + export type CartesiaEmotions = | "anger:lowest" | "anger:low" diff --git a/src/containers/internal/views/applications/speech-selection.tsx b/src/containers/internal/views/applications/speech-selection.tsx index e7685c8..fe87a75 100644 --- a/src/containers/internal/views/applications/speech-selection.tsx +++ b/src/containers/internal/views/applications/speech-selection.tsx @@ -35,6 +35,7 @@ import { VENDOR_VOXIST, VENDOR_RIMELABS, VENDOR_OPENAI, + VENDOR_INWORLD, } from "src/vendor"; import { LabelOptions, @@ -312,6 +313,15 @@ export const SpeechProviderSelection = ({ updateTtsVoice(newLang!.value, newLang!.voices[0].value); return; } + if (synthVendor === VENDOR_INWORLD) { + let newLang = json.tts.find((lang) => lang.value === "en"); + // If the new language doesn't map then default to the first one + if (!newLang) { + newLang = json.tts[0]; + } + updateTtsVoice(newLang!.value, newLang!.voices[0].value); + return; + } /** Google and AWS have different language lists */ /** If the new language doesn't map then default to "en-US" */ let newLang = json.tts.find((lang) => lang.value === synthLang); diff --git a/src/containers/internal/views/speech-services/form.tsx b/src/containers/internal/views/speech-services/form.tsx index 4a78519..20c39fc 100644 --- a/src/containers/internal/views/speech-services/form.tsx +++ b/src/containers/internal/views/speech-services/form.tsx @@ -52,6 +52,7 @@ import { VENDOR_CARTESIA, VENDOR_VOXIST, VENDOR_OPENAI, + VENDOR_INWORLD, } from "src/vendor"; import { MSG_REQUIRED_FIELDS } from "src/constants"; import { @@ -83,6 +84,7 @@ import { DEFAULT_CARTESIA_OPTIONS, DEFAULT_ELEVENLABS_OPTIONS, DEFAULT_GOOGLE_CUSTOM_VOICE, + DEFAULT_INWORLD_OPTIONS, DEFAULT_PLAYHT_OPTIONS, DEFAULT_RIMELABS_OPTIONS, DEFAULT_VERBIO_MODEL, @@ -233,6 +235,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => { return DEFAULT_PLAYHT_OPTIONS; case VENDOR_RIMELABS: return DEFAULT_RIMELABS_OPTIONS; + case VENDOR_INWORLD: + return DEFAULT_INWORLD_OPTIONS; case VENDOR_CARTESIA: return DEFAULT_CARTESIA_OPTIONS; } @@ -249,6 +253,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => { return "https://docs.play.ht/reference/api-generate-tts-audio-stream"; case VENDOR_RIMELABS: return "https://rimelabs.mintlify.app/api-reference/endpoint/streaming-mp3#variable-parameters"; + case VENDOR_INWORLD: + return "https://docs.inworld.ai/api-reference/ttsAPI/texttospeech/synthesize-speech-stream"; case VENDOR_CARTESIA: return "https://docs.cartesia.ai/api-reference/tts/bytes"; } @@ -432,11 +438,13 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => { }), ...((vendor === VENDOR_ELEVENLABS || vendor === VENDOR_WHISPER || + vendor === VENDOR_INWORLD || vendor === VENDOR_RIMELABS) && { model_id: ttsModelId || null, }), ...((vendor === VENDOR_ELEVENLABS || vendor === VENDOR_PLAYHT || + vendor === VENDOR_INWORLD || vendor === VENDOR_RIMELABS) && { options: options || null, }), @@ -509,6 +517,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => { vendor === VENDOR_ELEVENLABS || vendor === VENDOR_PLAYHT || vendor === VENDOR_RIMELABS || + vendor === VENDOR_INWORLD || vendor === VENDOR_WHISPER || vendor === VENDOR_CARTESIA || vendor === VENDOR_OPENAI @@ -578,6 +587,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => { vendor === VENDOR_WHISPER || vendor === VENDOR_PLAYHT || vendor === VENDOR_RIMELABS || + vendor === VENDOR_INWORLD || vendor === VENDOR_CARTESIA || vendor === VENDOR_OPENAI || vendor === VENDOR_DEEPGRAM @@ -976,6 +986,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => { vendor !== VENDOR_WHISPER && vendor !== VENDOR_PLAYHT && vendor !== VENDOR_RIMELABS && + vendor !== VENDOR_INWORLD && vendor !== VENDOR_ELEVENLABS && (