support inworld tts (#537)

* support inworld tts

* wip
This commit is contained in:
Hoan Luu Huu
2025-06-27 18:13:51 +07:00
committed by GitHub
parent c648afcb1a
commit 24d646f705
6 changed files with 54 additions and 4 deletions

View File

@@ -3,6 +3,7 @@ import type {
Currency,
ElevenLabsOptions,
GoogleCustomVoice,
InworldOptions,
LimitField,
LimitUnitOption,
PasswordSettings,
@@ -277,6 +278,14 @@ export const DEFAULT_RIMELABS_OPTIONS: Partial<RimelabsOptions> = {
reduceLatency: true,
};
// Inworld options
// Default option values for the Inworld vendor. Typed as Partial<InworldOptions>,
// so only the fields listed here are pre-populated.
export const DEFAULT_INWORLD_OPTIONS: Partial<InworldOptions> = {
audioConfig: {
// NOTE(review): 0.0 / 1.0 presumably mean "no pitch shift" and "normal speed";
// confirm units and ranges against the Inworld TTS API reference.
pitch: 0.0,
speakingRate: 1.0,
},
// NOTE(review): valid range for temperature is not visible here; confirm with vendor docs.
temperature: 0.8,
};
// PlayHT options
export const DEFAULT_PLAYHT_OPTIONS: Partial<PlayHTOptions> = {
quality: "medium",

View File

@@ -781,6 +781,16 @@ export interface RimelabsOptions {
reduceLatency: boolean;
}
/**
 * Shape of the vendor-specific options object for Inworld.
 *
 * `audioConfig` itself is required, but every setting inside it is optional,
 * as is `temperature`.
 *
 * NOTE(review): units and valid ranges of the numeric fields are not visible
 * in this file; confirm them against the Inworld TTS API reference.
 */
export interface InworldOptions {
  /** Optional numeric temperature setting. */
  temperature?: number;
  /** Audio settings group; each member may be omitted. */
  audioConfig: {
    speakingRate?: number;
    pitch?: number;
    sampleRateHertz?: number;
    bitRate?: number;
  };
}
export type CartesiaEmotions =
| "anger:lowest"
| "anger:low"

View File

@@ -35,6 +35,7 @@ import {
VENDOR_VOXIST,
VENDOR_RIMELABS,
VENDOR_OPENAI,
VENDOR_INWORLD,
} from "src/vendor";
import {
LabelOptions,
@@ -312,6 +313,15 @@ export const SpeechProviderSelection = ({
updateTtsVoice(newLang!.value, newLang!.voices[0].value);
return;
}
// Inworld: pick the "en" entry of json.tts instead of matching the current synthLang.
if (synthVendor === VENDOR_INWORLD) {
let newLang = json.tts.find((lang) => lang.value === "en");
// If there is no "en" entry, fall back to the first language in json.tts
if (!newLang) {
newLang = json.tts[0];
}
// NOTE(review): the non-null assertions assume json.tts is non-empty and that the
// chosen language has at least one voice — confirm against the data source.
updateTtsVoice(newLang!.value, newLang!.voices[0].value);
return;
}
/** Google and AWS have different language lists */
/** If the new language doesn't map then default to "en-US" */
let newLang = json.tts.find((lang) => lang.value === synthLang);

View File

@@ -52,6 +52,7 @@ import {
VENDOR_CARTESIA,
VENDOR_VOXIST,
VENDOR_OPENAI,
VENDOR_INWORLD,
} from "src/vendor";
import { MSG_REQUIRED_FIELDS } from "src/constants";
import {
@@ -83,6 +84,7 @@ import {
DEFAULT_CARTESIA_OPTIONS,
DEFAULT_ELEVENLABS_OPTIONS,
DEFAULT_GOOGLE_CUSTOM_VOICE,
DEFAULT_INWORLD_OPTIONS,
DEFAULT_PLAYHT_OPTIONS,
DEFAULT_RIMELABS_OPTIONS,
DEFAULT_VERBIO_MODEL,
@@ -233,6 +235,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
return DEFAULT_PLAYHT_OPTIONS;
case VENDOR_RIMELABS:
return DEFAULT_RIMELABS_OPTIONS;
case VENDOR_INWORLD:
return DEFAULT_INWORLD_OPTIONS;
case VENDOR_CARTESIA:
return DEFAULT_CARTESIA_OPTIONS;
}
@@ -249,6 +253,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
return "https://docs.play.ht/reference/api-generate-tts-audio-stream";
case VENDOR_RIMELABS:
return "https://rimelabs.mintlify.app/api-reference/endpoint/streaming-mp3#variable-parameters";
case VENDOR_INWORLD:
return "https://docs.inworld.ai/api-reference/ttsAPI/texttospeech/synthesize-speech-stream";
case VENDOR_CARTESIA:
return "https://docs.cartesia.ai/api-reference/tts/bytes";
}
@@ -432,11 +438,13 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
}),
...((vendor === VENDOR_ELEVENLABS ||
vendor === VENDOR_WHISPER ||
vendor === VENDOR_INWORLD ||
vendor === VENDOR_RIMELABS) && {
model_id: ttsModelId || null,
}),
...((vendor === VENDOR_ELEVENLABS ||
vendor === VENDOR_PLAYHT ||
vendor === VENDOR_INWORLD ||
vendor === VENDOR_RIMELABS) && {
options: options || null,
}),
@@ -509,6 +517,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
vendor === VENDOR_ELEVENLABS ||
vendor === VENDOR_PLAYHT ||
vendor === VENDOR_RIMELABS ||
vendor === VENDOR_INWORLD ||
vendor === VENDOR_WHISPER ||
vendor === VENDOR_CARTESIA ||
vendor === VENDOR_OPENAI
@@ -578,6 +587,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
vendor === VENDOR_WHISPER ||
vendor === VENDOR_PLAYHT ||
vendor === VENDOR_RIMELABS ||
vendor === VENDOR_INWORLD ||
vendor === VENDOR_CARTESIA ||
vendor === VENDOR_OPENAI ||
vendor === VENDOR_DEEPGRAM
@@ -976,6 +986,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
vendor !== VENDOR_WHISPER &&
vendor !== VENDOR_PLAYHT &&
vendor !== VENDOR_RIMELABS &&
vendor !== VENDOR_INWORLD &&
vendor !== VENDOR_ELEVENLABS && (
<label htmlFor="use_for_stt" className="chk">
<input
@@ -1705,6 +1716,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
vendor == VENDOR_ELEVENLABS ||
vendor === VENDOR_WHISPER ||
vendor === VENDOR_RIMELABS ||
vendor === VENDOR_INWORLD ||
vendor === VENDOR_SONIOX ||
vendor === VENDOR_CARTESIA ||
vendor === VENDOR_OPENAI ||
@@ -1724,10 +1736,11 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
/>
</fieldset>
)}
{(vendor == VENDOR_ELEVENLABS ||
vendor == VENDOR_WHISPER ||
{(vendor === VENDOR_ELEVENLABS ||
vendor === VENDOR_WHISPER ||
vendor === VENDOR_PLAYHT ||
vendor == VENDOR_RIMELABS ||
vendor === VENDOR_RIMELABS ||
vendor === VENDOR_INWORLD ||
(ttsCheck && vendor === VENDOR_CARTESIA)) &&
ttsModels.length > 0 && (
<fieldset>
@@ -1767,7 +1780,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
{(vendor === VENDOR_ELEVENLABS ||
vendor === VENDOR_PLAYHT ||
vendor === VENDOR_CARTESIA ||
vendor === VENDOR_RIMELABS) && (
vendor === VENDOR_RIMELABS ||
vendor === VENDOR_INWORLD) && (
<fieldset>
<Checkzone
hidden

View File

@@ -24,6 +24,7 @@ export const VENDOR_VOXIST = "voxist";
export const VENDOR_WHISPER = "whisper";
export const VENDOR_PLAYHT = "playht";
export const VENDOR_RIMELABS = "rimelabs";
export const VENDOR_INWORLD = "inworld";
export const VENDOR_VERBIO = "verbio";
export const VENDOR_CARTESIA = "cartesia";
export const VENDOR_OPENAI = "openai";
@@ -101,6 +102,10 @@ export const vendors: VendorOptions[] = [
name: "RimeLabs",
value: VENDOR_RIMELABS,
},
{
name: "Inworld",
value: VENDOR_INWORLD,
},
{
name: "Verbio",
value: VENDOR_VERBIO,

2
src/vendor/types.ts vendored
View File

@@ -17,6 +17,7 @@ export type Vendor =
| "whisper"
| "playht"
| "rimelabs"
| "inworld"
| "verbio"
| "openai"
| "Cartesia";
@@ -112,6 +113,7 @@ export interface SynthesisVendors {
playht: VoiceLanguage[];
cartesia: VoiceLanguage[];
rimelabs: VoiceLanguage[];
inworld: VoiceLanguage[];
}
export interface MSRawSpeech {