support openAi stt (#496)

* support openAi stt

* wip

* wip

* add back model selection to openai
This commit is contained in:
Hoan Luu Huu
2025-03-28 21:15:27 +07:00
committed by GitHub
parent c4be87353c
commit f9e4c241f3
6 changed files with 67 additions and 32 deletions

2
.env
View File

@@ -1,4 +1,4 @@
#VITE_API_BASE_URL=http://127.0.0.1:3000/v1
# VITE_API_BASE_URL=http://127.0.0.1:3000/v1
#VITE_DEV_BASE_URL=http://127.0.0.1:3000/v1
## enables choosing units and licensed account call limits

View File

@@ -728,6 +728,7 @@ export interface SpeechSupportedLanguagesAndVoices {
tts: VoiceLanguage[];
stt: Language[];
models: Model[];
sttModels: Model[];
}
export interface ElevenLabsOptions {

View File

@@ -33,6 +33,7 @@ import {
VENDOR_CARTESIA,
VENDOR_VOXIST,
VENDOR_RIMELABS,
VENDOR_OPENAI,
} from "src/vendor";
import {
LabelOptions,
@@ -397,6 +398,7 @@ export const SpeechProviderSelection = ({
vendor.value !== VENDOR_SONIOX &&
vendor.value !== VENDOR_SPEECHMATICS &&
vendor.value !== VENDOR_CUSTOM &&
vendor.value !== VENDOR_OPENAI &&
vendor.value !== VENDOR_COBALT,
)}
onChange={(e) => {

View File

@@ -51,6 +51,7 @@ import {
VENDOR_SPEECHMATICS,
VENDOR_CARTESIA,
VENDOR_VOXIST,
VENDOR_OPENAI,
} from "src/vendor";
import { MSG_REQUIRED_FIELDS } from "src/constants";
import {
@@ -124,6 +125,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
const [ttsRegion, setTtsRegion] = useState("");
const [ttsApiKey, setTtsApiKey] = useState("");
const [ttsModelId, setTtsModelId] = useState("");
const [sttModelId, setSttModelId] = useState("");
const [engineVersion, setEngineVersion] = useState(DEFAULT_VERBIO_MODEL);
const [instanceId, setInstanceId] = useState("");
const [initialCheckCustomTts, setInitialCheckCustomTts] = useState(false);
@@ -167,6 +169,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
const [customVoices, setCustomVoices] = useState<GoogleCustomVoice[]>([]);
const [customVoicesMessage, setCustomVoicesMessage] = useState("");
const [ttsModels, setTtsModels] = useState<Model[]>([]);
const [sttModels, setSttModels] = useState<Model[]>([]);
const [optionsInitialChecked, setOptionsInitialChecked] = useState(false);
const [options, setOptions] = useState("");
const [tmpOptions, setTmpOptions] = useState("");
@@ -248,6 +251,17 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
return "";
};
/**
 * Resolves the caption displayed above a vendor's model selector.
 *
 * @param vendor - Lower-cased vendor identifier of the credential being edited.
 * @returns "Voice Engine" for PlayHT, "Model ID" for Cartesia, and the
 *          generic "Model" for every other vendor.
 */
const getModelLabelByVendor = (vendor: Lowercase<Vendor>) => {
  if (vendor === VENDOR_PLAYHT) {
    return "Voice Engine";
  }
  if (vendor === VENDOR_CARTESIA) {
    return "Model ID";
  }
  // No vendor-specific wording: fall back to the generic caption.
  return "Model";
};
const handlePutGoogleCustomVoices = () => {
if (!credential || !credential.data) {
return;
@@ -411,6 +425,10 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
ttsModelId && {
voice_engine: ttsModelId,
}),
...(vendor === VENDOR_OPENAI &&
sttModelId && {
model_id: sttModelId,
}),
...(vendor === VENDOR_DEEPGRAM && {
deepgram_stt_uri: deepgramSttUri || null,
deepgram_tts_uri: deepgramTtsUri || null,
@@ -469,7 +487,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
vendor === VENDOR_PLAYHT ||
vendor === VENDOR_RIMELABS ||
vendor === VENDOR_WHISPER ||
vendor === VENDOR_CARTESIA
vendor === VENDOR_CARTESIA ||
vendor === VENDOR_OPENAI
? apiKey
: null,
}),
@@ -536,7 +555,8 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
vendor === VENDOR_WHISPER ||
vendor === VENDOR_PLAYHT ||
vendor === VENDOR_RIMELABS ||
vendor === VENDOR_CARTESIA
vendor === VENDOR_CARTESIA ||
vendor === VENDOR_OPENAI
) {
getSpeechSupportedLanguagesAndVoices(
currentServiceProvider?.service_provider_sid,
@@ -553,6 +573,15 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
setTtsModelId(json.models[0].value);
}
}
if (json.sttModels) {
setSttModels(json.sttModels);
if (
json.sttModels.length > 0 &&
!json.sttModels.some((m) => m.value === sttModelId)
) {
setSttModelId(json.sttModels[0].value);
}
}
});
} else {
setTtsModels([]);
@@ -707,6 +736,9 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
if (credential.data.model_id) {
setTtsModelId(credential.data.model_id);
}
if (credential.data.model_id && vendor === VENDOR_OPENAI) {
setSttModelId(credential.data.model_id);
}
}
if (credential?.data?.options) {
setOptions(credential.data.options);
@@ -887,6 +919,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
vendor !== VENDOR_COBALT &&
vendor !== VENDOR_SONIOX &&
vendor !== VENDOR_SPEECHMATICS &&
vendor !== VENDOR_OPENAI &&
vendor != VENDOR_CUSTOM && (
<label htmlFor="use_for_tts" className="chk">
<input
@@ -1522,6 +1555,7 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
vendor === VENDOR_RIMELABS ||
vendor === VENDOR_SONIOX ||
vendor === VENDOR_CARTESIA ||
vendor === VENDOR_OPENAI ||
vendor === VENDOR_SPEECHMATICS) && (
<fieldset>
{vendor === VENDOR_PLAYHT && (
@@ -1557,40 +1591,16 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
/>
</fieldset>
)}
{vendor === VENDOR_PLAYHT && ttsModels.length > 0 && (
<fieldset>
<label htmlFor={`${vendor}_tts_model_id`}>Voice engine</label>
<Selector
id={"tts_model_id"}
name={"tts_model_id"}
value={ttsModelId}
options={ttsModels}
onChange={(e) => {
setTtsModelId(e.target.value);
}}
/>
</fieldset>
)}
{vendor === VENDOR_CARTESIA && ttsModels.length > 0 && (
<fieldset>
<label htmlFor={`${vendor}_tts_model_id`}>Model Id</label>
<Selector
id={"tts_model_id"}
name={"tts_model_id"}
value={ttsModelId}
options={ttsModels}
onChange={(e) => {
setTtsModelId(e.target.value);
}}
/>
</fieldset>
)}
{(vendor == VENDOR_ELEVENLABS ||
vendor == VENDOR_WHISPER ||
vendor === VENDOR_CARTESIA ||
vendor === VENDOR_PLAYHT ||
vendor == VENDOR_RIMELABS) &&
ttsModels.length > 0 && (
<fieldset>
<label htmlFor={`${vendor}_tts_model_id`}>Model</label>
<label htmlFor={`${vendor}_tts_model_id`}>
{getModelLabelByVendor(vendor)}
</label>
<Selector
id={"tts_model_id"}
name={"tts_model_id"}
@@ -1602,6 +1612,22 @@ export const SpeechServiceForm = ({ credential }: SpeechServiceFormProps) => {
/>
</fieldset>
)}
{/* Speech-to-text model picker for OpenAI; rendered only when the model list is non-empty. */}
{vendor === VENDOR_OPENAI && sttModels.length > 0 && (
  <fieldset>
    {/*
      htmlFor has to equal the control's id for the label to be bound to it.
      NOTE(review): assumes Selector forwards `id` to its underlying <select> — confirm.
    */}
    <label htmlFor="stt_model_id">{getModelLabelByVendor(vendor)}</label>
    <Selector
      id={"stt_model_id"}
      name={"stt_model_id"}
      value={sttModelId}
      options={sttModels}
      onChange={(e) => {
        setSttModelId(e.target.value);
      }}
    />
  </fieldset>
)}
{(vendor === VENDOR_ELEVENLABS ||
vendor === VENDOR_PLAYHT ||
vendor === VENDOR_CARTESIA ||

View File

@@ -26,6 +26,7 @@ export const VENDOR_PLAYHT = "playht";
export const VENDOR_RIMELABS = "rimelabs";
export const VENDOR_VERBIO = "verbio";
export const VENDOR_CARTESIA = "cartesia";
export const VENDOR_OPENAI = "openai";
export const vendors: VendorOptions[] = [
{
@@ -108,6 +109,10 @@ export const vendors: VendorOptions[] = [
name: "Cartesia",
value: VENDOR_CARTESIA,
},
{
name: "OpenAI",
value: VENDOR_OPENAI,
},
].sort((a, b) => a.name.localeCompare(b.name)) as VendorOptions[];
export const AWS_CREDENTIAL_ACCESS_KEY = "access_key";

1
src/vendor/types.ts vendored
View File

@@ -18,6 +18,7 @@ export type Vendor =
| "playht"
| "rimelabs"
| "verbio"
| "openai"
| "Cartesia";
export interface VendorOptions {