// Mirror of https://github.com/jambonz/speech-utils.git
// Synced 2025-12-19 03:37:49 +00:00
// 78 lines, 2.8 KiB, Protocol Buffer
// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT

syntax = "proto3";

package nvidia.riva.tts;

// NOTE(review): presumably the source of the AudioEncoding type referenced by
// SynthesizeSpeechRequest; that type is not declared in this file.
import "riva/proto/riva_audio.proto";

option cc_enable_arenas = true;
option go_package = "nvidia.com/riva_speech";
// Text-to-speech service. Exposes one unary synthesis RPC, one
// server-streaming synthesis RPC, and one RPC that reports the synthesis
// configuration.
service RivaSpeechSynthesis {
  // Used to request text-to-speech from the service. Submit a request
  // containing the desired text and configuration, and receive audio bytes in
  // the requested format.
  rpc Synthesize(SynthesizeSpeechRequest) returns (SynthesizeSpeechResponse) {}

  // Used to request text-to-speech returned via stream as it becomes
  // available. Submit a SynthesizeSpeechRequest with desired text and
  // configuration, and receive stream of bytes in the requested format.
  rpc SynthesizeOnline(SynthesizeSpeechRequest)
      returns (stream SynthesizeSpeechResponse) {}

  // Enables clients to request the configuration of the current Synthesize
  // service, or a specific model within the service.
  rpc GetRivaSynthesisConfig(RivaSynthesisConfigRequest)
      returns (RivaSynthesisConfigResponse) {}
}
// Request message for the GetRivaSynthesisConfig RPC.
message RivaSynthesisConfigRequest {
  // If model is specified only return config for model, otherwise return all
  // configs.
  string model_name = 1;
}
// Response message for the GetRivaSynthesisConfig RPC. Carries a list of
// per-model configuration entries.
message RivaSynthesisConfigResponse {
  // Configuration entry for a single model.
  message Config {
    // Name of the model this entry belongs to.
    string model_name = 1;

    // String-to-string parameters for the model. The set of keys is not
    // defined in this file.
    map<string, string> parameters = 2;
  }

  // Configuration entries being returned.
  repeated Config model_config = 1;
}
// Request message shared by the Synthesize and SynthesizeOnline RPCs. Holds
// the text to synthesize together with the output configuration.
message SynthesizeSpeechRequest {
  // Text to be synthesized.
  string text = 1;

  // Language code for the request.
  // NOTE(review): the accepted code format is not stated in this file - confirm.
  string language_code = 2;

  // audio encoding params
  AudioEncoding encoding = 3;

  // The sample rate in hertz (Hz) of the audio output requested through
  // `SynthesizeSpeechRequest` messages.
  // Models produce an output at a fixed rate. The sample rate enables you to
  // resample the generated audio output if required.
  // You use the sample rate to up-sample or down-sample the audio for various
  // scenarios. For example, the sample rate can be set to 8kHz (kilohertz) if
  // the output audio is desired for a low bandwidth application.
  // The sample rate values below 8kHz will not produce any meaningful output.
  // Also, up-sampling too much will increase the size of the output without
  // improving the output audio quality.
  int32 sample_rate_hz = 4;

  // voice params
  string voice_name = 5;
}
// Currently experimental API addition that returns the input text
// after preprocessing has been completed as well as the predicted
// duration for each token.
// Note: this message is subject to future breaking changes, and potential
// removal.
message SynthesizeSpeechResponseMetadata {
  // Input text.
  // NOTE(review): presumably the text exactly as submitted - confirm.
  string text = 1;

  // Input text after preprocessing has been completed.
  string processed_text = 2;

  // Predicted duration for each token.
  // NOTE(review): the unit of each value is not stated in this file - confirm.
  // The field number is 8 (3-7 are not used in this file); it is part of the
  // wire contract and must not be renumbered.
  repeated float predicted_durations = 8;
}
// Response message returned by Synthesize, and the element type of the stream
// returned by SynthesizeOnline.
message SynthesizeSpeechResponse {
  // Synthesized audio bytes.
  bytes audio = 1;

  // Metadata accompanying the audio; see SynthesizeSpeechResponseMetadata for
  // the fields it carries.
  SynthesizeSpeechResponseMetadata meta = 2;
}
/*
 *
 */