Files
speech-utils/protos/riva/proto/riva_tts.proto
2023-02-21 08:49:15 +07:00

78 lines
2.8 KiB
Protocol Buffer

// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT
// This file uses proto3 syntax.
syntax = "proto3";
// All definitions in this file live in the nvidia.riva.tts package.
package nvidia.riva.tts;
// Turn on arena allocation support in the generated C++ code.
option cc_enable_arenas = true;
// Go package used for the code generated from this file.
option go_package = "nvidia.com/riva_speech";
// NOTE(review): presumably the source of the AudioEncoding type used by
// SynthesizeSpeechRequest.encoding -- confirm against riva_audio.proto.
import "riva/proto/riva_audio.proto";
// Text-to-speech service. It offers two synthesis RPCs (one unary, one
// server-streaming) and one RPC for querying the synthesis configuration.
service RivaSpeechSynthesis {
  // Unary text-to-speech. The request carries the text to speak together with
  // the desired configuration; the response carries the audio bytes in the
  // requested format.
  rpc Synthesize(SynthesizeSpeechRequest) returns (SynthesizeSpeechResponse);

  // Streaming text-to-speech. Takes the same SynthesizeSpeechRequest (text
  // and configuration) as Synthesize, but the audio bytes come back as a
  // stream, delivered as they become available, in the requested format.
  rpc SynthesizeOnline(SynthesizeSpeechRequest)
      returns (stream SynthesizeSpeechResponse);

  // Lets a client ask for the configuration of the current synthesis service,
  // or of one specific model within that service.
  rpc GetRivaSynthesisConfig(RivaSynthesisConfigRequest)
      returns (RivaSynthesisConfigResponse);
}
// Request message for RivaSpeechSynthesis.GetRivaSynthesisConfig.
message RivaSynthesisConfigRequest {
  // Optional model filter. When a model is specified, only the config for
  // that model is returned; otherwise all configs are returned.
  string model_name = 1;
}
// Response message for RivaSpeechSynthesis.GetRivaSynthesisConfig.
message RivaSynthesisConfigResponse {
  // Configuration entry for a single model.
  message Config {
    // Name of the model this entry describes.
    string model_name = 1;

    // Model parameters as string key/value pairs. The set of keys is not
    // defined in this file.
    map<string, string> parameters = 2;
  }

  // The returned model configuration entries.
  repeated Config model_config = 1;
}
// Request message used by both RivaSpeechSynthesis.Synthesize and
// RivaSpeechSynthesis.SynthesizeOnline.
message SynthesizeSpeechRequest {
  // The text to synthesize.
  string text = 1;

  // Language code for the request.
  // NOTE(review): the expected format (e.g. "en-US") is not stated in this
  // file -- confirm against the service documentation.
  string language_code = 2;

  // Audio encoding parameters for the requested output.
  AudioEncoding encoding = 3;

  // Sample rate, in hertz (Hz), of the requested audio output.
  //
  // Models produce output at a fixed rate; this value lets the generated
  // audio be resampled (up or down) when a different rate is needed. For
  // example, 8 kHz can be requested when the audio is destined for a
  // low-bandwidth application.
  //
  // Values below 8 kHz will not produce any meaningful output. Up-sampling
  // too much increases the size of the output without improving its audio
  // quality.
  int32 sample_rate_hz = 4;

  // Voice parameters: the name of the voice to use.
  string voice_name = 5;
}
// Currently experimental API addition that returns the input text after
// preprocessing has been completed, as well as the predicted duration for
// each token.
//
// Note: this message is subject to future breaking changes, and potential
// removal.
message SynthesizeSpeechResponseMetadata {
  // NOTE(review): presumably the original input text, before preprocessing --
  // confirm against the service implementation.
  string text = 1;

  // The input text after preprocessing has been completed.
  string processed_text = 2;

  // Predicted duration for each token.
  // NOTE(review): the unit of these values is not stated in this file --
  // confirm before relying on it.
  repeated float predicted_durations = 8;
}
// Response message returned by RivaSpeechSynthesis.Synthesize, and streamed
// by RivaSpeechSynthesis.SynthesizeOnline.
message SynthesizeSpeechResponse {
  // Synthesized audio bytes.
  bytes audio = 1;

  // Metadata accompanying the audio; see SynthesizeSpeechResponseMetadata.
  SynthesizeSpeechResponseMetadata meta = 2;
}
/*
*
*/