feat: add nuance, riva, ibm

This commit is contained in:
Quan HL
2023-02-21 08:49:15 +07:00
parent deee11b3f3
commit 1fed5aefab
23 changed files with 6779 additions and 0 deletions

View File

@@ -0,0 +1,36 @@
// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT
// This file uses the proto3 language syntax.
syntax = "proto3";
// All definitions in this file live in the nvidia.riva package.
package nvidia.riva;
// Enable arena allocation for the generated C++ code.
option cc_enable_arenas = true;
// Go import path of the package generated from this file.
option go_package = "nvidia.com/riva_speech";
// AudioEncoding identifies how the audio bytes carried by the enclosing
// message are encoded.
enum AudioEncoding {
  // No encoding was specified.
  ENCODING_UNSPECIFIED = 0;

  // Linear PCM: uncompressed 16-bit signed little-endian samples.
  LINEAR_PCM = 1;

  // `FLAC` (Free Lossless Audio Codec). Recommended because it is lossless,
  // so recognition is not compromised, while needing only about half the
  // bandwidth of `LINEAR_PCM`. `FLAC` stream encoding supports 16-bit and
  // 24-bit samples; however, not all fields in `STREAMINFO` are supported.
  FLAC = 2;

  // G.711 PCMU/mu-law: 8-bit samples that compand 14-bit audio samples.
  MULAW = 3;

  // NOTE(review): this value has no upstream description; presumably Opus
  // audio in an Ogg container -- confirm.
  OGGOPUS = 4;

  // G.711 PCMA/A-law: 8-bit samples that compand 13-bit audio samples.
  ALAW = 20;
}

View File

@@ -0,0 +1,77 @@
// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT
// This file uses the proto3 language syntax.
syntax = "proto3";
// All definitions in this file live in the nvidia.riva.tts package.
package nvidia.riva.tts;
// Enable arena allocation for the generated C++ code.
option cc_enable_arenas = true;
// Go import path of the package generated from this file.
option go_package = "nvidia.com/riva_speech";
// NOTE(review): presumably supplies the AudioEncoding enum referenced by
// SynthesizeSpeechRequest -- confirm against riva_audio.proto.
import "riva/proto/riva_audio.proto";
// RivaSpeechSynthesis groups the text-to-speech RPCs: one-shot synthesis,
// streamed synthesis, and a query for the service/model configuration.
service RivaSpeechSynthesis {
  // One-shot text-to-speech. Send a request holding the desired text and
  // configuration; the response carries the audio bytes in the requested
  // format.
  rpc Synthesize(SynthesizeSpeechRequest) returns (SynthesizeSpeechResponse);

  // Streamed text-to-speech. Send a SynthesizeSpeechRequest with the desired
  // text and configuration; audio bytes in the requested format are streamed
  // back as they become available.
  rpc SynthesizeOnline(SynthesizeSpeechRequest)
      returns (stream SynthesizeSpeechResponse);

  // Lets clients fetch the configuration of the current Synthesize service,
  // or of one specific model within the service.
  rpc GetRivaSynthesisConfig(RivaSynthesisConfigRequest)
      returns (RivaSynthesisConfigResponse);
}
// Request message for the GetRivaSynthesisConfig RPC.
message RivaSynthesisConfigRequest {
  // Optional model filter. When a model is named, only that model's
  // configuration is returned; otherwise all configurations are returned.
  string model_name = 1;
}
// Response message for the GetRivaSynthesisConfig RPC: a list of model
// configurations.
message RivaSynthesisConfigResponse {
  // Configuration of a single model.
  message Config {
    // Name of the model this entry belongs to.
    string model_name = 1;

    // Configuration parameters as string key/value pairs. Map iteration order
    // is undefined, so callers must not depend on it.
    map<string, string> parameters = 2;
  }

  // The returned model configurations.
  repeated Config model_config = 1;
}
// Request message shared by the Synthesize and SynthesizeOnline RPCs.
message SynthesizeSpeechRequest {
  // The text to synthesize.
  string text = 1;

  // Language of the request.
  // NOTE(review): the expected format (e.g. a BCP-47 tag such as "en-US") is
  // not stated here -- confirm.
  string language_code = 2;

  // Audio encoding parameters: the encoding of the requested audio output.
  AudioEncoding encoding = 3;

  // Sample rate, in hertz (Hz), of the audio output requested through
  // `SynthesizeSpeechRequest` messages.
  //
  // Models produce output at a fixed rate; this field lets you resample the
  // generated audio (up or down) when required. For example, set it to 8kHz
  // when the audio is destined for a low-bandwidth application.
  //
  // Values below 8kHz will not produce any meaningful output, and excessive
  // up-sampling only increases the output size without improving audio
  // quality.
  int32 sample_rate_hz = 4;

  // Voice parameters: name of the voice to use.
  string voice_name = 5;
}
// Experimental API addition that returns the input text after preprocessing
// has completed, together with the predicted duration for each token.
//
// Note: this message is subject to future breaking changes and potential
// removal.
message SynthesizeSpeechResponseMetadata {
  // NOTE(review): presumably the original input text as submitted -- confirm.
  string text = 1;

  // The input text after preprocessing has been completed.
  string processed_text = 2;

  // Predicted duration for each token. The unit is not stated here.
  // The field number (8) is part of the wire contract; do not renumber it.
  repeated float predicted_durations = 8;
}
// Response message returned by Synthesize and streamed by SynthesizeOnline.
message SynthesizeSpeechResponse {
  // Synthesized audio bytes.
  bytes audio = 1;

  // Metadata accompanying the audio; see SynthesizeSpeechResponseMetadata.
  SynthesizeSpeechResponseMetadata meta = 2;
}
/*
*
*/