mirror of
https://github.com/jambonz/speech-utils.git
synced 2025-12-19 03:37:49 +00:00
feat: add nuance, riva, ibm
This commit is contained in:
36
protos/riva/proto/riva_audio.proto
Normal file
36
protos/riva/proto/riva_audio.proto
Normal file
@@ -0,0 +1,36 @@
// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT

syntax = "proto3";

package nvidia.riva;

option cc_enable_arenas = true;
option go_package = "nvidia.com/riva_speech";
/*
 * AudioEncoding specifies the encoding of the audio bytes in the
 * encapsulating message.
 *
 * Values and field numbers are part of the published Riva wire contract;
 * do not rename or renumber them.
 */
enum AudioEncoding {
  // Not specified.
  ENCODING_UNSPECIFIED = 0;

  // Uncompressed 16-bit signed little-endian samples (Linear PCM).
  LINEAR_PCM = 1;

  // `FLAC` (Free Lossless Audio Codec) is the recommended encoding because
  // it is lossless--therefore recognition is not compromised--and requires
  // only about half the bandwidth of uncompressed PCM (`LINEAR_PCM`).
  // `FLAC` stream encoding supports 16-bit and 24-bit samples, however,
  // not all fields in `STREAMINFO` are supported.
  FLAC = 2;

  // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  MULAW = 3;

  // Opus-encoded audio wrapped in an Ogg container (Ogg Opus).
  OGGOPUS = 4;

  // 8-bit samples that compand 13-bit audio samples using G.711 PCMA/A-law.
  ALAW = 20;
}
77
protos/riva/proto/riva_tts.proto
Normal file
77
protos/riva/proto/riva_tts.proto
Normal file
@@ -0,0 +1,77 @@
// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT

syntax = "proto3";

package nvidia.riva.tts;

import "riva/proto/riva_audio.proto";

option cc_enable_arenas = true;
option go_package = "nvidia.com/riva_speech";
// Riva text-to-speech synthesis service.
service RivaSpeechSynthesis {
  // Used to request text-to-speech from the service. Submit a request
  // containing the desired text and configuration, and receive audio bytes
  // in the requested format.
  rpc Synthesize(SynthesizeSpeechRequest) returns (SynthesizeSpeechResponse) {}

  // Used to request text-to-speech returned via stream as it becomes
  // available. Submit a SynthesizeSpeechRequest with desired text and
  // configuration, and receive a stream of bytes in the requested format.
  rpc SynthesizeOnline(SynthesizeSpeechRequest)
      returns (stream SynthesizeSpeechResponse) {}

  // Enables clients to request the configuration of the current Synthesize
  // service, or a specific model within the service.
  rpc GetRivaSynthesisConfig(RivaSynthesisConfigRequest)
      returns (RivaSynthesisConfigResponse) {}
}
// Request for the configuration of the synthesis service.
message RivaSynthesisConfigRequest {
  // If model is specified, only return the config for that model;
  // otherwise return all configs.
  string model_name = 1;
}
// Describes the configuration of the models served by the synthesis service.
message RivaSynthesisConfigResponse {
  // Configuration of a single served model.
  message Config {
    // Name of the model this configuration applies to.
    string model_name = 1;

    // Free-form key/value configuration parameters reported for the model.
    map<string, string> parameters = 2;
  }

  // One entry per model matched by the request.
  repeated Config model_config = 1;
}
// Request to synthesize speech from text.
message SynthesizeSpeechRequest {
  // The text to be converted to speech.
  string text = 1;

  // Language code of the input text (format not shown here; presumably a
  // BCP-47-style tag such as "en-US" — verify against server docs).
  string language_code = 2;

  // Audio encoding params: desired encoding of the output audio
  // (see AudioEncoding in riva_audio.proto).
  AudioEncoding encoding = 3;

  // The sample rate in hertz (Hz) of the audio output requested through
  // `SynthesizeSpeechRequest` messages. Models produce an output at a fixed
  // rate. The sample rate enables you to resample the generated audio output
  // if required. You use the sample rate to up-sample or down-sample the
  // audio for various scenarios. For example, the sample rate can be set to
  // 8kHz (kilohertz) if the output audio is desired for a low bandwidth
  // application. The sample rate values below 8kHz will not produce any
  // meaningful output. Also, up-sampling too much will increase the size of
  // the output without improving the output audio quality.
  int32 sample_rate_hz = 4;

  // Voice params: name of the voice to use for synthesis.
  string voice_name = 5;
}
||||
// Currently experimental API addition that returns the input text after
// preprocessing has been completed as well as the predicted duration for
// each token.
// Note: this message is subject to future breaking changes, and potential
// removal.
message SynthesizeSpeechResponseMetadata {
  // The original input text.
  string text = 1;

  // The input text after preprocessing has been completed.
  string processed_text = 2;

  // Predicted duration for each token.
  // Field number 8 is preserved as published; do not renumber.
  repeated float predicted_durations = 8;
}
// Synthesized speech returned for a SynthesizeSpeechRequest.
message SynthesizeSpeechResponse {
  // Audio bytes in the encoding requested via SynthesizeSpeechRequest.
  bytes audio = 1;

  // Experimental synthesis metadata; see SynthesizeSpeechResponseMetadata.
  SynthesizeSpeechResponseMetadata meta = 2;
}
||||
/*
|
||||
*
|
||||
*/
|
||||
Reference in New Issue
Block a user