mirror of
https://github.com/jambonz/speech-utils.git
synced 2025-12-19 03:37:49 +00:00
feat: add nuance, riva, ibm
This commit is contained in:
36
protos/riva/proto/riva_audio.proto
Normal file
36
protos/riva/proto/riva_audio.proto
Normal file
@@ -0,0 +1,36 @@
// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT

syntax = "proto3";

package nvidia.riva;

option cc_enable_arenas = true;
option go_package = "nvidia.com/riva_speech";
/*
 * AudioEncoding specifies the encoding of the audio bytes in the
 * encapsulating message.
 *
 * Values and field numbers are part of the published Riva wire contract;
 * do not rename or renumber them.
 */
enum AudioEncoding {
  // Not specified.
  ENCODING_UNSPECIFIED = 0;

  // Uncompressed 16-bit signed little-endian samples (Linear PCM).
  LINEAR_PCM = 1;

  // `FLAC` (Free Lossless Audio Codec) is the recommended encoding because
  // it is lossless--therefore recognition is not compromised--and requires
  // only about half the bandwidth of uncompressed PCM (`LINEAR_PCM`).
  // `FLAC` stream encoding supports 16-bit and 24-bit samples, however,
  // not all fields in `STREAMINFO` are supported.
  FLAC = 2;

  // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  MULAW = 3;

  // Opus-encoded audio wrapped in an Ogg container (Ogg Opus).
  OGGOPUS = 4;

  // 8-bit samples that compand 13-bit audio samples using G.711 PCMA/A-law.
  ALAW = 20;
}
77
protos/riva/proto/riva_tts.proto
Normal file
77
protos/riva/proto/riva_tts.proto
Normal file
@@ -0,0 +1,77 @@
// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT

syntax = "proto3";

package nvidia.riva.tts;

import "riva/proto/riva_audio.proto";

option cc_enable_arenas = true;
option go_package = "nvidia.com/riva_speech";
// Riva text-to-speech synthesis service.
service RivaSpeechSynthesis {
  // Used to request text-to-speech from the service. Submit a request
  // containing the desired text and configuration, and receive audio bytes
  // in the requested format.
  rpc Synthesize(SynthesizeSpeechRequest) returns (SynthesizeSpeechResponse) {}

  // Used to request text-to-speech returned via stream as it becomes
  // available. Submit a SynthesizeSpeechRequest with desired text and
  // configuration, and receive a stream of bytes in the requested format.
  rpc SynthesizeOnline(SynthesizeSpeechRequest)
      returns (stream SynthesizeSpeechResponse) {}

  // Enables clients to request the configuration of the current Synthesize
  // service, or a specific model within the service.
  rpc GetRivaSynthesisConfig(RivaSynthesisConfigRequest)
      returns (RivaSynthesisConfigResponse) {}
}
// Request for the configuration of the synthesis service.
message RivaSynthesisConfigRequest {
  // If model is specified, only return the config for that model;
  // otherwise return all configs.
  string model_name = 1;
}
// Describes the configuration of the models served by the synthesis service.
message RivaSynthesisConfigResponse {
  // Configuration of a single served model.
  message Config {
    // Name of the model this configuration applies to.
    string model_name = 1;

    // Free-form key/value configuration parameters reported for the model.
    map<string, string> parameters = 2;
  }

  // One entry per model matched by the request.
  repeated Config model_config = 1;
}
// Request to synthesize speech from text.
message SynthesizeSpeechRequest {
  // The text to be converted to speech.
  string text = 1;

  // Language code of the input text (format not shown here; presumably a
  // BCP-47-style tag such as "en-US" — verify against server docs).
  string language_code = 2;

  // Audio encoding params: desired encoding of the output audio
  // (see AudioEncoding in riva_audio.proto).
  AudioEncoding encoding = 3;

  // The sample rate in hertz (Hz) of the audio output requested through
  // `SynthesizeSpeechRequest` messages. Models produce an output at a fixed
  // rate. The sample rate enables you to resample the generated audio output
  // if required. You use the sample rate to up-sample or down-sample the
  // audio for various scenarios. For example, the sample rate can be set to
  // 8kHz (kilohertz) if the output audio is desired for a low bandwidth
  // application. The sample rate values below 8kHz will not produce any
  // meaningful output. Also, up-sampling too much will increase the size of
  // the output without improving the output audio quality.
  int32 sample_rate_hz = 4;

  // Voice params: name of the voice to use for synthesis.
  string voice_name = 5;
}
||||
// Currently experimental API addition that returns the input text after
// preprocessing has been completed as well as the predicted duration for
// each token.
// Note: this message is subject to future breaking changes, and potential
// removal.
message SynthesizeSpeechResponseMetadata {
  // The original input text.
  string text = 1;

  // The input text after preprocessing has been completed.
  string processed_text = 2;

  // Predicted duration for each token.
  // Field number 8 is preserved as published; do not renumber.
  repeated float predicted_durations = 8;
}
// Synthesized speech returned for a SynthesizeSpeechRequest.
message SynthesizeSpeechResponse {
  // Audio bytes in the encoding requested via SynthesizeSpeechRequest.
  bytes audio = 1;

  // Experimental synthesis metadata; see SynthesizeSpeechResponseMetadata.
  SynthesizeSpeechResponseMetadata meta = 2;
}
||||
/*
|
||||
*
|
||||
*/
|
||||
Reference in New Issue
Block a user