// speech-utils/protos/riva/proto/riva_tts.proto

// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT

syntax = "proto3";

package nvidia.riva.tts;

// Enable arena allocation for the C++ code generated from this file.
option cc_enable_arenas = true;
// Go import path of the package that holds the code generated from this file.
option go_package = "nvidia.com/riva_speech";

// NOTE(review): AudioEncoding is used below but not declared in this file;
// presumably it is provided by this import -- confirm.
import "riva/proto/riva_audio.proto";

// Text-to-speech service: synthesizes audio from text, either as a single
// response or as a stream, and reports the configuration of the service.
service RivaSpeechSynthesis {
    // Unary text-to-speech. Send a request carrying the desired text and
    // configuration; the reply carries the audio bytes in the requested format.
    rpc Synthesize(SynthesizeSpeechRequest) returns (SynthesizeSpeechResponse);

    // Server-streaming text-to-speech. Send one SynthesizeSpeechRequest with
    // the desired text and configuration; audio bytes in the requested format
    // are streamed back as they become available.
    rpc SynthesizeOnline(SynthesizeSpeechRequest) returns (stream SynthesizeSpeechResponse);

    // Lets clients query the configuration of the current synthesis service,
    // or of one specific model within the service.
    rpc GetRivaSynthesisConfig(RivaSynthesisConfigRequest) returns (RivaSynthesisConfigResponse);
}

// Request message for GetRivaSynthesisConfig.
message RivaSynthesisConfigRequest {
    // Name of the model whose configuration is wanted. When a model is
    // specified, only the config for that model is returned; otherwise all
    // configs are returned.
    string model_name = 1;
}

// Response message for GetRivaSynthesisConfig: a list of model configurations.
message RivaSynthesisConfigResponse {
    // Configuration of a single model.
    message Config {
        // Name of the model this entry describes.
        string model_name = 1;

        // Configuration parameters as string key/value pairs. The set of keys
        // is not defined in this file. Map iteration order is unspecified.
        map<string, string> parameters = 2;
    }

    // The returned model configurations.
    repeated Config model_config = 1;
}

// Request message for Synthesize and SynthesizeOnline: the text to synthesize
// together with the audio output and voice configuration.
message SynthesizeSpeechRequest {
    // The text to synthesize.
    string text = 1;

    // Language code for the request.
    // NOTE(review): the expected format (e.g. a BCP-47 tag such as "en-US") is
    // not stated in this file -- confirm.
    string language_code = 2;

    // Audio encoding params: the encoding requested for the output audio.
    // AudioEncoding is not declared in this file.
    AudioEncoding encoding = 3;

    // The sample rate in hertz (Hz) of the audio output requested through
    // `SynthesizeSpeechRequest` messages.
    // Models produce an output at a fixed rate. The sample rate enables you to
    // resample the generated audio output if required. You use the sample rate
    // to up-sample or down-sample the audio for various scenarios. For example,
    // the sample rate can be set to 8 kHz (kilohertz) if the output audio is
    // desired for a low-bandwidth application.
    // Sample rate values below 8 kHz will not produce any meaningful output.
    // Also, up-sampling too much will increase the size of the output without
    // improving the output audio quality.
    int32 sample_rate_hz = 4;

    // Voice params: the name of the voice to use.
    string voice_name = 5;
}

// Currently experimental API addition that returns the input text after
// preprocessing has been completed, as well as the predicted duration for
// each token.
// Note: this message is subject to future breaking changes, and potential
// removal.
message SynthesizeSpeechResponseMetadata {
    // NOTE(review): presumably the original (unprocessed) input text -- confirm.
    string text = 1;

    // The input text after preprocessing has been completed.
    string processed_text = 2;

    // The predicted duration for each token.
    // NOTE(review): the unit of the durations is not stated in this file --
    // confirm.
    // Field numbers 3 to 7 are not used by this message as written; this field
    // must keep number 8 because field numbers identify fields on the wire.
    repeated float predicted_durations = 8;
}

// Response message for Synthesize, and the element type of the stream
// returned by SynthesizeOnline.
message SynthesizeSpeechResponse {
    // The synthesized audio bytes, in the requested format.
    bytes audio = 1;

    // Metadata accompanying the audio, carried as a
    // SynthesizeSpeechResponseMetadata message.
    SynthesizeSpeechResponseMetadata meta = 2;
}

/*
 *
 */