mod_dub support sayOnTrack Deepgram (#35)

* mod_dub support sayOnTrack Deepgram

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

* mod_dub sayOnTrack support azure

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

* wip

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

* wip

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

* wip

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

* wip

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

* wip

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

* support whisper

* wip

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>

---------

Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>
This commit is contained in:
Hoan Luu Huu
2024-04-05 18:16:31 +07:00
committed by GitHub
parent d05cfb8ef0
commit 94361f1d20
7 changed files with 201 additions and 11 deletions

View File

@@ -142,6 +142,8 @@ void AudioProducerHttp::start(std::function<void(bool, const std::string&)> call
curl_easy_setopt(_easy, CURLOPT_MAX_RECV_SPEED_LARGE, (curl_off_t)31415);
/*Add request body*/
if (!_body.empty()) curl_easy_setopt(_easy, CURLOPT_POSTFIELDS, _body.c_str());
/*Add request proxy*/
if (!_proxy.empty()) curl_easy_setopt(_easy, CURLOPT_PROXY, _proxy.c_str());
/*Add request headers*/
struct curl_slist *hdr_list = nullptr;
@@ -167,11 +169,12 @@ void AudioProducerHttp::queueHttpPostAudio(const std::string& url, int gain, boo
_gain = gain;
_loop = loop;
}
void AudioProducerHttp::queueHttpPostAudio(const std::string& url, const std::string& body, std::vector<std::string>& headers, int gain, bool loop) {
/**
 * Record the parameters of an HTTP POST audio request on this producer.
 *
 * Only member state is written here; the transfer itself is not started by
 * this method.
 *
 * @param url     request URL
 * @param body    request body (stored as-is)
 * @param headers request header lines, copied into the producer
 * @param proxy   proxy URL for the request; may be empty
 * @param gain    gain value stored for playback
 * @param loop    loop flag stored for playback
 */
void AudioProducerHttp::queueHttpPostAudio(const std::string& url, const std::string& body, std::vector<std::string>& headers, const std::string& proxy, int gain, bool loop) {
  /* playback settings */
  _loop = loop;
  _gain = gain;

  /* request description */
  _method = HttpMethod_t::HTTP_METHOD_POST;
  _url = url;
  _proxy = proxy;
  _headers = headers;
  _body = body;
}

View File

@@ -68,7 +68,7 @@ public:
void queueHttpGetAudio(const std::string& url, int gain = 0, bool loop = false);
void queueHttpPostAudio(const std::string& url, int gain = 0, bool loop = false);
void queueHttpPostAudio(const std::string& url, const std::string& body, std::vector<std::string>& headers, int gain = 0, bool loop = false);
void queueHttpPostAudio(const std::string& url, const std::string& body, std::vector<std::string>& headers, const std::string& proxy, int gain = 0, bool loop = false);
Status_t getStatus() const { return _status; }
void setStatus(Status_t status) { _status = status; }
@@ -121,6 +121,7 @@ private:
HttpMethod_t _method;
std::string _url;
std::string _body;
std::string _proxy;
std::vector<std::string> _headers;
Status_t _status;
mpg123_handle *_mh;

View File

@@ -98,18 +98,18 @@ extern "C" {
switch_status_t say_dub_track(struct cap_cb* cb, char* trackName, char* text, int gain) {
std::vector<std::string> headers;
std::string url, body;
std::string url, body, proxy;
Track* track = find_track_by_name(cb->tracks, trackName);
if (!track) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "play_dub_track: track %s not found\n", trackName);
return SWITCH_STATUS_FALSE;
}
if (tts_vendor_parse_text(text, url, body, headers) != SWITCH_STATUS_SUCCESS) {
if (tts_vendor_parse_text(text, url, body, headers, proxy) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "say_dub_track: failed to parse text\n");
return SWITCH_STATUS_FALSE;
}
track->queueHttpPostAudio(url, body, headers, gain);
track->queueHttpPostAudio(url, body, headers, proxy, gain);
return SWITCH_STATUS_SUCCESS;
}

View File

@@ -114,11 +114,11 @@ void Track::queueHttpPostAudio(const std::string& url, int gain, bool loop) {
}
}
void Track::queueHttpPostAudio(const std::string& url, const std::string& body, std::vector<std::string>& headers, int gain, bool loop) {
void Track::queueHttpPostAudio(const std::string& url, const std::string& body, std::vector<std::string>& headers, const std::string& proxy, int gain, bool loop) {
bool startIt = false;
if (_stopping) return;
auto ap = std::make_shared<AudioProducerHttp>(_mutex, _buffer, _sampleRate);
ap->queueHttpPostAudio(url, body, headers, gain, loop);
ap->queueHttpPostAudio(url, body, headers, proxy, gain, loop);
{
std::lock_guard<std::mutex> lock(_mutex);
_apQueue.push(ap);

View File

@@ -14,7 +14,7 @@ public:
/* audio production methods */
void queueHttpGetAudio(const std::string& url, int gain = 0, bool loop = false);
void queueHttpPostAudio(const std::string& url, int gain = 0, bool loop = false);
void queueHttpPostAudio(const std::string& url, const std::string& body, std::vector<std::string>& headers, int gain = 0, bool loop = false);
void queueHttpPostAudio(const std::string& url, const std::string& body, std::vector<std::string>& headers, const std::string& proxy, int gain = 0, bool loop = false);
void queueFileAudio(const std::string& path, int gain = 0, bool loop = false);
void removeAllAudio();

View File

@@ -4,6 +4,186 @@
#include <switch_json.h>
#include <map>
/**
 * Build the HTTP request used to synthesize `text` with the "whisper" vendor
 * (OpenAI audio/speech endpoint).
 *
 * Recognized keys in `params`: "api_key" (required), "model_id" (required),
 * "voice" (sent as given, even when empty) and "speed" (optional).
 *
 * @param params  vendor parameters parsed from the say string
 * @param text    text to synthesize
 * @param url     [out] request URL
 * @param body    [out] JSON request body
 * @param headers [out] request headers are appended to this vector
 * @return SWITCH_STATUS_SUCCESS on success; SWITCH_STATUS_FALSE when a
 *         required parameter is missing or the JSON body cannot be built.
 */
switch_status_t whisper_parse_text(const std::map<std::string, std::string>& params, const std::string& text,
std::string& url, std::string& body, std::vector<std::string>& headers) {
  std::string api_key;
  std::string voice_name;
  std::string model_id;
  std::string speed;

  for (const auto& pair : params) {
    if (pair.first == "api_key") {
      api_key = pair.second;
    } else if (pair.first == "voice") {
      voice_name = pair.second;
    } else if (pair.first == "model_id") {
      model_id = pair.second;
    } else if (pair.first == "speed") {
      speed = pair.second;
    }
  }

  if (api_key.empty()) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "whisper_parse_text: no api_key provided\n");
    return SWITCH_STATUS_FALSE;
  }
  if (model_id.empty()) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "whisper_parse_text: no model_id provided\n");
    return SWITCH_STATUS_FALSE;
  }

  url = "https://api.openai.com/v1/audio/speech";

  /* create the JSON body */
  cJSON * jResult = cJSON_CreateObject();
  if (!jResult) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "whisper_parse_text: failed to allocate JSON body\n");
    return SWITCH_STATUS_FALSE;
  }
  cJSON_AddStringToObject(jResult, "model", model_id.c_str());
  cJSON_AddStringToObject(jResult, "input", text.c_str());
  cJSON_AddStringToObject(jResult, "voice", voice_name.c_str());
  cJSON_AddStringToObject(jResult, "response_format", "mp3");
  if (!speed.empty()) {
    /* The endpoint documents "speed" as a JSON number, so convert the
       parameter instead of sending it as a quoted string. */
    char* parse_end = nullptr;
    const double speed_value = strtod(speed.c_str(), &parse_end);
    if (parse_end != speed.c_str() && *parse_end == '\0') {
      cJSON_AddNumberToObject(jResult, "speed", speed_value);
    } else {
      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "whisper_parse_text: ignoring non-numeric speed %s\n", speed.c_str());
    }
  }

  char* _body = cJSON_PrintUnformatted(jResult);
  cJSON_Delete(jResult);
  /* Serialization can fail and return NULL; constructing a std::string from
     a null pointer is undefined behaviour, so bail out explicitly. */
  if (!_body) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "whisper_parse_text: failed to serialize JSON body\n");
    return SWITCH_STATUS_FALSE;
  }
  body = _body;
  free(_body);

  // Create headers
  headers.push_back("Authorization: Bearer " + api_key);
  headers.push_back("Content-Type: application/json");

  return SWITCH_STATUS_SUCCESS;
}
/**
 * Build the HTTP request used to synthesize `text` with the "microsoft"
 * vendor (Azure text-to-speech REST endpoint).
 *
 * Recognized keys in `params`: "api_key", "voice", "language", "region",
 * "endpoint", "endpointId", "http_proxy_ip" and "http_proxy_port".
 * "language" and "voice" are required; "region" falls back to "westus".
 *
 * @param params  vendor parameters parsed from the say string
 * @param text    plain text, or a complete SSML document starting with "<speak"
 * @param url     [out] custom endpoint if given, otherwise the regional URL
 * @param body    [out] SSML request body
 * @param headers [out] request headers are appended to this vector
 * @param proxy   [out] "http://ip[:port]" when http_proxy_ip is set, else empty
 * @return SWITCH_STATUS_SUCCESS, or SWITCH_STATUS_FALSE when language or
 *         voice is missing.
 */
switch_status_t azure_parse_text(const std::map<std::string, std::string>& params, const std::string& text,
std::string& url, std::string& body, std::vector<std::string>& headers, std::string& proxy) {
  /* Fetch a parameter by key; absent keys read as an empty string. */
  const auto param = [&params](const char* key) -> std::string {
    const auto it = params.find(key);
    return it == params.end() ? std::string() : it->second;
  };

  const std::string api_key = param("api_key");
  const std::string voice_name = param("voice");
  const std::string language = param("language");
  std::string region = param("region");
  const std::string endpoint = param("endpoint");
  const std::string endpointId = param("endpointId");
  const std::string http_proxy_ip = param("http_proxy_ip");
  const std::string http_proxy_port = param("http_proxy_port");

  if (language.empty()) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "azure_parse_text: no language provided\n");
    return SWITCH_STATUS_FALSE;
  }
  if (voice_name.empty()) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "azure_parse_text: no voice_name provided\n");
    return SWITCH_STATUS_FALSE;
  }
  if (region.empty()) {
    region = "westus";
  }

  /* A caller-supplied endpoint wins over the regional default. */
  if (endpoint.empty()) {
    url = "https://" + region + ".tts.speech.microsoft.com/cognitiveservices/v1";
  } else {
    url = endpoint;
  }

  /* Text that already is an SSML document is sent untouched; anything else
     is wrapped in a <speak>/<voice> envelope. */
  if (text.compare(0, 6, "<speak") == 0) {
    body = text;
  } else {
    std::string ssml = "<speak version=\"1.0\" xmlns=\"http://www.w3.org/2001/10/synthesis\" xmlns:mstts=\"https://www.w3.org/2001/mstts\" xml:lang=\"" + language + "\">";
    ssml += "<voice name=\"" + voice_name + "\">";
    ssml += text;
    ssml += "</voice>";
    ssml += "</speak>";
    body = ssml;
  }

  /* Optional headers first, then the fixed content/output format headers. */
  if (!api_key.empty()) {
    headers.push_back("Ocp-Apim-Subscription-Key: " + api_key);
  }
  if (!endpointId.empty()) {
    headers.push_back("X-Microsoft-EndpointId: " + endpointId);
  }
  headers.push_back("Content-Type: application/ssml+xml");
  headers.push_back("X-Microsoft-OutputFormat: audio-16khz-32kbitrate-mono-mp3");

  /* Proxy URL; stays empty when no proxy ip was supplied. */
  std::string proxy_url;
  if (!http_proxy_ip.empty()) {
    proxy_url = "http://" + http_proxy_ip;
    if (!http_proxy_port.empty()) {
      proxy_url += ":" + http_proxy_port;
    }
  }
  proxy = proxy_url;

  return SWITCH_STATUS_SUCCESS;
}
/**
 * Build the HTTP request used to synthesize `text` with the "deepgram" vendor.
 *
 * Recognized keys in `params`: "api_key" and "voice", both required. The
 * voice is placed in the URL as the `model` query parameter exactly as given.
 *
 * @param params  vendor parameters parsed from the say string
 * @param text    text to synthesize
 * @param url     [out] request URL including model and encoding query parameters
 * @param body    [out] JSON request body of the form {"text": ...}
 * @param headers [out] request headers are appended to this vector
 * @return SWITCH_STATUS_SUCCESS on success; SWITCH_STATUS_FALSE when a
 *         required parameter is missing or the JSON body cannot be built.
 */
switch_status_t deepgram_parse_text(const std::map<std::string, std::string>& params, const std::string& text,
std::string& url, std::string& body, std::vector<std::string>& headers) {
  std::string api_key;
  std::string voice_name;

  for (const auto& pair : params) {
    if (pair.first == "api_key") {
      api_key = pair.second;
    } else if (pair.first == "voice") {
      voice_name = pair.second;
    }
  }

  if (api_key.empty()) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "deepgram_parse_text: no api_key provided\n");
    return SWITCH_STATUS_FALSE;
  }
  if (voice_name.empty()) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "deepgram_parse_text: no voice_name provided\n");
    return SWITCH_STATUS_FALSE;
  }

  /* format url*/
  std::ostringstream url_stream;
  url_stream << "https://api.deepgram.com/v1/speak?model=" << voice_name << "&encoding=mp3";
  url = url_stream.str();

  /* create the JSON body */
  cJSON * jResult = cJSON_CreateObject();
  if (!jResult) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "deepgram_parse_text: failed to allocate JSON body\n");
    return SWITCH_STATUS_FALSE;
  }
  cJSON_AddStringToObject(jResult, "text", text.c_str());

  char* _body = cJSON_PrintUnformatted(jResult);
  cJSON_Delete(jResult);
  /* Serialization can fail and return NULL; constructing a std::string from
     a null pointer is undefined behaviour, so bail out explicitly. */
  if (!_body) {
    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "deepgram_parse_text: failed to serialize JSON body\n");
    return SWITCH_STATUS_FALSE;
  }
  body = _body;
  free(_body);

  // Create headers
  headers.push_back("Authorization: Token " + api_key);
  headers.push_back("Content-Type: application/json");

  return SWITCH_STATUS_SUCCESS;
}
switch_status_t elevenlabs_parse_text(const std::map<std::string, std::string>& params, const std::string& text,
std::string& url, std::string& body, std::vector<std::string>& headers) {
@@ -87,7 +267,7 @@ switch_status_t elevenlabs_parse_text(const std::map<std::string, std::string>&
return SWITCH_STATUS_SUCCESS;
}
switch_status_t tts_vendor_parse_text(const std::string& say, std::string& url, std::string& body, std::vector<std::string>& headers) {
switch_status_t tts_vendor_parse_text(const std::string& say, std::string& url, std::string& body, std::vector<std::string>& headers, std::string& proxy) {
size_t start = say.find("{") + 1;
size_t end = say.find("}");
@@ -111,8 +291,14 @@ switch_status_t tts_vendor_parse_text(const std::string& say, std::string& url,
if (params["vendor"] == "elevenlabs") {
return elevenlabs_parse_text(params, text, url, body, headers);
} else if (params["vendor"] == "deepgram") {
return deepgram_parse_text(params, text, url, body, headers);
} else if (params["vendor"] == "microsoft") {
return azure_parse_text(params, text, url, body, headers, proxy);
} else if (params["vendor"] == "whisper") {
return whisper_parse_text(params, text, url, body, headers);
} else {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "tts_vendor_parse_text: There is no available parser for text\n");
/* %s needs a C string: a std::string object must not be passed through printf-style varargs. */
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "tts_vendor_parse_text: There is no available parser for vendor %s\n", params["vendor"].c_str());
return SWITCH_STATUS_FALSE;
}
}

View File

@@ -7,6 +7,6 @@
#include "common.h"
switch_status_t tts_vendor_parse_text(const std::string& say, std::string& url, std::string& body, std::vector<std::string>& headers);
switch_status_t tts_vendor_parse_text(const std::string& say, std::string& url, std::string& body, std::vector<std::string>& headers, std::string& proxy);
#endif