From 94361f1d203939190640dbfc0de79878488280f8 Mon Sep 17 00:00:00 2001 From: Hoan Luu Huu <110280845+xquanluu@users.noreply.github.com> Date: Fri, 5 Apr 2024 18:16:31 +0700 Subject: [PATCH] mod_dub support sayOnTrack Deepgram (#35) * mod_dub support sayOnTrack Deepgram Signed-off-by: Hoan HL * mod_dub sayOnTrack support azure Signed-off-by: Hoan HL * wip Signed-off-by: Hoan HL * wip Signed-off-by: Hoan HL * wip Signed-off-by: Hoan HL * wip Signed-off-by: Hoan HL * wip Signed-off-by: Hoan HL * support whisper * wip Signed-off-by: Hoan HL --------- Signed-off-by: Hoan HL --- mod_dub/ap_http.cpp | 5 +- mod_dub/ap_http.h | 3 +- mod_dub/dub_glue.cpp | 6 +- mod_dub/track.cpp | 4 +- mod_dub/track.h | 2 +- mod_dub/tts_vendor_parser.cpp | 190 +++++++++++++++++++++++++++++++++- mod_dub/tts_vendor_parser.h | 2 +- 7 files changed, 201 insertions(+), 11 deletions(-) diff --git a/mod_dub/ap_http.cpp b/mod_dub/ap_http.cpp index ac4952b..21d4ccf 100644 --- a/mod_dub/ap_http.cpp +++ b/mod_dub/ap_http.cpp @@ -142,6 +142,8 @@ void AudioProducerHttp::start(std::function call curl_easy_setopt(_easy, CURLOPT_MAX_RECV_SPEED_LARGE, (curl_off_t)31415); /*Add request body*/ if (!_body.empty()) curl_easy_setopt(_easy, CURLOPT_POSTFIELDS, _body.c_str()); + /*Add request proxy*/ + if (!_proxy.empty()) curl_easy_setopt(_easy, CURLOPT_PROXY, _proxy.c_str()); /*Add request headers*/ struct curl_slist *hdr_list = nullptr; @@ -167,11 +169,12 @@ void AudioProducerHttp::queueHttpPostAudio(const std::string& url, int gain, boo _gain = gain; _loop = loop; } -void AudioProducerHttp::queueHttpPostAudio(const std::string& url, const std::string& body, std::vector& headers, int gain, bool loop) { +void AudioProducerHttp::queueHttpPostAudio(const std::string& url, const std::string& body, std::vector& headers, const std::string& proxy, int gain, bool loop) { _method = HttpMethod_t::HTTP_METHOD_POST; _url = url; _body = body; _headers = headers; + _proxy = proxy; _gain = gain; _loop = loop; } diff --git 
a/mod_dub/ap_http.h b/mod_dub/ap_http.h index 45cde7d..f2260f6 100644 --- a/mod_dub/ap_http.h +++ b/mod_dub/ap_http.h @@ -68,7 +68,7 @@ public: void queueHttpGetAudio(const std::string& url, int gain = 0, bool loop = false); void queueHttpPostAudio(const std::string& url, int gain = 0, bool loop = false); - void queueHttpPostAudio(const std::string& url, const std::string& body, std::vector& headers, int gain = 0, bool loop = false); + void queueHttpPostAudio(const std::string& url, const std::string& body, std::vector& headers, const std::string& proxy, int gain = 0, bool loop = false); Status_t getStatus() const { return _status; } void setStatus(Status_t status) { _status = status; } @@ -121,6 +121,7 @@ private: HttpMethod_t _method; std::string _url; std::string _body; + std::string _proxy; std::vector _headers; Status_t _status; mpg123_handle *_mh; diff --git a/mod_dub/dub_glue.cpp b/mod_dub/dub_glue.cpp index 683ea8e..2156b89 100644 --- a/mod_dub/dub_glue.cpp +++ b/mod_dub/dub_glue.cpp @@ -98,18 +98,18 @@ extern "C" { switch_status_t say_dub_track(struct cap_cb* cb, char* trackName, char* text, int gain) { std::vector headers; - std::string url, body; + std::string url, body, proxy; Track* track = find_track_by_name(cb->tracks, trackName); if (!track) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "play_dub_track: track %s not found\n", trackName); return SWITCH_STATUS_FALSE; } - if (tts_vendor_parse_text(text, url, body, headers) != SWITCH_STATUS_SUCCESS) { + if (tts_vendor_parse_text(text, url, body, headers, proxy) != SWITCH_STATUS_SUCCESS) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "say_dub_track: failed to parse text\n"); return SWITCH_STATUS_FALSE; } - track->queueHttpPostAudio(url, body, headers, gain); + track->queueHttpPostAudio(url, body, headers, proxy, gain); return SWITCH_STATUS_SUCCESS; } diff --git a/mod_dub/track.cpp b/mod_dub/track.cpp index 72ff64d..ddc0e9f 100644 --- a/mod_dub/track.cpp +++ b/mod_dub/track.cpp @@ 
-114,11 +114,11 @@ void Track::queueHttpPostAudio(const std::string& url, int gain, bool loop) { } } -void Track::queueHttpPostAudio(const std::string& url, const std::string& body, std::vector& headers, int gain, bool loop) { +void Track::queueHttpPostAudio(const std::string& url, const std::string& body, std::vector& headers, const std::string& proxy, int gain, bool loop) { bool startIt = false; if (_stopping) return; auto ap = std::make_shared(_mutex, _buffer, _sampleRate); - ap->queueHttpPostAudio(url, body, headers, gain, loop); + ap->queueHttpPostAudio(url, body, headers, proxy, gain, loop); { std::lock_guard lock(_mutex); _apQueue.push(ap); diff --git a/mod_dub/track.h b/mod_dub/track.h index 870b26a..e421ed5 100644 --- a/mod_dub/track.h +++ b/mod_dub/track.h @@ -14,7 +14,7 @@ public: /* audio production methods */ void queueHttpGetAudio(const std::string& url, int gain = 0, bool loop = false); void queueHttpPostAudio(const std::string& url, int gain = 0, bool loop = false); - void queueHttpPostAudio(const std::string& url, const std::string& body, std::vector& headers, int gain = 0, bool loop = false); + void queueHttpPostAudio(const std::string& url, const std::string& body, std::vector& headers, const std::string& proxy, int gain = 0, bool loop = false); void queueFileAudio(const std::string& path, int gain = 0, bool loop = false); void removeAllAudio(); diff --git a/mod_dub/tts_vendor_parser.cpp b/mod_dub/tts_vendor_parser.cpp index 2c315f0..db0be0b 100644 --- a/mod_dub/tts_vendor_parser.cpp +++ b/mod_dub/tts_vendor_parser.cpp @@ -4,6 +4,186 @@ #include #include +switch_status_t whisper_parse_text(const std::map& params, const std::string& text, + std::string& url, std::string& body, std::vector& headers) { + std::string api_key; + std::string voice_name; + std::string model_id; + std::string speed; + + for (const auto& pair : params) { + if (pair.first == "api_key") { + api_key = pair.second; + } else if (pair.first == "voice") { + voice_name = 
pair.second; + } else if (pair.first == "model_id") { + model_id = pair.second; + } else if (pair.first == "speed") { + speed = pair.second; + } + } + + if (api_key.empty()) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "whisper_parse_text: no api_key provided\n"); + return SWITCH_STATUS_FALSE; + } + if (model_id.empty()) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "whisper_parse_text: no model_id provided\n"); + return SWITCH_STATUS_FALSE; + } + + url = "https://api.openai.com/v1/audio/speech"; + + /* create the JSON body */ + cJSON * jResult = cJSON_CreateObject(); + cJSON_AddStringToObject(jResult, "model", model_id.c_str()); + cJSON_AddStringToObject(jResult, "input", text.c_str()); + cJSON_AddStringToObject(jResult, "voice", voice_name.c_str()); + cJSON_AddStringToObject(jResult, "response_format", "mp3"); + if (!speed.empty()) { + cJSON_AddStringToObject(jResult, "speed", speed.c_str()); + } + char* _body = cJSON_PrintUnformatted(jResult); + body = _body; + + cJSON_Delete(jResult); + free(_body); + + // Create headers + headers.push_back("Authorization: Bearer " + api_key); + headers.push_back("Content-Type: application/json"); + + return SWITCH_STATUS_SUCCESS; +} + +switch_status_t azure_parse_text(const std::map& params, const std::string& text, + std::string& url, std::string& body, std::vector& headers, std::string& proxy) { + + std::string api_key; + std::string voice_name; + std::string language; + std::string region; + std::string endpoint; + std::string endpointId; + std::string http_proxy_ip; + std::string http_proxy_port; + + for (const auto& pair : params) { + if (pair.first == "api_key") { + api_key = pair.second; + } else if (pair.first == "voice") { + voice_name = pair.second; + } else if (pair.first == "language") { + language = pair.second; + } else if (pair.first == "region") { + region = pair.second; + } else if (pair.first == "endpoint") { + endpoint = pair.second; + } else if (pair.first == "endpointId") { + 
endpointId = pair.second; + } else if (pair.first == "http_proxy_ip") { + http_proxy_ip = pair.second; + } else if (pair.first == "http_proxy_port") { + http_proxy_port = pair.second; + } + } + + if (language.empty()) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "azure_parse_text: no language provided\n"); + return SWITCH_STATUS_FALSE; + } + if (voice_name.empty()) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "azure_parse_text: no voice_name provided\n"); + return SWITCH_STATUS_FALSE; + } + + if (region.empty()) { + region = "westus"; + } + /* format url*/ + url = !endpoint.empty() ? endpoint : "https://" + region + ".tts.speech.microsoft.com/cognitiveservices/v1"; + + // Body + if (strncmp(text.c_str(), ""; + body_stream << ""; + body_stream << text; + body_stream << ""; + body_stream << ""; + body = body_stream.str(); + } + + // Create headers + if (!api_key.empty()) { + headers.push_back("Ocp-Apim-Subscription-Key: " + api_key); + } + if (!endpointId.empty()) { + headers.push_back("X-Microsoft-EndpointId: " + endpointId); + } + headers.push_back("Content-Type: application/ssml+xml"); + headers.push_back("X-Microsoft-OutputFormat: audio-16khz-32kbitrate-mono-mp3"); + + // Proxy + std::ostringstream proxy_stream; + if (!http_proxy_ip.empty()) { + proxy_stream << "http://" << http_proxy_ip; + if (!http_proxy_port.empty()) { + proxy_stream << ":" << http_proxy_port; + } + } + proxy = proxy_stream.str(); + + return SWITCH_STATUS_SUCCESS; +} + +switch_status_t deepgram_parse_text(const std::map& params, const std::string& text, + std::string& url, std::string& body, std::vector& headers) { + + std::string api_key; + std::string voice_name; + + for (const auto& pair : params) { + if (pair.first == "api_key") { + api_key = pair.second; + } else if (pair.first == "voice") { + voice_name = pair.second; + } + } + + if (api_key.empty()) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "deepgram_parse_text: no api_key provided\n"); + 
return SWITCH_STATUS_FALSE; + } + if (voice_name.empty()) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "deepgram_parse_text: no voice_name provided\n"); + return SWITCH_STATUS_FALSE; + } + + /* format url*/ + std::ostringstream url_stream; + url_stream << "https://api.deepgram.com/v1/speak?model=" << voice_name << "&encoding=mp3"; + url = url_stream.str(); + + /* create the JSON body */ + cJSON * jResult = cJSON_CreateObject(); + cJSON_AddStringToObject(jResult, "text", text.c_str()); + + char* _body = cJSON_PrintUnformatted(jResult); + body = _body; + + cJSON_Delete(jResult); + free(_body); + + // Create headers + headers.push_back("Authorization: Token " + api_key); + headers.push_back("Content-Type: application/json"); + + return SWITCH_STATUS_SUCCESS; +} + switch_status_t elevenlabs_parse_text(const std::map& params, const std::string& text, std::string& url, std::string& body, std::vector& headers) { @@ -87,7 +267,7 @@ switch_status_t elevenlabs_parse_text(const std::map& return SWITCH_STATUS_SUCCESS; } -switch_status_t tts_vendor_parse_text(const std::string& say, std::string& url, std::string& body, std::vector& headers) { +switch_status_t tts_vendor_parse_text(const std::string& say, std::string& url, std::string& body, std::vector& headers, std::string& proxy) { size_t start = say.find("{") + 1; size_t end = say.find("}"); @@ -111,8 +291,14 @@ switch_status_t tts_vendor_parse_text(const std::string& say, std::string& url, if (params["vendor"] == "elevenlabs") { return elevenlabs_parse_text(params, text, url, body, headers); + } else if (params["vendor"] == "deepgram") { + return deepgram_parse_text(params, text, url, body, headers); + } else if (params["vendor"] == "microsoft") { + return azure_parse_text(params, text, url, body, headers, proxy); + } else if (params["vendor"] == "whisper") { + return whisper_parse_text(params, text, url, body, headers); } else { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, 
"tts_vendor_parse_text: There is no available parser for text\n"); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "tts_vendor_parse_text: There is no available parser for vendor %s\n", params["vendor"].c_str()); return SWITCH_STATUS_FALSE; } } \ No newline at end of file diff --git a/mod_dub/tts_vendor_parser.h b/mod_dub/tts_vendor_parser.h index 07cedef..2985f73 100644 --- a/mod_dub/tts_vendor_parser.h +++ b/mod_dub/tts_vendor_parser.h @@ -7,6 +7,6 @@ #include "common.h" -switch_status_t tts_vendor_parse_text(const std::string& say, std::string& url, std::string& body, std::vector& headers); +switch_status_t tts_vendor_parse_text(const std::string& say, std::string& url, std::string& body, std::vector& headers, std::string& proxy); #endif \ No newline at end of file