Fix/azure white noise (#63)

* Azure sometimes returns audio chunks with an odd number of bytes, which caused the Azure TTS module to generate white noise * wip Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com> --------- Signed-off-by: Hoan HL <quan.luuhoang8@gmail.com>
2025-12-19 08:27:44 +00:00 · 2024-05-14 17:55:44 +07:00
parent b019a634bd
commit 19f20bf0e7
3 changed files with 41 additions and 5 deletions
--- a/mod_azure_tts/azure_glue.cpp
+++ b/mod_azure_tts/azure_glue.cpp
@@ -179,23 +179,55 @@ extern "C" {
        if (a->flushed) return;
        bool fireEvent = false;
        CircularBuffer_t *cBuffer = (CircularBuffer_t *) a->circularBuffer;
+         size_t total_bytes_to_process;

        auto audioData = e.Result->GetAudioData();
+        auto bytes_received = audioData->size();
+        // Buffer to hold combined data if there is unprocessed byte from the last call.
+        std::unique_ptr<uint8_t[]> combinedData;
+        if (a->has_last_byte) {
+          a->has_last_byte = false;  // We'll handle the last_byte now, so toggle the flag off
+
+          // Allocate memory for the new data array
+          combinedData.reset(new uint8_t[bytes_received + 1]);
+
+          // Prepend the last byte from previous call
+          combinedData[0] = a->last_byte;
+
+          // Copy the new data following the prepended byte
+          memcpy(combinedData.get() + 1, audioData->data(), bytes_received);
+
+          total_bytes_to_process = bytes_received + 1;
+        } else {
+          // Allocate memory for the new data array
+          combinedData.reset(new uint8_t[bytes_received]);
+          memcpy(combinedData.get(), audioData->data(), bytes_received);
+          total_bytes_to_process = bytes_received;
+        }
+
+        // If we now have an odd total, save the last byte for next time
+        auto data = combinedData.get();
+        if ((total_bytes_to_process % sizeof(int16_t)) != 0) {
+          a->last_byte = data[total_bytes_to_process - 1];
+          a->has_last_byte = true;
+          total_bytes_to_process--;
+        }
+  
        if (a->file) {
-          fwrite(audioData->data(), 1, audioData->size(), a->file);
+          fwrite(data, 1, total_bytes_to_process, a->file);
        }

        /**
         * this sort of reinterpretation can be dangerous as a general rule, but in this case we know that the data
         * is 16-bit PCM, so it's safe to do this and its much faster than copying the data byte by byte
         */
-        const uint16_t* begin = reinterpret_cast<const uint16_t*>(audioData->data());
-        const uint16_t* end = reinterpret_cast<const uint16_t*>(audioData->data() + audioData->size());
+        const uint16_t* begin = reinterpret_cast<const uint16_t*>(data);
+        const uint16_t* end = reinterpret_cast<const uint16_t*>(data + total_bytes_to_process);

        /* lock as briefly as possible */
        switch_mutex_lock(a->mutex);
-        if (cBuffer->capacity() - cBuffer->size() < audioData->size()) {
-          cBuffer->set_capacity(cBuffer->size() + std::max( audioData->size(), (size_t)BUFFER_SIZE));
+        if (cBuffer->capacity() - cBuffer->size() < total_bytes_to_process) {
+          cBuffer->set_capacity(cBuffer->size() + std::max( total_bytes_to_process, (size_t)BUFFER_SIZE));
        }
        cBuffer->insert(cBuffer->end(), begin, end);
        switch_mutex_unlock(a->mutex);
--- a/mod_azure_tts/mod_azure_tts.c
+++ b/mod_azure_tts/mod_azure_tts.c
@@ -84,6 +84,7 @@ static switch_status_t a_speech_feed_tts(switch_speech_handle_t *sh, char *text,
  a->flushed = 0;
  a->response_code = 0;
  a->err_msg = NULL;
+  a->has_last_byte = 0;

  return azure_speech_feed_tts(a, text, flags);
 }
--- a/mod_azure_tts/mod_azure_tts.h
+++ b/mod_azure_tts/mod_azure_tts.h
@@ -32,6 +32,9 @@ typedef struct azure_data {
  SpeexResamplerState *resampler;
  void *circularBuffer;
  switch_mutex_t *mutex;
+
+  int has_last_byte;
+  uint8_t last_byte;
 } azure_t;

 #endif