From 270bcaef4207ec1fcec885ac4adf92faffd359c7 Mon Sep 17 00:00:00 2001
From: Neko Life <nekolife123579@gmail.com>
Date: Thu, 7 Sep 2023 21:27:51 +0700
Subject: [PATCH] fix: fixed discord_voice_client::send_audio_raw blocking
 thread when provided with invalid size (#845)

---
 include/dpp/discordvoiceclient.h | 27 ++++++++++++++++++---------
 src/dpp/discordvoiceclient.cpp   | 32 +++++++++++++++++++++++---------
 2 files changed, 41 insertions(+), 18 deletions(-)
diff --git a/include/dpp/discordvoiceclient.h b/include/dpp/discordvoiceclient.h
index eba704e337..39c5036356 100644
--- a/include/dpp/discordvoiceclient.h
+++ b/include/dpp/discordvoiceclient.h
@@ -50,14 +50,19 @@
 #include <functional>
 #include <chrono>
 
-
-
 struct OpusDecoder;
 struct OpusEncoder;
 struct OpusRepacketizer;
 
 namespace dpp {
 
+// !TODO: change these to constexpr and rename every occurrence across the codebase
+#define AUDIO_TRACK_MARKER (uint16_t)0xFFFF
+
+#define AUDIO_OVERLAP_SLEEP_SAMPLES 30
+
+inline constexpr size_t send_audio_raw_max_length = 11520;
+
 using json = nlohmann::json;
 
 /*
@@ -95,10 +100,6 @@ struct DPP_EXPORT voice_out_packet {
 	uint64_t duration;
 };
 
-#define AUDIO_TRACK_MARKER (uint16_t)0xFFFF
-
-#define AUDIO_OVERLAP_SLEEP_SAMPLES 30
-
 /** @brief Implements a discord voice connection.
  * Each discord_voice_client connects to one voice channel and derives from a websocket client.
  */
@@ -676,7 +677,7 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	/**
 	 * @brief Send raw audio to the voice channel.
 	 * 
-	 * You should send an audio packet of 11520 bytes.
+	 * You should send an audio packet of `send_audio_raw_max_length` (11520) bytes.
 	 * Note that this function can be costly as it has to opus encode
 	 * the PCM audio on the fly, and also encrypt it with libsodium.
 	 * 
@@ -695,8 +696,16 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	 * 
 	 * @param length The length of the audio data. The length should
 	 * be a multiple of 4 (2x 16 bit stereo channels) with a maximum
-	 * length of 11520, which is a complete opus frame at highest
-	 * quality.
+	 * length of `send_audio_raw_max_length`, which is a complete opus
+	 * frame at highest quality.
+	 *
+	 * Generally, when you are streaming and know that more packets
+	 * will follow, you should always provide packet data with a
+	 * length of exactly `send_audio_raw_max_length`.
+	 * If the length is less than `send_audio_raw_max_length`, the
+	 * packet is padded with silence, as Discord expects packets of
+	 * this specific size. This can cause gaps in your stream, resulting
+	 * in distorted audio, if you have more packets to send later on.
 	 * 
 	 * @return discord_voice_client& Reference to self
 	 * 
diff --git a/src/dpp/discordvoiceclient.cpp b/src/dpp/discordvoiceclient.cpp
index 4f09992f72..efe729fe82 100644
--- a/src/dpp/discordvoiceclient.cpp
+++ b/src/dpp/discordvoiceclient.cpp
@@ -1166,20 +1166,34 @@ discord_voice_client& discord_voice_client::set_send_audio_type(send_audio_type_
 
 discord_voice_client& discord_voice_client::send_audio_raw(uint16_t* audio_data, const size_t length)  {
 #if HAVE_VOICE
-	const size_t max_frame_bytes = 11520;
-	if (length > max_frame_bytes) {
+	if (length < 4) {
+		throw dpp::voice_exception("Raw audio packet size can't be less than 4");
+	}
+
+	if ((length % 4) != 0) {
+		throw dpp::voice_exception("Raw audio packet size should be divisible by 4");
+	}
+
+	if (length > send_audio_raw_max_length) {
 		std::string s_audio_data((const char*)audio_data, length);
-		while (s_audio_data.length() > max_frame_bytes) {
-			std::string packet(s_audio_data.substr(0, max_frame_bytes));
-			s_audio_data.erase(s_audio_data.begin(), s_audio_data.begin() + max_frame_bytes);
-			if (packet.size() < max_frame_bytes) {
-				packet.resize(max_frame_bytes, 0);
-			}
-			send_audio_raw((uint16_t*)packet.data(), max_frame_bytes);
+
+		while (s_audio_data.length() > send_audio_raw_max_length) {
+			std::string packet(s_audio_data.substr(0, send_audio_raw_max_length));
+			const auto packet_size = static_cast<ptrdiff_t>(packet.size());
+
+			s_audio_data.erase(s_audio_data.begin(), s_audio_data.begin() + packet_size);
+
+			send_audio_raw((uint16_t*)packet.data(), packet_size);
 		}
 
 		return *this;
+	}
+
+	if (length < send_audio_raw_max_length) {
+		std::string packet((const char*)audio_data, length);
+		packet.resize(send_audio_raw_max_length, 0);
 
+		return send_audio_raw((uint16_t*)packet.data(), packet.size());
 	}
 
 	opus_int32 encodedAudioMaxLength = (opus_int32)length;