From 2488e0e3f775e495ee927507e8faffe441f72bab Mon Sep 17 00:00:00 2001 From: Gary Miguel Date: Sat, 6 Jan 2024 09:58:30 -0800 Subject: [PATCH] support decompressing fixed huffman blocks Change-Id: I5a30394b46e113595336e89c954e03d3acb120fe --- huffman/src/bit_span.hpp | 24 +++++- huffman/src/decode.hpp | 48 ++++++++--- huffman/test/bit_span_test.cpp | 12 +++ src/decompress.cpp | 144 ++++++++++++++++++++++++++++++--- src/decompress.hpp | 27 +++++++ src/test/BUILD.bazel | 1 + src/test/decompress_test.cpp | 26 +++++- 7 files changed, 258 insertions(+), 24 deletions(-) diff --git a/huffman/src/bit_span.hpp b/huffman/src/bit_span.hpp index 22b4383..1d1b1f2 100644 --- a/huffman/src/bit_span.hpp +++ b/huffman/src/bit_span.hpp @@ -17,11 +17,12 @@ namespace starflate::huffman { /// A non-owning span of bits. Allows for iteration over the individual bits. class bit_span : public std::ranges::view_interface { + +public: + // TODO: make private const std::byte* data_{nullptr}; std::size_t bit_size_{}; std::uint8_t bit_offset_{}; // always less than CHAR_BIT - -public: /// An iterator over the bits in a bit_span. class iterator : public detail::iterator_interface { @@ -142,6 +143,25 @@ class bit_span : public std::ranges::view_interface constexpr auto pop_16() -> std::uint16_t { return pop(); } + /// Removes the first n bits of this and returns them, first bit as the LSB. + /// + /// @pre n <= 16 and this contains at least n bits. + /// + constexpr auto pop_n(std::uint8_t n) -> std::uint16_t + { + assert(n <= 16); + assert(n <= bit_size_); + auto iter = begin(); + std::uint16_t res{}; + for (std::uint8_t i{}; i < n; i++) { + res |= static_cast( + static_cast(static_cast(*iter)) << i); + iter += 1; + } + consume(n); // invalidates iter, so must come after the loop + return res; + } + /// Consumes the given number of bits. Advances the start of the view. 
/// /// @pre n <= std::ranges::size(*this) diff --git a/huffman/src/decode.hpp b/huffman/src/decode.hpp index f9178bf..d19989c 100644 --- a/huffman/src/decode.hpp +++ b/huffman/src/decode.hpp @@ -21,30 +21,58 @@ namespace starflate::huffman { /// @tparam Symbol The type of the symbols in the code table. /// @tparam Extent The extent of the code table. /// @tparam O The type of the output iterator. -template < - symbol Symbol, - std::size_t Extent = std::dynamic_extent, - std::output_iterator O> +template O> constexpr auto decode(const table& code_table, bit_span bits, O output) -> O { + while (!bits.empty()) { + auto result = decode_one(code_table, bits); + if (result.encoded_size == 0) { + break; + } + *output = result.symbol; + output++; + bits.consume(result.encoded_size); + } + return output; +} + +template +struct decode_result +{ + Symbol symbol; + std::uint8_t encoded_size; +}; + +/// Decodes a single symbol from the front of \p bits using \p code_table. +/// +/// @param code_table The code table to use for decoding. +/// @param bits The bit stream to decode. +/// +/// @returns The decoded symbol and its code's bit length (0 if none matched). +/// @tparam Symbol The type of the symbols in the code table. +/// @tparam Extent The extent of the code table. 
+template +constexpr auto +decode_one(const table& code_table, bit_span bits) + -> decode_result +{ + std::uint8_t bits_read{}; code current_code{}; auto code_table_pos = code_table.begin(); for (auto bit : bits) { current_code << bit; + bits_read++; auto found = code_table.find(current_code, code_table_pos); if (found) { - *output = (*found)->symbol; - output++; - code_table_pos = code_table.begin(); - current_code = code{}; - continue; + return {(*found)->symbol, bits_read}; } if (found.error() == code_table.end()) { break; } code_table_pos = found.error(); } - return output; + return {Symbol{}, 0}; } + } // namespace starflate::huffman diff --git a/huffman/test/bit_span_test.cpp b/huffman/test/bit_span_test.cpp index 1ef954a..0bebc3f 100644 --- a/huffman/test/bit_span_test.cpp +++ b/huffman/test/bit_span_test.cpp @@ -116,6 +116,9 @@ auto main() -> int if (std::cmp_less(n, initial_bits.size())) { expect(nth_bit(n) == bits[0]); } + if (n == 0) { + expect(initial_bits.byte_data() == bits.byte_data()); + } } else { expect(aborts([&] { bits.consume(n); })); } @@ -172,6 +175,15 @@ auto main() -> int expect(eq(got_8, expected_8)); expect(aborts([&] { span.pop_8(); })); + + span = huffman::bit_span{data}; + const std::uint16_t got_5{span.pop_n(5)}; + constexpr std::uint16_t expected_5{0b01010}; + expect(eq(got_5, expected_5)); + + const std::uint16_t got_3{span.pop_n(3)}; + constexpr std::uint16_t expected_3{0b101}; + expect(eq(got_3, expected_3)); // NOLINTEND(readability-magic-numbers) }; } diff --git a/src/decompress.cpp b/src/decompress.cpp index b99a22a..6a07436 100644 --- a/src/decompress.cpp +++ b/src/decompress.cpp @@ -1,6 +1,7 @@ #include "decompress.hpp" #include +#include #include #include @@ -31,6 +32,125 @@ auto read_header(huffman::bit_span& compressed_bits) return BlockHeader{final, type}; } +// RFC 3.2.6: static literal/length table +// +// literal/length bitsize code +// ============== ======= ========================= +// 0 - 143 8 0011'0000 - 
1011'1111
+// 144 - 255 9 1'1001'0000 - 1'1111'1111
+// 256 - 279 7 000'0000 - 001'0111
+// 280 - 287 8 1100'0000 - 1100'0111
+
+constexpr std::size_t fixed_len_table_size = 288;
+
+constexpr auto fixed_len_table = // clang-format off
+  huffman::table{
+    huffman::symbol_bitsize,
+    {{{ 0, 143}, 8},
+     {{144, 255}, 9},
+     {{256, 279}, 7},
+     {{280, 287}, 8}}};
+// clang-format on
+
+// RFC 3.2.6: fixed distance codes 0 - 31, 5 bits each
+constexpr std::size_t fixed_dist_table_size = 32;
+
+constexpr auto fixed_dist_table = huffman::table<
+    std::uint16_t,
+    fixed_dist_table_size>{huffman::symbol_bitsize, {{{0, 31}, 5}}};
+
+// RFC 3.2.5: Compressed blocks (length and distance codes)
+//
+// Entries are {extra_bits, base}. length_infos[i] describes literal/length
+// symbol 257 + i; symbol 285 (length 258, no extra bits) is handled separately
+// by the decoder. distance_infos[i] describes distance code i.
+constexpr auto length_infos = std::array{
+    {{0, 3}, {0, 4}, {0, 5}, {0, 6}, {0, 7}, {0, 8}, {0, 9},
+     {0, 10}, {1, 11}, {1, 13}, {1, 15}, {1, 17}, {2, 19}, {2, 23},
+     {2, 27}, {2, 31}, {3, 35}, {3, 43}, {3, 51}, {3, 59}, {4, 67},
+     {4, 83}, {4, 99}, {4, 115}, {5, 131}, {5, 163}, {5, 195}, {5, 227}}};
+
+constexpr auto distance_infos = std::array{
+    {{0, 1}, {0, 2}, {0, 3}, {0, 4}, {1, 5},
+     {1, 7}, {2, 9}, {2, 13}, {3, 17}, {3, 25},
+     {4, 33}, {4, 49}, {5, 65}, {5, 97}, {6, 129},
+     {6, 193}, {7, 257}, {7, 385}, {8, 513}, {8, 769},
+     {9, 1025}, {9, 1537}, {10, 2049}, {10, 3073}, {11, 4097},
+     {11, 6145}, {12, 8193}, {12, 12289}, {13, 16385}, {13, 24577}}};
+
+/// Decodes one Huffman-compressed block from \p src_bits into \p dst.
+///
+/// Symbols are decoded with \p len_table until the end-of-block symbol is
+/// seen. Literals are stored directly. A length symbol is followed by a
+/// distance code decoded with \p dist_table, and that many bytes are copied
+/// from earlier output.
+///
+/// @param src_bits Compressed input; every decoded bit is consumed from it.
+/// @param dst The whole output buffer, including bytes already written.
+/// @param dst_written Number of bytes of \p dst written so far; advanced as
+///     output is produced.
+/// @param len_table Literal/length code table.
+/// @param dist_table Distance code table.
+///
+/// @returns Success once end-of-block is decoded; otherwise InvalidLitOrLen,
+///     InvalidDistance, SrcTooSmall or DstTooSmall for the first problem hit.
+auto decompress_block_huffman(
+    huffman::bit_span& src_bits,
+    std::span dst,
+    std::ptrdiff_t& dst_written,
+    const huffman::table& len_table,
+    const huffman::table& dist_table)
+    -> DecompressStatus
+{
+  while (true) {
+    const auto lit_or_len_decoded = huffman::decode_one(len_table, src_bits);
+    if (not lit_or_len_decoded.encoded_size) {
+      return DecompressStatus::InvalidLitOrLen;
+    }
+    const std::uint16_t lit_or_len = lit_or_len_decoded.symbol;
+    src_bits.consume(lit_or_len_decoded.encoded_size);
+    if (lit_or_len < detail::lit_or_len_end_of_block) {
+      // A literal needs one free byte. Without this check a full dst would be
+      // written one past its end.
+      if (dst_written >= std::ssize(dst)) {
+        return DecompressStatus::DstTooSmall;
+      }
+      dst[static_cast(dst_written++)] =
+          static_cast(lit_or_len);
+      continue;
+    }
+    if (lit_or_len == detail::lit_or_len_end_of_block) {
+      break;
+    }
+    if (lit_or_len > detail::lit_or_len_max) {
+      return DecompressStatus::InvalidLitOrLen;
+    }
+    std::uint16_t len{};
+    if (lit_or_len == detail::lit_or_len_max) {
+      len = detail::lit_or_len_max_decoded;
+    } else {
+      const auto len_idx =
+          static_cast(lit_or_len - detail::lit_or_len_end_of_block - 1);
+      // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index)
+      const auto& len_info = detail::length_infos[len_idx];
+      // pop_n asserts that enough bits remain, so report truncated input
+      // as an error before calling it.
+      if (src_bits.size() < len_info.extra_bits) {
+        return DecompressStatus::SrcTooSmall;
+      }
+      const auto extra_len = src_bits.pop_n(len_info.extra_bits);
+      len = len_info.base + extra_len;
+    }
+    const auto dist_decoded = huffman::decode_one(dist_table, src_bits);
+    if (not dist_decoded.encoded_size) {
+      return DecompressStatus::InvalidDistance;
+    }
+    const auto dist_code = dist_decoded.symbol;
+    src_bits.consume(dist_decoded.encoded_size);
+    // The distance table may yield codes that have no distance_infos entry
+    // (30 and 31 for the fixed table); those are invalid distances.
+    if (dist_code >= detail::distance_infos.size()) {
+      return DecompressStatus::InvalidDistance;
+    }
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index)
+    const auto& dist_info = detail::distance_infos[dist_code];
+    if (src_bits.size() < dist_info.extra_bits) {
+      return DecompressStatus::SrcTooSmall;
+    }
+    const std::uint16_t distance =
+        dist_info.base + src_bits.pop_n(dist_info.extra_bits);
+    // A back-reference cannot reach before the start of the output.
+    if (distance > dst_written) {
+      return DecompressStatus::InvalidDistance;
+    }
+    if (dst.size() - static_cast(dst_written) < len) {
+      return DecompressStatus::DstTooSmall;
+    }
+    starflate::detail::copy_n(
+        dst.begin() + (dst_written - distance), len, dst.begin() + dst_written);
+    dst_written += len;
+  }
+  return DecompressStatus::Success;
+}
+
+/// Copies \p n bytes from \p src to \p dst, repeating the source bytes when
+/// the two ranges overlap.
+///
+/// @pre \p src is strictly before \p dst; otherwise `dst - src` is not
+///     positive and the loop below never makes progress.
+void copy_n(
+    std::span::iterator src,
+    std::uint16_t n,
+    std::span::iterator dst)
+{
+  std::ptrdiff_t n_signed{n};
+  while (n_signed > 0) {
+    // Never copy more than the gap between src and dst in one pass, so only
+    // bytes that have already been written are read. src stays fixed: the
+    // gap grows each pass and is filled with the repeated pattern.
+    const auto n_to_copy = std::min(n_signed, dst - src);
+    std::copy_n(src, n_to_copy, dst);
+    n_signed -= n_to_copy;
+    dst += n_to_copy;
+  }
+}
+
 } // namespace detail
 
 auto decompress(std::span src, std::span dst)
@@ -39,7 +159,8 @@ auto decompress(std::span src, std::span dst)
   using enum detail::BlockType;
 
   huffman::bit_span src_bits{src};
-  // 
std::size_t dst_written{}; + // will always be >= 0, but signed type to minimize conversions. + std::ptrdiff_t dst_written{}; for (bool was_final = false; not was_final;) { const auto header = detail::read_header(src_bits); if (not header) { @@ -58,22 +179,27 @@ auto decompress(std::span src, std::span dst) return DecompressStatus::SrcTooSmall; } - if (dst.size() < len) { + if (dst.size() - static_cast(dst_written) < len) { return DecompressStatus::DstTooSmall; } - std::copy_n(src_bits.byte_data(), len, dst.begin()); + std::copy_n(src_bits.byte_data(), len, dst.begin() + dst_written); src_bits.consume(CHAR_BIT * len); - dst = dst.subspan(len); - // dst_written += len; + dst_written += len; + } else if (header->type == FixedHuffman) { + const auto block_status = detail::decompress_block_huffman( + src_bits, + dst, + dst_written, + detail::fixed_len_table, + detail::fixed_dist_table); + if (block_status != DecompressStatus::Success) { + return block_status; + } } else { // TODO: implement return DecompressStatus::Error; } - const auto distance = - std::distance(std::ranges::data(src), src_bits.byte_data()); - assert(distance >= 0 and "distance must be positive"); - src = src.subspan(static_cast(distance)); } return DecompressStatus::Success; } diff --git a/src/decompress.hpp b/src/decompress.hpp index 07f88f4..adfcd60 100644 --- a/src/decompress.hpp +++ b/src/decompress.hpp @@ -2,6 +2,7 @@ #include "huffman/huffman.hpp" +#include #include #include #include @@ -18,6 +19,8 @@ enum class DecompressStatus : std::uint8_t NoCompressionLenMismatch, DstTooSmall, SrcTooSmall, + InvalidLitOrLen, + InvalidDistance, }; namespace detail { @@ -37,6 +40,30 @@ struct BlockHeader auto read_header(huffman::bit_span& compressed_bits) -> std::expected; + +struct LengthInfo +{ + std::uint8_t extra_bits; + std::uint16_t base; +}; + +extern const huffman::table fixed_table; +extern const std::array length_infos; +constexpr auto 
lit_or_len_max = std::uint16_t{285}; +constexpr auto lit_or_len_max_decoded = std::uint16_t{258}; + +/// Copies n bytes from src to dst, repeating the source data if necessary. +/// +/// From RFC 1951 section 3.2.3: +/// "Note also that the referenced string may overlap the current +/// position; for example, if the last 2 bytes decoded have values +/// X and Y, a string reference with <length = 5, distance = 2> +/// adds X,Y,X,Y,X to the output stream." +void copy_n( + std::span::iterator src, + std::uint16_t n, + std::span::iterator dst); } // namespace detail /// Decompresses the given source data into the destination buffer. diff --git a/src/test/BUILD.bazel b/src/test/BUILD.bazel index 5579a95..f8b3a27 100644 --- a/src/test/BUILD.bazel +++ b/src/test/BUILD.bazel @@ -6,6 +6,7 @@ cc_test( timeout = "short", srcs = ["decompress_test.cpp"], data = [ + ":starfleet.html", ":starfleet.html.dynamic", ":starfleet.html.fixed", ], diff --git a/src/test/decompress_test.cpp b/src/test/decompress_test.cpp index c80083f..da1f251 100644 --- a/src/test/decompress_test.cpp +++ b/src/test/decompress_test.cpp @@ -28,8 +28,12 @@ auto read_runfile(const char* argv0, const std::string& path) const std::string abs_path{runfiles->Rlocation(path)}; std::ifstream file{abs_path, std::ios::binary}; - ::boost::ut::expect(::boost::ut::fatal(file.is_open())) - << "failed to open " << path; + if (not file.is_open()) { + // ::boost::ut::fatal swallows log messages, so log before. 
+ ::boost::ut::log("failed to open file: " + abs_path); + ::boost::ut::expect(::boost::ut::fatal(false)); + } + std::vector chars( (std::istreambuf_iterator(file)), std::istreambuf_iterator()); @@ -50,7 +54,6 @@ auto main(int, char* argv[]) -> int { using ::boost::ut::eq; using ::boost::ut::expect; - using ::boost::ut::fatal; using ::boost::ut::test; using namespace starflate; @@ -137,6 +140,15 @@ auto main(int, char* argv[]) -> int expect(header.has_value()) << "got error: " << static_cast(header.error()); expect(header->type == detail::BlockType::FixedHuffman); + + const std::vector expected_bytes = + read_runfile(*argv, "starflate/src/test/starfleet.html"); + std::vector dst(expected_bytes.size()); + const auto status = decompress(input_bytes, dst); + expect(status == DecompressStatus::Success) + << "got error code: " << static_cast(status); + expect(std::ranges::equal(dst, expected_bytes)) + << "decompressed does not match expected"; }; test("dynamic huffman") = [argv] { @@ -149,4 +161,12 @@ auto main(int, char* argv[]) -> int << "got error: " << static_cast(header.error()); expect(header->type == detail::BlockType::DynamicHuffman); }; + + test("copy_n") = [] { + auto src_and_dst = huffman::byte_array(1, 2, 0, 0, 0, 0); + const auto src_span = std::span{src_and_dst}; + const auto dst_span = std::span{src_and_dst}.subspan(2); + detail::copy_n(src_span.begin(), 3, dst_span.begin()); + expect(eq(src_and_dst, huffman::byte_array(1, 2, 1, 2, 1, 0))); + }; };