Skip to content

Commit

Permalink
WIP fixed huffman block type
Browse files Browse the repository at this point in the history
Change-Id: I5a30394b46e113595336e89c954e03d3acb120fe
  • Loading branch information
garymm committed Feb 12, 2024
1 parent 51a2ddb commit ea4ce7c
Show file tree
Hide file tree
Showing 6 changed files with 224 additions and 19 deletions.
18 changes: 18 additions & 0 deletions huffman/src/bit_span.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,24 @@ class bit_span : public std::ranges::view_interface<bit_span>

constexpr auto pop_16() -> std::uint16_t { return pop<std::uint16_t>(); }

/// Extracts the first n bits of this view as an integer and drops them from
/// the view.
///
/// The bits are assembled in iteration order: the first bit of the view
/// becomes the most significant of the n bits in the result.
///
/// @param n Number of bits to remove; must not exceed CHAR_BIT.
///
/// @pre this contains at least n bits.
///
constexpr auto pop_n(std::uint8_t n) -> std::uint8_t
{
  assert(n <= CHAR_BIT);
  // Remember where the bits start before the view is advanced past them.
  auto cursor = begin();
  consume(n);
  std::uint8_t value{};
  while (n != 0) {
    const auto bit_value =
        static_cast<std::uint8_t>(static_cast<bool>(*cursor));
    // Make room for the next bit at the least significant position.
    value = static_cast<std::uint8_t>(
        static_cast<std::uint8_t>(value << 1) | bit_value);
    cursor += 1;
    --n;
  }
  return value;
}

/// Consumes the given number of bits. Advances the start of the view.
///
/// @pre n <= std::ranges::size(*this)
Expand Down
44 changes: 38 additions & 6 deletions huffman/src/decode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,23 +28,55 @@ template <
constexpr auto
decode(const table<Symbol, Extent>& code_table, bit_span bits, O output) -> O
{
while (!bits.empty()) {
auto result = decode_one(code_table, bits);
if (result.encoded_size == 0) {
break;
}
*output = result.symbol;
output++;
bits.consume(result.encoded_size);
}
return output;
}

/// The outcome of decoding one symbol with decode_one.
///
/// @tparam Symbol The type of the symbols in the code table.
template <symbol Symbol>
struct decode_result
{
/// The decoded symbol. Value-initialized when nothing could be decoded.
Symbol symbol;
/// Number of bits the symbol's code occupied. Zero means no symbol could be
/// decoded; callers consume this many bits after a successful decode.
std::uint8_t encoded_size;
};

/// Decodes a single symbol from the front of \p bits using \p code_table.
///
/// Bits are appended to a candidate code one at a time until the candidate
/// matches an entry of \p code_table or the table reports that no entry can
/// match. \p bits is taken by value, so the caller's view is not advanced;
/// the caller is expected to consume `encoded_size` bits itself.
///
/// @param code_table The code table to use for decoding.
/// @param bits The bit stream to decode.
///
/// @returns The decoded symbol together with the number of bits its code
/// occupied. If no symbol could be decoded (including when \p bits is empty),
/// `encoded_size` is 0 and `symbol` is value-initialized.
///
/// @tparam Symbol The type of the symbols in the code table.
/// @tparam Extent The extent of the code table.
template <symbol Symbol, std::size_t Extent = std::dynamic_extent>
constexpr auto
decode_one(const table<Symbol, Extent>& code_table, bit_span bits)
    -> decode_result<Symbol>
{
  std::uint8_t bits_read{};
  code current_code{};
  auto code_table_pos = code_table.begin();
  for (auto bit : bits) {
    current_code << bit;
    bits_read++;
    auto found = code_table.find(current_code, code_table_pos);
    if (found) {
      return {(*found)->symbol, bits_read};
    }
    // On a miss, find() reports a table position in error(); reaching end()
    // means there is nothing left to search, so give up.
    if (found.error() == code_table.end()) {
      break;
    }
    // Otherwise resume the search for the longer code from that position.
    code_table_pos = found.error();
  }
  return {Symbol{}, 0};
}

} // namespace starflate::huffman
143 changes: 133 additions & 10 deletions src/decompress.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "decompress.hpp"

#include <cstdint>
#include <iostream>
#include <iterator>
#include <utility>

Expand Down Expand Up @@ -31,6 +32,124 @@ auto read_header(huffman::bit_span& compressed_bits)
return BlockHeader{final, type};
}

// RFC 3.2.6: static literal/length table
//
// literal/length bitsize code
// ============== ======= =========================
// 0 - 143 8 0011'0000 - 1011'1111
// 144 - 255 9 1'1001'0000 - 1'1111'1111
// 256 - 279 7 000'0000 - 001'0111
// 280 - 287 8 1100'0000 - 1100'0111

// Number of literal/length symbols in the fixed code (symbols 0 - 287).
constexpr std::size_t fixed_table_size = 288;

// Code table for fixed Huffman blocks. Each entry below is an inclusive
// symbol range followed by the bitsize shared by every symbol in that range.
// NOTE(review): huffman::symbol_bitsize presumably selects the constructor
// that derives the codes from these bitsizes -- confirm against huffman::table.
constexpr auto fixed_table = // clang-format off
huffman::table<std::uint16_t, fixed_table_size>{
huffman::symbol_bitsize,
{{{ 0, 143}, 8},
{{144, 255}, 9},
{{256, 279}, 7},
{{280, 287}, 8}}};
// clang-format on

// RFC 3.2.5: Compressed blocks (length and distance codes)
//
// Each entry is {extra_bits, base}. Per the RFC's length table, entry i
// describes length symbol 257 + i (symbols 257 - 284); the match length is
// base plus the value of the extra bits. Symbol 285 (length 258, no extra
// bits) is not part of this table.
constexpr auto length_infos = std::array<LengthInfo, 28>{
{{0, 3}, {0, 4}, {0, 5}, {0, 6}, {0, 7}, {0, 8}, {0, 9},
{0, 10}, {1, 11}, {1, 13}, {1, 15}, {1, 17}, {2, 19}, {2, 23},
{2, 27}, {2, 31}, {3, 35}, {3, 43}, {3, 51}, {3, 59}, {4, 67},
{4, 83}, {4, 99}, {4, 115}, {5, 131}, {5, 163}, {5, 195}, {5, 227}}};

// Distance table from RFC 1951 3.2.5. Each entry is {extra_bits, base} and is
// indexed by distance code (0 - 29); the match distance is base plus the value
// of the extra bits. Note that up to 13 extra bits are needed.
constexpr auto distance_infos = std::array<LengthInfo, 30>{
{{0, 1}, {0, 2}, {0, 3}, {0, 4}, {1, 5},
{1, 7}, {2, 9}, {2, 13}, {3, 17}, {3, 25},
{4, 33}, {4, 49}, {5, 65}, {5, 97}, {6, 129},
{6, 193}, {7, 257}, {7, 385}, {8, 513}, {8, 769},
{9, 1025}, {9, 1537}, {10, 2049}, {10, 3073}, {11, 4097},
{11, 6145}, {12, 8193}, {12, 12289}, {13, 16385}, {13, 24577}}};

/// Computes the tail of \p bytes that begins at the byte \p bits' underlying
/// data pointer currently refers to.
///
/// @param bytes The full byte range that \p bits was created from.
/// @param bits A bit view into \p bytes.
///
/// @returns The bytes of \p bytes from `bits.byte_data()` to the end.
auto subspan_starting_at_bits(
    const std::span<const std::byte> bytes, const huffman::bit_span& bits)
    -> std::span<const std::byte>
{
  const std::byte* const first_byte = bytes.data();
  // Offset, in whole bytes, of the bit view's current position.
  const std::ptrdiff_t offset = bits.byte_data() - first_byte;
  assert(offset >= 0 and "distance must be positive");
  const auto skipped = static_cast<std::size_t>(offset);
  return bytes.subspan(skipped);
}

/// Decompresses the body of a single Huffman-coded block.
///
/// Repeatedly decodes a literal/length symbol with \p table. Literals are
/// appended to \p dst. A length symbol is followed by a 5-bit distance code;
/// together (with their extra bits) they describe a copy of previously written
/// output, as specified by RFC 1951 3.2.5. Decoding stops at the end-of-block
/// symbol.
///
/// @param src The compressed bytes that \p src_bits views.
/// @param src_bits The bits to decode, positioned at the first symbol of the
///     block.
/// @param dst The output buffer.
/// @param dst_written Number of bytes of \p dst that have already been
///     written; new output is appended after them.
/// @param table The literal/length code table.
///
/// @returns A DecompressResult holding the remaining source bytes, the total
/// number of bytes written to \p dst and, if \p dst was too small, the number
/// of additional bytes needed for the element that did not fit (0 when the
/// block was fully decoded). Returns DecompressError::InvalidLitOrLen or
/// DecompressError::InvalidDistance if the input is malformed or truncated.
auto decompress_huffman(
    std::span<const std::byte> src,
    huffman::bit_span src_bits,
    std::span<std::byte> dst,
    std::ptrdiff_t dst_written,
    const huffman::table<std::uint16_t, fixed_table_size>& table)
    -> std::expected<DecompressResult, DecompressError>
{
  // Distance codes of the fixed code are plain 5-bit codes (RFC 1951 3.2.6).
  constexpr std::uint8_t dist_code_bits = 5;

  // True if at least n more bits are available in src_bits.
  const auto has_bits = [&src_bits](std::uint8_t n) -> bool {
    return std::cmp_greater_equal(src_bits.size(), std::size_t{n});
  };

  // Removes n extra bits from src_bits and returns their value.
  //
  // Unlike Huffman codes, extra bits are packed starting with the least
  // significant bit (RFC 1951 3.1.1), and distances need up to 13 of them, so
  // bit_span::pop_n (most significant bit first, at most CHAR_BIT bits) is not
  // suitable here.
  const auto pop_extra_bits = [&src_bits](std::uint8_t n) -> std::uint16_t {
    std::uint16_t value{};
    auto iter = src_bits.begin();
    for (std::uint8_t i{}; i != n; ++i) {
      if (static_cast<bool>(*iter)) {
        value = static_cast<std::uint16_t>(value | (1U << i));
      }
      iter += 1;
    }
    src_bits.consume(n);
    return value;
  };

  while (true) {
    const auto decoded = huffman::decode_one(table, src_bits);
    if (not decoded.encoded_size) {
      return std::unexpected{DecompressError::InvalidLitOrLen};
    }
    const std::uint16_t lit_or_len = decoded.symbol;
    src_bits.consume(decoded.encoded_size);

    if (lit_or_len < detail::lit_or_len_end_of_block) {
      // Literal byte.
      if (static_cast<std::size_t>(dst_written) >= dst.size()) {
        // TODO: resuming is not supported yet; the literal has already been
        // consumed and the remaining source is only byte-granular.
        return DecompressResult{
            subspan_starting_at_bits(src, src_bits),
            static_cast<std::size_t>(dst_written),
            1};
      }
      dst[static_cast<std::size_t>(dst_written++)] =
          static_cast<std::byte>(lit_or_len);
      continue;
    }
    if (lit_or_len == detail::lit_or_len_end_of_block) {
      break;
    }
    if (lit_or_len > detail::lit_or_len_max) {
      return std::unexpected{DecompressError::InvalidLitOrLen};
    }

    // Length symbol (257 - 285).
    std::uint16_t len{};
    if (lit_or_len == detail::lit_or_len_max) {
      len = detail::lit_or_len_max_decoded;
    } else {
      // length_infos[0] describes symbol 257, the first symbol after
      // end-of-block.
      const auto len_idx = static_cast<std::size_t>(
          lit_or_len - detail::lit_or_len_end_of_block - 1);
      // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index)
      const auto& len_info = detail::length_infos[len_idx];
      if (not has_bits(len_info.extra_bits)) {
        return std::unexpected{DecompressError::InvalidLitOrLen};
      }
      len = static_cast<std::uint16_t>(
          len_info.base + pop_extra_bits(len_info.extra_bits));
    }

    // Distance code, read most significant bit first like any Huffman code.
    if (not has_bits(dist_code_bits)) {
      return std::unexpected{DecompressError::InvalidDistance};
    }
    const auto dist_code = src_bits.pop_n(dist_code_bits);
    if (dist_code >= detail::distance_infos.size()) {
      return std::unexpected{DecompressError::InvalidDistance};
    }
    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index)
    const auto& dist_info = detail::distance_infos[dist_code];
    if (not has_bits(dist_info.extra_bits)) {
      return std::unexpected{DecompressError::InvalidDistance};
    }
    const auto distance = static_cast<std::uint16_t>(
        dist_info.base + pop_extra_bits(dist_info.extra_bits));
    if (distance > dst_written) {
      // The match would start before the beginning of the output.
      return std::unexpected{DecompressError::InvalidDistance};
    }
    if (dst.size() - static_cast<std::size_t>(dst_written) < len) {
      return DecompressResult{
          // TODO: this seems wrong because the next block does not necessarily
          // start at a byte boundary.
          // Look at ZLib to see how it handles this.
          subspan_starting_at_bits(src, src_bits),
          static_cast<std::size_t>(dst_written),
          len};
    }

    // The source and destination overlap whenever distance < len (the match
    // then repeats bytes produced by this very copy), so copy strictly one
    // byte at a time from front to back instead of using std::copy_n.
    auto write_idx = static_cast<std::size_t>(dst_written);
    auto read_idx = write_idx - distance;
    for (std::uint16_t i{}; i != len; ++i) {
      dst[write_idx++] = dst[read_idx++];
    }
    dst_written += len;
  }
  return DecompressResult{
      // TODO: this seems wrong because the next block does not necessarily
      // start at a byte boundary
      subspan_starting_at_bits(src, src_bits),
      static_cast<std::size_t>(dst_written),
      0};
}

} // namespace detail

auto decompress(std::span<const std::byte> src, std::span<std::byte> dst)
Expand All @@ -39,7 +158,8 @@ auto decompress(std::span<const std::byte> src, std::span<std::byte> dst)
using enum detail::BlockType;

huffman::bit_span src_bits{src};
std::size_t dst_written{};
// never negative, but uses a signed type to minimize conversions.
std::ptrdiff_t dst_written{};
for (bool was_final = false; not was_final;) {
const auto header = detail::read_header(src_bits);
if (not header) {
Expand All @@ -60,24 +180,27 @@ auto decompress(std::span<const std::byte> src, std::span<std::byte> dst)
src_bits.size(), std::size_t{len} * CHAR_BIT) and
"not enough bits in src");

if (std::ranges::size(dst) < len) {
return DecompressResult{src, dst_written, len};
if (dst.size() - static_cast<std::size_t>(dst_written) < len) {
return DecompressResult{
src, static_cast<std::size_t>(dst_written), len};
}

std::copy_n(src_bits.byte_data(), len, dst.begin());
std::copy_n(src_bits.byte_data(), len, dst.begin() + dst_written);
src_bits.consume(CHAR_BIT * len);
dst = dst.subspan(len);
dst_written += len;

} else if (header->type == FixedHuffman) {
// TODO: add proper args, handle the return value.
detail::decompress_huffman(detail::fixed_table);
} else {
// TODO: implement
return std::unexpected{DecompressError::Error};
}
const auto distance =
std::distance(std::ranges::data(src), src_bits.byte_data());
assert(distance >= 0 and "distance must be positive");
src = src.subspan(static_cast<size_t>(distance));
// TODO: this seems wrong because the next block does not necessarily start
// at a byte boundary
src = detail::subspan_starting_at_bits(src, src_bits);
}
return DecompressResult{src, dst_written, 0};
return DecompressResult{src, static_cast<std::size_t>(dst_written), 0};
}

} // namespace starflate
16 changes: 16 additions & 0 deletions src/decompress.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "huffman/huffman.hpp"

#include <array>
#include <cstddef>
#include <expected>
#include <ranges>
Expand All @@ -15,6 +16,8 @@ enum class DecompressError : std::uint8_t
Error,
InvalidBlockHeader,
NoCompressionLenMismatch,
InvalidLitOrLen,
InvalidDistance,
};

namespace detail {
Expand All @@ -34,6 +37,19 @@ struct BlockHeader

auto read_header(huffman::bit_span& compressed_bits)
-> std::expected<BlockHeader, DecompressError>;

/// One row of a length or distance table: how a code expands to a value.
///
/// The decoded value is `base` plus the value of `extra_bits` additional bits
/// read from the input.
struct LengthInfo
{
/// Number of extra bits that follow the code in the input.
std::uint8_t extra_bits;
/// Value represented by the code when all extra bits are zero.
std::uint16_t base;
};

/// Literal/length code table used for fixed Huffman blocks (defined in
/// decompress.cpp).
extern const huffman::table<std::uint16_t, 288> fixed_table;
/// Extra-bit counts and base values for the length symbols (defined in
/// decompress.cpp).
extern const std::array<LengthInfo, 28> length_infos;
/// Literal/length symbol that marks the end of a block; smaller symbols are
/// literal bytes.
constexpr auto lit_or_len_end_of_block = std::uint16_t{256};
/// Largest valid literal/length symbol; larger decoded values are rejected.
constexpr auto lit_or_len_max = std::uint16_t{285};
/// Match length represented by symbol lit_or_len_max.
constexpr auto lit_or_len_max_decoded = std::uint16_t{258};

} // namespace detail

/// The result of decompress.
Expand Down
1 change: 1 addition & 0 deletions src/test/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ cc_test(
timeout = "short",
srcs = ["decompress_test.cpp"],
data = [
":starfleet.html",
":starfleet.html.dynamic",
":starfleet.html.fixed",
],
Expand Down
21 changes: 18 additions & 3 deletions src/test/decompress_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,12 @@ auto read_runfile(const char* argv0, const std::string& path)
const std::string abs_path{runfiles->Rlocation(path)};

std::ifstream file{abs_path, std::ios::binary};
::boost::ut::expect(::boost::ut::fatal(file.is_open()))
<< "failed to open " << path;
if (not file.is_open()) {
// ::boost::ut::fatal swallows log messages, so log before.
::boost::ut::log("failed to open file: " + abs_path);
::boost::ut::expect(::boost::ut::fatal(false));
}

std::vector<char> chars(
(std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());

Expand All @@ -50,7 +54,6 @@ auto main(int, char* argv[]) -> int
{
using ::boost::ut::eq;
using ::boost::ut::expect;
using ::boost::ut::fatal;
using ::boost::ut::test;
using namespace starflate;

Expand Down Expand Up @@ -144,6 +147,18 @@ auto main(int, char* argv[]) -> int
expect(header.has_value())
<< "got error: " << static_cast<int>(header.error());
expect(header->type == detail::BlockType::FixedHuffman);

const std::vector<std::byte> expected_bytes =
read_runfile(*argv, "starflate/src/test/starfleet.html");
std::vector<std::byte> dst(expected_bytes.size());
// TODO: this fails with an assertion.
const auto result = decompress(input_bytes, dst);
expect(result.has_value())
<< "got error code: " << static_cast<std::int32_t>(result.error());
expect(result->remaining_src.empty());
expect(result->min_next_dst_size == 0);
expect(result->dst_written == expected_bytes.size());
expect(std::ranges::equal(dst, expected_bytes));
};

test("dynamic huffman") = [argv] {
Expand Down

0 comments on commit ea4ce7c

Please sign in to comment.