Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support decompressing fixed huffman blocks #124

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 37 additions & 10 deletions huffman/src/decode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,30 +21,57 @@
/// @tparam Symbol The type of the symbols in the code table.
/// @tparam Extent The extent of the code table.
/// @tparam O The type of the output iterator.
template <
symbol Symbol,
std::size_t Extent = std::dynamic_extent,
std::output_iterator<Symbol> O>
template <symbol Symbol, std::size_t Extent, std::output_iterator<Symbol> O>
constexpr auto
decode(const table<Symbol, Extent>& code_table, bit_span bits, O output) -> O
{
while (!bits.empty()) {
auto result = decode_one(code_table, bits);
if (result.encoded_size == 0) {
break;
}
*output = result.symbol;
output++;
bits.consume(result.encoded_size);
garymm marked this conversation as resolved.
Show resolved Hide resolved
}
return output;
}

Check warning on line 38 in huffman/src/decode.hpp

View check run for this annotation

Codecov / codecov/patch

huffman/src/decode.hpp#L27-L38

Added lines #L27 - L38 were not covered by tests

template <symbol Symbol>
struct decode_result
{
Symbol symbol;
std::uint8_t encoded_size;
};

/// Decodes a single symbol from \p bits using \p code_table.
///
/// @param code_table The code table to use for decoding.
/// @param bits The bit stream to decode.
///
/// @returns The decoded symbol and how many bits its code was.
/// If no symbol was found, result.encoded_size == 0.
/// @tparam Symbol The type of the symbols in the code table.
/// @tparam Extent The extent of the code table.
template <symbol Symbol, std::size_t Extent>
constexpr auto
decode_one(const table<Symbol, Extent>& code_table, bit_span bits)
-> decode_result<Symbol>
{
code current_code{};
auto code_table_pos = code_table.begin();
for (auto bit : bits) {
current_code << bit;
auto found = code_table.find(current_code, code_table_pos);
if (found) {
*output = (*found)->symbol;
output++;
code_table_pos = code_table.begin();
current_code = code{};
continue;
return {(*found)->symbol, (*found)->bitsize()};
}
if (found.error() == code_table.end()) {
break;
}
code_table_pos = found.error();
}
return output;
return {Symbol{}, 0};

Check warning on line 74 in huffman/src/decode.hpp

View check run for this annotation

Codecov / codecov/patch

huffman/src/decode.hpp#L74

Added line #L74 was not covered by tests
oliverlee marked this conversation as resolved.
Show resolved Hide resolved
}

} // namespace starflate::huffman
4 changes: 3 additions & 1 deletion huffman/test/bit_span_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#include <array>
#include <climits>
#include <cstdint>
#include <numeric>
#include <ranges>
#include <vector>

Expand Down Expand Up @@ -116,6 +115,9 @@ auto main() -> int
if (std::cmp_less(n, initial_bits.size())) {
expect(nth_bit(n) == bits[0]);
}
if (n == 0) {
expect(initial_bits.byte_data() == bits.byte_data());
}
} else {
expect(aborts([&] { bits.consume(n); }));
}
Expand Down
192 changes: 183 additions & 9 deletions src/decompress.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "decompress.hpp"

#include <cstdint>
#include <iostream>
#include <iterator>
#include <utility>

Expand Down Expand Up @@ -31,6 +32,173 @@
return BlockHeader{final, type};
}

// RFC 3.2.6: static literal/length table
//
// literal/length bitsize code
// ============== ======= =========================
// 0 - 143 8 0011'0000 - 1011'1111
// 144 - 255 9 1'1001'0000 - 1'1111'1111
// 256 - 279 7 000'0000 - 001'0111
// 280 - 287 8 1100'0000 - 1100'0111

constexpr std::size_t fixed_len_table_size = 288;

constexpr auto fixed_len_table = // clang-format off
huffman::table<std::uint16_t, fixed_len_table_size>{
huffman::symbol_bitsize,
{{{ 0, 143}, 8},
{{144, 255}, 9},
{{256, 279}, 7},
{{280, 287}, 8}}};
// clang-format on

constexpr std::size_t fixed_dist_table_size = 32;

constexpr auto fixed_dist_table = huffman::table<
std::uint16_t,
fixed_dist_table_size>{huffman::symbol_bitsize, {{{0, 31}, 5}}};

// RFC 3.2.5: Compressed blocks (length and distance codes)
constexpr auto length_infos = std::array<LengthInfo, 28>{
{{0, 3}, {0, 4}, {0, 5}, {0, 6}, {0, 7}, {0, 8}, {0, 9},
{0, 10}, {1, 11}, {1, 13}, {1, 15}, {1, 17}, {2, 19}, {2, 23},
{2, 27}, {2, 31}, {3, 35}, {3, 43}, {3, 51}, {3, 59}, {4, 67},
{4, 83}, {4, 99}, {4, 115}, {5, 131}, {5, 163}, {5, 195}, {5, 227}}};

constexpr auto distance_infos = std::array<LengthInfo, 30>{
{{0, 1}, {0, 2}, {0, 3}, {0, 4}, {1, 5},
{1, 7}, {2, 9}, {2, 13}, {3, 17}, {3, 25},
{4, 33}, {4, 49}, {5, 65}, {5, 97}, {6, 129},
{6, 193}, {7, 257}, {7, 385}, {8, 513}, {8, 769},
{9, 1025}, {9, 1537}, {10, 2049}, {10, 3073}, {11, 4097},
{11, 6145}, {12, 8193}, {12, 12289}, {13, 16385}, {13, 24577}}};

/// Removes n bits from the beginning of bits and returns them.
///
/// @pre bits contains at least n bits.
/// @pre n <= 16
///
/// @returns the n bits removed from the beginning of this.
/// The bits are in the lower (rightmost) part of the return value.
///
auto pop_extra_bits(huffman::bit_span& bits, std::uint8_t n) -> std::uint16_t
{
assert(n <= 16);
auto iter = bits.begin();
std::uint16_t res{};
for (std::uint8_t i{}; i < n; i++) {
res |= static_cast<std::uint16_t>(
static_cast<std::uint16_t>(static_cast<bool>(*iter)) << i);
iter += 1;
}
bits.consume(n); // invalidates iter, so must come after the loop
return res;
}

enum class ParseLitOrLenStatus : std::uint8_t
{
EndOfBlock,
Error,
};

auto parse_lit_or_len(
std::uint16_t lit_or_len, huffman::bit_span& src_bits) -> std::
expected<std::variant<std::byte, std::uint16_t>, ParseLitOrLenStatus>
{
if (lit_or_len < detail::lit_or_len_end_of_block) {
return static_cast<std::byte>(lit_or_len);
}
if (lit_or_len == detail::lit_or_len_end_of_block) {
return std::unexpected{ParseLitOrLenStatus::EndOfBlock};
}
if (lit_or_len > detail::lit_or_len_max) {
return std::unexpected{ParseLitOrLenStatus::Error};
}

Check warning on line 116 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L115-L116

Added lines #L115 - L116 were not covered by tests
std::uint16_t len{};
if (lit_or_len == detail::lit_or_len_max) {
len = detail::lit_or_len_max_decoded;
} else {
Comment on lines +127 to +130
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like you can just return instead of construction + assignment of len?

if (lit_or_len == detail::lit_or_len_max) {
    return detail::lit_or_len_max_decoded;
}

const auto len_idx =
static_cast<size_t>(lit_or_len - detail::lit_or_len_end_of_block - 1);
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index)
const auto& len_info = detail::length_infos[len_idx];
const auto extra_len = pop_extra_bits(src_bits, len_info.extra_bits);
len = len_info.base + extra_len;
}
return len;
}

auto decompress_block_huffman(
huffman::bit_span& src_bits,
std::span<std::byte> dst,
std::ptrdiff_t& dst_written,
Comment on lines +143 to +144
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
std::span<std::byte> dst,
std::ptrdiff_t& dst_written,
std::span<std::byte>& dst,

const huffman::table<std::uint16_t, fixed_len_table_size>& len_table,
const huffman::table<std::uint16_t, fixed_dist_table_size>& dist_table)
-> DecompressStatus
{
while (true) {
oliverlee marked this conversation as resolved.
Show resolved Hide resolved
const auto lit_or_len_decoded = huffman::decode_one(len_table, src_bits);
if (not lit_or_len_decoded.encoded_size) {
return DecompressStatus::InvalidLitOrLen;
}

Check warning on line 143 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L142-L143

Added lines #L142 - L143 were not covered by tests
Comment on lines +150 to +153
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if src_bits doesn't have enough bits to decode a symbol - that would be "success" and this function should exit?

If decode_one is changed to return expected<>, maybe something like this?

const auto lit_or_len = huffman::decode_one(len_table, src_bits);
if (not lit_or_len) {
  return status_for(lit_or_len.error());
}

If src_bits doesn't have enough bits, you do have have a partially constructed code and iterator in the code table. You could potentially keep those in order to skip some computation when more bits arrive - although it's certainly simpler to just restart.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you thinking of a chunked / streaming input?
If not, I don't understand why not having enough bits would be success?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I'm thinking of the chunked/streaming case.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably worth adding a TODO to handle later.

src_bits.consume(lit_or_len_decoded.encoded_size);
const auto maybe_lit_or_len =
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks better to me.

I think this could still benefit from breaking out smaller functions - although if you think that's too much work, feel free to ignore.

  const auto done = 
    parse_lit_or_len(lit_or_len_decoded.symbol, src_bits)
      .and_then(decode_lit_or_len_into(dst))
      .or_else(convert_error_status)
      
   if (done) {
     return *done;
   }

   // done is empty, so the the loop continues

where convert_error_status: ParseLitOrLenStatus -> optional<DecompressStatus>

return {in_place, e == EndOfBlock ? Success : InvalidLitOrLen};

and decode_lit_or_len_into uses the poor version of pattern matching available in C++:

auto decode_lit_or_len_into(std::span<std::byte>& dst)
{
  return [&dst](variant<std::byte, std::uint16_t> lit_or_len) {
    return visit(
      overloaded({
        [&dst](byte lit) { return decode_into(lit, dst); },
        [&dst](uint16_t len) { return decode_into(lit, dst); }
      }),
      lit_or_len
    );
  };
}

// https://en.cppreference.com/w/cpp/utility/variant/visit
template <class... Ts>
struct overloaded : Ts... { using Ts::operator()...; };

and

auto decode_into(byte lit, span& dst)
  -> optional<DecompressStatus>
{
  // lines 133-137
}
auto decode_into(uint16_t len, span& dst)
  -> optional<DecompressStatus>
{
 // lines 139-161
}

although that's a bit of a simplification - you'll also need to capture dst_written.

The different cases are broken out into different functions, although there's some logic inversion with variant::visit.

parse_lit_or_len(lit_or_len_decoded.symbol, src_bits);
if (not maybe_lit_or_len) {
if (maybe_lit_or_len.error() == ParseLitOrLenStatus::EndOfBlock) {
return DecompressStatus::Success;
}
return DecompressStatus::InvalidLitOrLen;

Check warning on line 151 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L151

Added line #L151 was not covered by tests
}
const auto lit_or_len = maybe_lit_or_len.value();
if (std::holds_alternative<std::byte>(lit_or_len)) {
if (dst.size() - static_cast<std::size_t>(dst_written) < 1) {
return DecompressStatus::DstTooSmall;
}

Check warning on line 157 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L156-L157

Added lines #L156 - L157 were not covered by tests
dst[static_cast<size_t>(dst_written++)] = std::get<std::byte>(lit_or_len);
continue;
}
// It's not a literal, so handle length and distance
const auto len = std::get<std::uint16_t>(lit_or_len);
const auto dist_decoded = huffman::decode_one(dist_table, src_bits);
const auto dist_code = dist_decoded.symbol;
if (not dist_decoded.encoded_size) {
return DecompressStatus::InvalidDistance;
}

Check warning on line 167 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L166-L167

Added lines #L166 - L167 were not covered by tests
src_bits.consume(dist_decoded.encoded_size);
if (dist_code >= detail::distance_infos.size()) {
return DecompressStatus::InvalidLitOrLen;
}

Check warning on line 171 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L170-L171

Added lines #L170 - L171 were not covered by tests
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index)
const auto& dist_info = detail::distance_infos[dist_code];
const std::uint16_t distance =
dist_info.base + pop_extra_bits(src_bits, dist_info.extra_bits);
if (distance > dst_written) {
return DecompressStatus::InvalidDistance;
}

Check warning on line 178 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L177-L178

Added lines #L177 - L178 were not covered by tests
if (dst.size() - static_cast<std::size_t>(dst_written) < len) {
oliverlee marked this conversation as resolved.
Show resolved Hide resolved
return DecompressStatus::DstTooSmall;
}

Check warning on line 181 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L180-L181

Added lines #L180 - L181 were not covered by tests
starflate::detail::copy_from_before(
distance, dst.begin() + dst_written, len);
dst_written += len;
}
return DecompressStatus::Success;

Check warning on line 186 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L186

Added line #L186 was not covered by tests
}

/// Copy n bytes from distance bytes before dst to dst.
void copy_from_before(
std::uint16_t distance, std::span<std::byte>::iterator dst, std::uint16_t n)
{
std::ptrdiff_t n_signed{n};
const auto src = dst - distance;
while (n_signed > 0) {
const auto n_to_copy = std::min(n_signed, dst - src);
oliverlee marked this conversation as resolved.
Show resolved Hide resolved
dst = std::copy_n(src, n_to_copy, dst);
n_signed -= n_to_copy;
}
}

} // namespace detail

auto decompress(std::span<const std::byte> src, std::span<std::byte> dst)
Expand All @@ -39,7 +207,8 @@
using enum detail::BlockType;

huffman::bit_span src_bits{src};
// std::size_t dst_written{};
// will always be > 0, but signed type to minimize conversions.
std::ptrdiff_t dst_written{};
for (bool was_final = false; not was_final;) {
const auto header = detail::read_header(src_bits);
if (not header) {
Expand All @@ -60,22 +229,27 @@
return DecompressStatus::SrcTooSmall;
}

if (dst.size() < len) {
if (dst.size() - static_cast<std::size_t>(dst_written) < len) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (dst.size() - static_cast<std::size_t>(dst_written) < len) {
if (dst.size() < len) {

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC this comment is irrelevant unless we drpo the prefix of dst every time as suggested below.

return DecompressStatus::DstTooSmall;
}

std::copy_n(src_bits.byte_data(), len, dst.begin());
std::copy_n(src_bits.byte_data(), len, dst.begin() + dst_written);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
std::copy_n(src_bits.byte_data(), len, dst.begin() + dst_written);
std::copy_n(src_bits.byte_data(), len, dst.begin());
dst = dst | std::views::drop(len);

views::drop has a specialization for span which can be used to trim off the front
https://en.cppreference.com/w/cpp/ranges/drop_view

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we advance dst, my guess would be it's UB to do dst.begin() - distance. I need to go backwards. If you have a better suggestion LMK.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will be UB if the pointer goes off the front of the original buffer.

buf = array<byte, 8>{};
buf: [-|-|-|-|-|-|-|-]

dst = span{buf}
dst: [-|-|-|-|-|-|-|-]

dst = dst | drop(4)
dst:         [-|-|-|-]

it1 = dst.begin()
buf: [-|-|-|-|-|-|-|-]
it1:          ^ 

it2 = it1 - 2
buf: [-|-|-|-|-|-|-|-]
it2:      ^

Maybe it would make sense to create a destination_buffer type that handles this logic and catches potential UB.

As for how to define the destination buffer, that could depend on how you want to handle growth.

Would it act like a std::vector where it allocates a new buffer and copies elements over? In this case, maybe destination_buffer stores a pointer and 2 integers.

Or would it maintain a list of chunks so growth doesn't require copying elements from the old buffer to a new buffer? I'm not sure which approach is better, but it would probably help to abstract this part so the rest of the decompression implementation is independent.

src_bits.consume(CHAR_BIT * len);
dst = dst.subspan(len);
// dst_written += len;
dst_written += len;
} else if (header->type == FixedHuffman) {
const auto block_status = detail::decompress_block_huffman(
src_bits,
dst,
dst_written,
detail::fixed_len_table,
detail::fixed_dist_table);
if (block_status != DecompressStatus::Success) {
return block_status;
}

Check warning on line 248 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L247-L248

Added lines #L247 - L248 were not covered by tests
} else {
// TODO: implement
return DecompressStatus::Error;
}
const auto distance =
std::distance(std::ranges::data(src), src_bits.byte_data());
assert(distance >= 0 and "distance must be positive");
src = src.subspan(static_cast<size_t>(distance));
}
return DecompressStatus::Success;
}
Expand Down
28 changes: 28 additions & 0 deletions src/decompress.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "huffman/huffman.hpp"

#include <array>
#include <cstddef>
#include <expected>
#include <ranges>
Expand All @@ -18,6 +19,8 @@ enum class DecompressStatus : std::uint8_t
NoCompressionLenMismatch,
DstTooSmall,
SrcTooSmall,
InvalidLitOrLen,
InvalidDistance,
};

namespace detail {
Expand All @@ -37,6 +40,31 @@ struct BlockHeader

auto read_header(huffman::bit_span& compressed_bits)
-> std::expected<BlockHeader, DecompressStatus>;

struct LengthInfo
{
std::uint8_t extra_bits;
std::uint16_t base;
};

extern const huffman::table<std::uint16_t, 288> fixed_table;
extern const std::array<LengthInfo, 28> length_infos;
constexpr auto lit_or_len_end_of_block = std::uint16_t{256};
constexpr auto lit_or_len_max = std::uint16_t{285};
constexpr auto lit_or_len_max_decoded = std::uint16_t{258};

/// Copies n bytes from (dst - distance) to dst, handling overlap by repeating.
///
/// From the standard section 3.2.3:
/// the referenced string may overlap the current position; for example, if the
/// last 2 bytes decoded have values X and Y, a string reference with
/// <length = 5, distance = 2> adds X,Y,X,Y,X to the output stream.
///
/// @pre dst - distance is valid.
void copy_from_before(
std::uint16_t distance,
std::span<std::byte>::iterator dst,
std::uint16_t n);
} // namespace detail

/// Decompresses the given source data into the destination buffer.
Expand Down
1 change: 1 addition & 0 deletions src/test/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ cc_test(
timeout = "short",
srcs = ["decompress_test.cpp"],
data = [
":starfleet.html",
":starfleet.html.dynamic",
":starfleet.html.fixed",
],
Expand Down
Loading
Loading