Skip to content

Commit

Permalink
Rethink and simplify the encoder context class (#849)
Browse files Browse the repository at this point in the history
Signed-off-by: Juan Cruz Viotti <[email protected]>
  • Loading branch information
jviotti authored Oct 9, 2024
1 parent 2d09248 commit 96fc75e
Show file tree
Hide file tree
Showing 10 changed files with 315 additions and 322 deletions.
3 changes: 2 additions & 1 deletion src/runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@ noa_library(NAMESPACE sourcemeta PROJECT jsonbinpack NAME runtime
encoder.h
input_stream.h
output_stream.h
encoder_context.h
encoder_cache.h
encoding.h
SOURCES
input_stream.cc
output_stream.cc
varint.h
unreachable.h
cache.cc

loader.cc
loader_v1_any.h
Expand Down
66 changes: 66 additions & 0 deletions src/runtime/cache.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#include <sourcemeta/jsonbinpack/runtime_encoder_cache.h>

namespace sourcemeta::jsonbinpack {

auto Cache::record(const sourcemeta::jsontoolkit::JSON::String &value,
const std::uint64_t offset, const Type type) -> void {
// Encoding a shared string has some overhead, such as the
// shared string marker + the offset, so its not worth
// doing for strings that are too small.
constexpr auto MINIMUM_STRING_LENGTH{3};

// We don't want to allow the context to grow
// forever, otherwise an attacker could force the
// program to exhaust memory given an input
// document that contains a high number of large strings.
constexpr auto MAXIMUM_BYTE_SIZE{20971520};

const auto value_size{value.size()};
if (value_size < MINIMUM_STRING_LENGTH || value_size >= MAXIMUM_BYTE_SIZE) {
return;
}

// Remove the oldest entries to make space if needed
while (!this->data.empty() &&
this->byte_size + value_size >= MAXIMUM_BYTE_SIZE) {
this->remove_oldest();
}

auto result{this->data.insert({std::make_pair(value, type), offset})};
if (result.second) {
this->byte_size += value_size;
this->order.emplace(offset, result.first->first);
} else if (offset > result.first->second) {
this->order.erase(result.first->second);
// If the string already exists, we want to
// bump the offset for locality purposes.
result.first->second = offset;
this->order.emplace(offset, result.first->first);
}

// Otherwise we are doing something wrong
assert(this->order.size() == this->data.size());
}

auto Cache::remove_oldest() -> void {
assert(!this->data.empty());
// std::map are by definition ordered by key,
// so the begin iterator points to the entry
// with the lowest offset, a.k.a. the oldest.
const auto iterator{this->order.cbegin()};
this->byte_size -= iterator->second.get().first.size();
this->data.erase(iterator->second.get());
this->order.erase(iterator);
}

auto Cache::find(const sourcemeta::jsontoolkit::JSON::String &value,
const Type type) const -> std::optional<std::uint64_t> {
const auto result{this->data.find(std::make_pair(value, type))};
if (result == this->data.cend()) {
return std::nullopt;
}

return result->second;
}

} // namespace sourcemeta::jsonbinpack
5 changes: 2 additions & 3 deletions src/runtime/encoder_any.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ auto Encoder::ANY_PACKED_TYPE_TAG_BYTE_PREFIX(
} else if (document.is_string()) {
const sourcemeta::jsontoolkit::JSON::String value{document.to_string()};
const auto size{document.byte_size()};
const auto shared{this->context_.find(value, Context::Type::Standalone)};
const auto shared{this->cache_.find(value, Cache::Type::Standalone)};
if (size < uint_max<5>) {
const std::uint8_t type{shared.has_value() ? TYPE_SHARED_STRING
: TYPE_STRING};
Expand All @@ -119,8 +119,7 @@ auto Encoder::ANY_PACKED_TYPE_TAG_BYTE_PREFIX(
if (shared.has_value()) {
this->put_varint(this->position() - shared.value());
} else {
this->context_.record(value, this->position(),
Context::Type::Standalone);
this->cache_.record(value, this->position(), Cache::Type::Standalone);
this->put_string_utf8(value, size);
}
} else if (size >= uint_max<5> && size < uint_max<5> * 2 &&
Expand Down
24 changes: 12 additions & 12 deletions src/runtime/encoder_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ auto Encoder::FLOOR_VARINT_PREFIX_UTF8_STRING_SHARED(
const sourcemeta::jsontoolkit::JSON::String value{document.to_string()};
const auto size{value.size()};
assert(document.size() == size);
const auto shared{this->context_.find(value, Context::Type::Standalone)};
const auto shared{this->cache_.find(value, Cache::Type::Standalone)};

// (1) Write 0x00 if shared, else do nothing
if (shared.has_value()) {
Expand All @@ -35,7 +35,7 @@ auto Encoder::FLOOR_VARINT_PREFIX_UTF8_STRING_SHARED(
if (shared.has_value()) {
this->put_varint(this->position() - shared.value());
} else {
this->context_.record(value, this->position(), Context::Type::Standalone);
this->cache_.record(value, this->position(), Cache::Type::Standalone);
this->put_string_utf8(value, size);
}
}
Expand All @@ -48,7 +48,7 @@ auto Encoder::ROOF_VARINT_PREFIX_UTF8_STRING_SHARED(
const auto size{value.size()};
assert(document.size() == size);
assert(size <= options.maximum);
const auto shared{this->context_.find(value, Context::Type::Standalone)};
const auto shared{this->cache_.find(value, Cache::Type::Standalone)};

// (1) Write 0x00 if shared, else do nothing
if (shared.has_value()) {
Expand All @@ -62,7 +62,7 @@ auto Encoder::ROOF_VARINT_PREFIX_UTF8_STRING_SHARED(
if (shared.has_value()) {
this->put_varint(this->position() - shared.value());
} else {
this->context_.record(value, this->position(), Context::Type::Standalone);
this->cache_.record(value, this->position(), Cache::Type::Standalone);
this->put_string_utf8(value, size);
}
}
Expand All @@ -77,7 +77,7 @@ auto Encoder::BOUNDED_8BIT_PREFIX_UTF8_STRING_SHARED(
assert(options.minimum <= options.maximum);
assert(is_byte(options.maximum - options.minimum + 1));
assert(is_within(size, options.minimum, options.maximum));
const auto shared{this->context_.find(value, Context::Type::Standalone)};
const auto shared{this->cache_.find(value, Cache::Type::Standalone)};

// (1) Write 0x00 if shared, else do nothing
if (shared.has_value()) {
Expand All @@ -91,7 +91,7 @@ auto Encoder::BOUNDED_8BIT_PREFIX_UTF8_STRING_SHARED(
if (shared.has_value()) {
this->put_varint(this->position() - shared.value());
} else {
this->context_.record(value, this->position(), Context::Type::Standalone);
this->cache_.record(value, this->position(), Cache::Type::Standalone);
this->put_string_utf8(value, size);
}
}
Expand Down Expand Up @@ -127,22 +127,22 @@ auto Encoder::PREFIX_VARINT_LENGTH_STRING_SHARED(
const sourcemeta::jsontoolkit::JSON::String value{document.to_string()};

const auto shared{
this->context_.find(value, Context::Type::PrefixLengthVarintPlusOne)};
this->cache_.find(value, Cache::Type::PrefixLengthVarintPlusOne)};
if (shared.has_value()) {
const auto new_offset{this->position()};
this->put_byte(0);
this->put_varint(this->position() - shared.value());
// Bump the context cache for locality purposes
this->context_.record(value, new_offset,
Context::Type::PrefixLengthVarintPlusOne);
this->cache_.record(value, new_offset,
Cache::Type::PrefixLengthVarintPlusOne);
} else {
const auto size{value.size()};
assert(document.size() == size);
this->context_.record(value, this->position(),
Context::Type::PrefixLengthVarintPlusOne);
this->cache_.record(value, this->position(),
Cache::Type::PrefixLengthVarintPlusOne);
this->put_varint(size + 1);
// Also record a standalone variant of it
this->context_.record(value, this->position(), Context::Type::Standalone);
this->cache_.record(value, this->position(), Cache::Type::Standalone);
this->put_string_utf8(value, size);
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/runtime/include/sourcemeta/jsonbinpack/runtime_encoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#include "runtime_export.h"

#include <sourcemeta/jsonbinpack/runtime_encoder_context.h>
#include <sourcemeta/jsonbinpack/runtime_encoder_cache.h>
#include <sourcemeta/jsonbinpack/runtime_encoding.h>
#include <sourcemeta/jsonbinpack/runtime_output_stream.h>

Expand Down Expand Up @@ -66,7 +66,7 @@ class SOURCEMETA_JSONBINPACK_RUNTIME_EXPORT Encoder : private OutputStream {
#endif

private:
Context context_;
Cache cache_;
};

} // namespace sourcemeta::jsonbinpack
Expand Down
48 changes: 48 additions & 0 deletions src/runtime/include/sourcemeta/jsonbinpack/runtime_encoder_cache.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#ifndef SOURCEMETA_JSONBINPACK_RUNTIME_ENCODER_CACHE_H_
#define SOURCEMETA_JSONBINPACK_RUNTIME_ENCODER_CACHE_H_
#ifndef DOXYGEN

#include "runtime_export.h"

#include <sourcemeta/jsontoolkit/json.h>

#include <functional> // std::reference_wrapper
#include <map> // std::map
#include <optional> // std::optional
#include <utility> // std::pair

namespace sourcemeta::jsonbinpack {

class SOURCEMETA_JSONBINPACK_RUNTIME_EXPORT Cache {
public:
enum class Type { Standalone, PrefixLengthVarintPlusOne };
auto record(const sourcemeta::jsontoolkit::JSON::String &value,
const std::uint64_t offset, const Type type) -> void;
auto find(const sourcemeta::jsontoolkit::JSON::String &value,
const Type type) const -> std::optional<std::uint64_t>;

#ifndef DOXYGEN
// This method is considered private. We only expose it for testing purposes
auto remove_oldest() -> void;
#endif

private:
// Exporting symbols that depends on the standard C++ library is considered
// safe.
// https://learn.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4275?view=msvc-170&redirectedfrom=MSDN
#if defined(_MSC_VER)
#pragma warning(disable : 4251 4275)
#endif
std::uint64_t byte_size{0};
using Entry = std::pair<sourcemeta::jsontoolkit::JSON::String, Type>;
std::map<Entry, std::uint64_t> data;
std::map<std::uint64_t, std::reference_wrapper<const Entry>> order;
#if defined(_MSC_VER)
#pragma warning(default : 4251 4275)
#endif
};

} // namespace sourcemeta::jsonbinpack

#endif
#endif
120 changes: 0 additions & 120 deletions src/runtime/include/sourcemeta/jsonbinpack/runtime_encoder_context.h

This file was deleted.

2 changes: 1 addition & 1 deletion test/runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ add_executable(sourcemeta_jsonbinpack_runtime_unit
decode_utils.h
encode_any_test.cc
encode_array_test.cc
encode_context_test.cc
encode_cache_test.cc
encode_integer_test.cc
encode_number_test.cc
encode_object_test.cc
Expand Down
Loading

0 comments on commit 96fc75e

Please sign in to comment.