Skip to content

Commit

Permalink
Parquet: normalize dictionary encoding to use RLE_DICTIONARY
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed May 24, 2024
1 parent 3a4fcff commit 4d664b5
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 18 deletions.
11 changes: 3 additions & 8 deletions cpp/src/parquet/column_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -642,12 +642,6 @@ namespace {
// ----------------------------------------------------------------------
// Impl base class for TypedColumnReader and RecordReader

// Tells whether `e` is an encoding used for dictionary indices.
// RLE_DICTIONARY qualifies, and so does PLAIN_DICTIONARY: it is deprecated,
// but it was formerly used as a dictionary index encoding as well.
static bool IsDictionaryIndexEncoding(const Encoding::type& e) {
  if (e == Encoding::RLE_DICTIONARY) {
    return true;
  }
  return e == Encoding::PLAIN_DICTIONARY;
}

template <typename DType>
class ColumnReaderImplBase {
public:
Expand Down Expand Up @@ -876,8 +870,9 @@ class ColumnReaderImplBase {
}

Encoding::type encoding = page.encoding();

if (IsDictionaryIndexEncoding(encoding)) {
// Normalize the PLAIN_DICTIONARY encoding to RLE_DICTIONARY
// for the decoder.
encoding = Encoding::RLE_DICTIONARY;
}

Expand Down Expand Up @@ -950,7 +945,7 @@ class ColumnReaderImplBase {

/// Flag to signal when a new dictionary has been set, for the benefit of
/// DictionaryRecordReader
bool new_dictionary_;
bool new_dictionary_ = false;

// The exposed encoding
ExposedEncoding exposed_encoding_ = ExposedEncoding::NO_ENCODING;
Expand Down
8 changes: 2 additions & 6 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1205,10 +1205,6 @@ Status ConvertDictionaryToDense(const ::arrow::Array& array, MemoryPool* pool,
return Status::OK();
}

// Returns true when `encoding` is a dictionary index encoding.
//
// Both RLE_DICTIONARY and PLAIN_DICTIONARY are accepted. Matching only
// PLAIN_DICTIONARY would report "not dictionary encoded" for an encoder that
// identifies itself as RLE_DICTIONARY, which would skip the dictionary page
// flush on fallback and disable direct dictionary writes.
static inline bool IsDictionaryEncoding(Encoding::type encoding) {
  return encoding == Encoding::RLE_DICTIONARY ||
         encoding == Encoding::PLAIN_DICTIONARY;
}

template <typename DType>
class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<DType> {
public:
Expand Down Expand Up @@ -1565,7 +1561,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
}

void FallbackToPlainEncoding() {
if (IsDictionaryEncoding(current_encoder_->encoding())) {
if (IsDictionaryIndexEncoding(current_encoder_->encoding())) {
WriteDictionaryPage();
// Serialize the buffered Dictionary Indices
FlushBufferedDataPages();
Expand Down Expand Up @@ -1661,7 +1657,7 @@ Status TypedColumnWriterImpl<DType>::WriteArrowDictionary(
maybe_parent_nulls);
};

if (!IsDictionaryEncoding(current_encoder_->encoding()) ||
if (!IsDictionaryIndexEncoding(current_encoder_->encoding()) ||
!DictionaryDirectWriteSupported(array)) {
// No longer dictionary-encoding for whatever reason, maybe we never were
// or we decided to stop. Note that WriteArrow can be invoked multiple
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -442,12 +442,12 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> {
constexpr static int32_t kDataPageBitWidthBytes = 1;

explicit DictEncoderImpl(const ColumnDescriptor* desc, MemoryPool* pool)
: EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool),
: EncoderImpl(desc, Encoding::RLE_DICTIONARY, pool),
buffered_indices_(::arrow::stl::allocator<int32_t>(pool)),
dict_encoded_size_(0),
memo_table_(pool, kInitialHashTableSize) {}

~DictEncoderImpl() = default;
~DictEncoderImpl() override = default;

int dict_encoded_size() const override { return dict_encoded_size_; }

Expand Down
3 changes: 1 addition & 2 deletions cpp/src/parquet/file_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,7 @@ bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) {
}
// The following pages should be dictionary encoded data pages.
for (size_t idx = 1; idx < encoding_stats.size(); ++idx) {
if ((encoding_stats[idx].encoding != Encoding::RLE_DICTIONARY &&
encoding_stats[idx].encoding != Encoding::PLAIN_DICTIONARY) ||
if (!IsDictionaryIndexEncoding(encoding_stats[idx].encoding) ||
(encoding_stats[idx].page_type != PageType::DATA_PAGE &&
encoding_stats[idx].page_type != PageType::DATA_PAGE_V2)) {
// Return false if any following page is not a dictionary encoded data
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/parquet/properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -1180,4 +1180,10 @@ struct ArrowWriteContext {
PARQUET_EXPORT
std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();

// Reports whether `e` is one of the encodings used for dictionary indices:
// RLE_DICTIONARY, or PLAIN_DICTIONARY, which is deprecated but was formerly
// used as a dictionary index encoding too.
constexpr bool IsDictionaryIndexEncoding(Encoding::type e) {
  switch (e) {
    case Encoding::RLE_DICTIONARY:
    case Encoding::PLAIN_DICTIONARY:
      return true;
    default:
      return false;
  }
}

} // namespace parquet

0 comments on commit 4d664b5

Please sign in to comment.