Skip to content

Commit

Permalink
maybe add to python bindings
Browse files Browse the repository at this point in the history
  • Loading branch information
paleolimbot committed Jun 21, 2024
1 parent cdf4b88 commit 19cedac
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 1 deletion.
2 changes: 2 additions & 0 deletions python/pyarrow/_parquet.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,8 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
CCacheOptions cache_options() const
void set_coerce_int96_timestamp_unit(TimeUnit unit)
TimeUnit coerce_int96_timestamp_unit() const
void set_convert_unknown_logical_types(c_bool convert_unknown_logical_types)
c_bool convert_unknown_logical_types() const

ArrowReaderProperties default_arrow_reader_properties()

Expand Down
6 changes: 5 additions & 1 deletion python/pyarrow/_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1428,7 +1428,8 @@ cdef class ParquetReader(_Weakrefable):
FileDecryptionProperties decryption_properties=None,
thrift_string_size_limit=None,
thrift_container_size_limit=None,
page_checksum_verification=False):
page_checksum_verification=False,
convert_unknown_logical_types=False):
"""
Open a parquet file for reading.
Expand All @@ -1445,6 +1446,7 @@ cdef class ParquetReader(_Weakrefable):
thrift_string_size_limit : int, optional
thrift_container_size_limit : int, optional
page_checksum_verification : bool, default False
convert_unknown_logical_types : bool, default False
"""
cdef:
shared_ptr[CFileMetaData] c_metadata
Expand Down Expand Up @@ -1491,6 +1493,8 @@ cdef class ParquetReader(_Weakrefable):
arrow_props.set_coerce_int96_timestamp_unit(
string_to_timeunit(coerce_int96_timestamp_unit))

arrow_props.set_convert_unknown_logical_types(convert_unknown_logical_types)

self.source = source
get_reader(source, use_memory_map, &self.rd_handle)

Expand Down
19 changes: 19 additions & 0 deletions python/pyarrow/parquet/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,10 @@ class ParquetFile:
resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
and therefore INT96 timestamps will be inferred as timestamps
in nanoseconds.
convert_unknown_logical_types : bool, default False
When enabled, the Arrow reader will use the underlying physical type
of a logical type that it does not recognize (e.g., one that was added
to the spec but not implemented in Parquet C++).
decryption_properties : FileDecryptionProperties, default None
File decryption properties for Parquet Modular Encryption.
thrift_string_size_limit : int, default None
Expand Down Expand Up @@ -301,6 +305,7 @@ class ParquetFile:
def __init__(self, source, *, metadata=None, common_metadata=None,
read_dictionary=None, memory_map=False, buffer_size=0,
pre_buffer=False, coerce_int96_timestamp_unit=None,
convert_unknown_logical_types=False,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None, filesystem=None,
page_checksum_verification=False):
Expand All @@ -319,6 +324,7 @@ def __init__(self, source, *, metadata=None, common_metadata=None,
buffer_size=buffer_size, pre_buffer=pre_buffer,
read_dictionary=read_dictionary, metadata=metadata,
coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
convert_unknown_logical_types=convert_unknown_logical_types,
decryption_properties=decryption_properties,
thrift_string_size_limit=thrift_string_size_limit,
thrift_container_size_limit=thrift_container_size_limit,
Expand Down Expand Up @@ -1232,6 +1238,10 @@ class ParquetDataset:
Cast timestamps that are stored in INT96 format to a particular resolution
(e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96
timestamps will be inferred as timestamps in nanoseconds.
convert_unknown_logical_types : bool, default False
When enabled, the Arrow reader will use the underlying physical type
of a logical type that it does not recognize (e.g., one that was added
to the spec but not implemented in Parquet C++).
decryption_properties : FileDecryptionProperties or None
File-level decryption properties.
The decryption properties can be created using
Expand All @@ -1258,6 +1268,7 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None,
read_dictionary=None, memory_map=False, buffer_size=None,
partitioning="hive", ignore_prefixes=None, pre_buffer=True,
coerce_int96_timestamp_unit=None,
convert_unknown_logical_types=False,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None,
page_checksum_verification=False,
Expand All @@ -1275,6 +1286,7 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None,
read_options = {
"pre_buffer": pre_buffer,
"coerce_int96_timestamp_unit": coerce_int96_timestamp_unit,
"convert_unknown_logical_types": convert_unknown_logical_types,
"thrift_string_size_limit": thrift_string_size_limit,
"thrift_container_size_limit": thrift_container_size_limit,
"page_checksum_verification": page_checksum_verification,
Expand Down Expand Up @@ -1653,6 +1665,10 @@ def partitioning(self):
resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
and therefore INT96 timestamps will be inferred as timestamps
in nanoseconds.
convert_unknown_logical_types : bool, default False
When enabled, the Arrow reader will use the underlying physical type
of a logical type that it does not recognize (e.g., one that was added
to the spec but not implemented in Parquet C++).
decryption_properties : FileDecryptionProperties or None
File-level decryption properties.
The decryption properties can be created using
Expand Down Expand Up @@ -1760,6 +1776,7 @@ def read_table(source, *, columns=None, use_threads=True,
filesystem=None, filters=None, use_legacy_dataset=None,
ignore_prefixes=None, pre_buffer=True,
coerce_int96_timestamp_unit=None,
convert_unknown_logical_types=False,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None,
page_checksum_verification=False):
Expand All @@ -1783,6 +1800,7 @@ def read_table(source, *, columns=None, use_threads=True,
ignore_prefixes=ignore_prefixes,
pre_buffer=pre_buffer,
coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
convert_unknown_logical_types=convert_unknown_logical_types,
decryption_properties=decryption_properties,
thrift_string_size_limit=thrift_string_size_limit,
thrift_container_size_limit=thrift_container_size_limit,
Expand Down Expand Up @@ -1815,6 +1833,7 @@ def read_table(source, *, columns=None, use_threads=True,
memory_map=memory_map, buffer_size=buffer_size,
pre_buffer=pre_buffer,
coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
convert_unknown_logical_types=convert_unknown_logical_types,
decryption_properties=decryption_properties,
thrift_string_size_limit=thrift_string_size_limit,
thrift_container_size_limit=thrift_container_size_limit,
Expand Down

0 comments on commit 19cedac

Please sign in to comment.