Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Fix](Outfile) upgrade apache-arrow version to 13.0.0 #35142

Draft
wants to merge 3 commits into
base: branch-2.0
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions thirdparty/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,13 @@ Now there will be 2 set of libhdfs, one is without kerberos, the other is with k

## v20211215

## v20240521
- Modified: arrow 7.0.0 -> 13.0.0
- Modified: jemalloc for arrow 5.2.1 -> 5.3.0
- Modified: xsimd aeec9c87 (commit pinned for arrow 7.0.0) -> 9.0.1
- Added: c-ares -> 1.19.1
- Added: grpc -> 1.54.3

### Changes

- Added: cyrus-sasl
Expand Down
72 changes: 71 additions & 1 deletion thirdparty/build-thirdparty.sh
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,8 @@ build_arrow() {
export ARROW_ZLIB_URL="${TP_SOURCE_DIR}/${ZLIB_NAME}"
export ARROW_XSIMD_URL="${TP_SOURCE_DIR}/${XSIMD_NAME}"
export ARROW_ORC_URL="${TP_SOURCE_DIR}/${ORC_NAME}"
export ARROW_GRPC_URL="${TP_SOURCE_DIR}/${GRPC_NAME}"
export ARROW_PROTOBUF_URL="${TP_SOURCE_DIR}/${PROTOBUF_NAME}"

if [[ "${KERNEL}" != 'Darwin' ]]; then
ldflags="-L${TP_LIB_DIR} -static-libstdc++ -static-libgcc"
Expand All @@ -973,22 +975,38 @@ build_arrow() {
-DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
-DCMAKE_INSTALL_LIBDIR=lib64 \
-DARROW_BOOST_USE_SHARED=OFF \
-DARROW_WITH_GRPC=ON \
-DgRPC_SOURCE=SYSTEM \
-DgRPC_ROOT="${TP_INSTALL_DIR}" \
-DARROW_WITH_PROTOBUF=ON \
-DProtobuf_SOURCE=SYSTEM \
-DProtobuf_LIB="${TP_INSTALL_DIR}/lib/libprotoc.a" -DProtobuf_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \
-DARROW_FLIGHT=ON \
-DARROW_FLIGHT_SQL=ON \
-DBoost_USE_STATIC_RUNTIME=ON \
-DARROW_GFLAGS_USE_SHARED=OFF \
-Dgflags_ROOT="${TP_INSTALL_DIR}" \
-DGLOG_ROOT="${TP_INSTALL_DIR}" \
-DRE2_ROOT="${TP_INSTALL_DIR}" \
-DZLIB_SOURCE=SYSTEM \
-DZLIB_LIBRARY="${TP_INSTALL_DIR}/lib/libz.a" -DZLIB_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \
-DRapidJSON_SOURCE=SYSTEM \
-DRapidJSON_ROOT="${TP_INSTALL_DIR}" \
-DORC_ROOT="${TP_INSTALL_DIR}" \
-Dxsimd_SOURCE=BUNDLED \
-DBrotli_SOURCE=BUNDLED \
-DARROW_LZ4_USE_SHARED=OFF \
-DLZ4_LIB="${TP_INSTALL_DIR}/lib/liblz4.a" -DLZ4_INCLUDE_DIR="${TP_INSTALL_DIR}/include/lz4" \
-DLz4_SOURCE=SYSTEM \
-DARROW_ZSTD_USE_SHARED=OFF \
-DZSTD_LIB="${TP_INSTALL_DIR}/lib/libzstd.a" -DZSTD_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \
-Dzstd_SOURCE=SYSTEM \
-DSnappy_LIB="${TP_INSTALL_DIR}/lib/libsnappy.a" -DSnappy_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \
-DSnappy_SOURCE=SYSTEM \
-DBOOST_ROOT="${TP_INSTALL_DIR}" --no-warn-unused-cli \
-Djemalloc_SOURCE=BUNDLED \
-DARROW_THRIFT_USE_SHARED=OFF \
-DThrift_SOURCE=SYSTEM \
-DThrift_ROOT="${TP_INSTALL_DIR}" ..

"${BUILD_SYSTEM}" -j "${PARALLEL}"
Expand Down Expand Up @@ -1662,6 +1680,56 @@ build_libdeflate() {
"${BUILD_SYSTEM}" install
}

# c-ares
#
# Configure, build and install c-ares as a static, position-independent
# library.
# Globals (read): CARES_SOURCE, TP_SOURCE_DIR, TP_INSTALL_DIR, PARALLEL
# Side effects:   changes the working directory to <source>/build and
#                 installs into TP_INSTALL_DIR.
build_cares() {
    check_if_source_exist "${CARES_SOURCE}"
    cd "${TP_SOURCE_DIR}/${CARES_SOURCE}"

    # Out-of-tree build directory; -p keeps re-runs from failing.
    mkdir -p build
    cd build
    cmake -DCMAKE_BUILD_TYPE=Release \
        -DCARES_STATIC=ON \
        -DCARES_SHARED=OFF \
        -DCARES_STATIC_PIC=ON \
        -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" ..
    # Honour the configured parallelism, as build_grpc and build_arrow do,
    # instead of compiling serially.
    make -j "${PARALLEL}"
    make install
}

# grpc
#
# Configure, build and install gRPC from ${TP_SOURCE_DIR}/${GRPC_SOURCE}.
# Every dependency provider that is configured here (c-ares, abseil,
# protobuf, re2, OpenSSL, zlib) is set to "package" and pointed at
# TP_INSTALL_DIR.
# Globals (read): GRPC_SOURCE, TP_SOURCE_DIR, TP_INSTALL_DIR, PARALLEL
# Side effects:   changes the working directory to <source>/cmake/build and
#                 installs into TP_INSTALL_DIR.
build_grpc() {
    local install_prefix="${TP_INSTALL_DIR}"
    local build_subdir='cmake/build'
    local -a configure_args

    check_if_source_exist "${GRPC_SOURCE}"
    cd "${TP_SOURCE_DIR}/${GRPC_SOURCE}"

    mkdir -p "${build_subdir}"
    cd "${build_subdir}"

    # Argument order is kept exactly as cmake has always received it.
    configure_args=(
        -DgRPC_INSTALL=ON
        -DgRPC_BUILD_TESTS=OFF
        -DCMAKE_BUILD_TYPE=Release
        -DCMAKE_INSTALL_PREFIX="${install_prefix}"
        -DgRPC_CARES_PROVIDER=package
        -Dc-ares_DIR="${install_prefix}"
        -DgRPC_ABSL_PROVIDER=package
        -Dabsl_DIR="${install_prefix}"
        -DgRPC_PROTOBUF_PROVIDER=package
        -DProtobuf_DIR="${install_prefix}"
        -DgRPC_RE2_PROVIDER=package
        -Dre2_DIR:STRING="${install_prefix}"
        -DgRPC_SSL_PROVIDER=package
        -DOPENSSL_ROOT_DIR="${install_prefix}"
        -DgRPC_ZLIB_PROVIDER=package
        -DZLIB_ROOT="${install_prefix}"
        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    )
    # The source root is two levels above cmake/build.
    cmake "${configure_args[@]}" ../..

    make -j "${PARALLEL}"
    make install

    # Kept disabled: workaround for grpc > v1.55, where the installed
    # gRPCConfig.cmake uses find_dependency. The original note blames
    # "cmake 2.22" -- NOTE(review): that version number looks like a typo,
    # confirm. Delete this after the cmake version is upgraded.
    # sed -i 's/find_dependency/find_package/g' "${TP_INSTALL_DIR}"/lib64/cmake/grpc/gRPCConfig.cmake
}

if [[ "${#packages[@]}" -eq 0 ]]; then
packages=(
libunixodbc
Expand All @@ -1673,9 +1741,9 @@ if [[ "${#packages[@]}" -eq 0 ]]; then
lzo2
zstd
boost # must before thrift
protobuf
gflags
gtest
protobuf # after gtest
glog
rapidjson
snappy
Expand All @@ -1693,6 +1761,8 @@ if [[ "${#packages[@]}" -eq 0 ]]; then
librdkafka
flatbuffers
orc
cares
grpc # after cares, protobuf
arrow
abseil
s2
Expand Down
4 changes: 2 additions & 2 deletions thirdparty/download-thirdparty.sh
Original file line number Diff line number Diff line change
Expand Up @@ -323,10 +323,10 @@ fi
echo "Finished patching ${OPENTELEMETRY_SOURCE}"

# arrow patch is used to get the raw orc reader for filter prune.
if [[ "${ARROW_SOURCE}" == "apache-arrow-7.0.0" ]]; then
if [[ "${ARROW_SOURCE}" == "arrow-apache-arrow-13.0.0" ]]; then
cd "${TP_SOURCE_DIR}/${ARROW_SOURCE}"
if [[ ! -f "${PATCHED_MARK}" ]]; then
patch -p1 <"${TP_PATCH_DIR}/apache-arrow-7.0.0.patch"
patch -p1 <"${TP_PATCH_DIR}/apache-arrow-13.0.0.patch"
touch "${PATCHED_MARK}"
fi
cd -
Expand Down
120 changes: 120 additions & 0 deletions thirdparty/patches/apache-arrow-13.0.0.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 2466e7433..46b4402d4 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -47,9 +47,6 @@
#include "arrow/util/visibility.h"
#include "orc/Exceptions.hh"

-// alias to not interfere with nested orc namespace
-namespace liborc = orc;
-
#define ORC_THROW_NOT_OK(s) \
do { \
Status _s = (s); \
@@ -202,6 +199,8 @@ class ORCFileReader::Impl {
return Init();
}

+ virtual liborc::Reader* GetRawORCReader() { return reader_.get(); }
+
Status Init() {
int64_t nstripes = reader_->getNumberOfStripes();
stripes_.resize(nstripes);
@@ -479,6 +478,31 @@ class ORCFileReader::Impl {
return Status::OK();
}

+ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names) {
+ if (current_row_ >= NumberOfRows()) {
+ return nullptr;
+ }
+
+ liborc::RowReaderOptions opts = default_row_reader_options();
+ if (!include_names.empty()) {
+ RETURN_NOT_OK(SelectNames(&opts, include_names));
+ }
+ StripeInformation stripe_info({0, 0, 0, 0});
+ RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info));
+ ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts));
+ std::unique_ptr<liborc::RowReader> row_reader;
+
+ ORC_BEGIN_CATCH_NOT_OK
+ row_reader = reader_->createRowReader(opts);
+ row_reader->seekToRow(current_row_);
+ current_row_ = stripe_info.first_row_id + stripe_info.num_rows;
+ ORC_END_CATCH_NOT_OK
+
+ return std::make_shared<OrcStripeReader>(std::move(row_reader), schema, batch_size,
+ pool_);
+ }
+
Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
int64_t batch_size, const std::vector<int>& include_indices) {
if (current_row_ >= NumberOfRows()) {
@@ -544,6 +568,8 @@ Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open(
return std::move(result);
}

+liborc::Reader* ORCFileReader::GetRawORCReader() { return impl_->GetRawORCReader(); }
+
Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() {
return impl_->ReadMetadata();
}
@@ -605,6 +631,11 @@ Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
return impl_->NextStripeReader(batch_size, include_indices);
}

+Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names) {
+ return impl_->NextStripeReader(batch_size, include_names);
+}
+
int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }

int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h
index 013be7860..7fd06bcb8 100644
--- a/cpp/src/arrow/adapters/orc/adapter.h
+++ b/cpp/src/arrow/adapters/orc/adapter.h
@@ -30,6 +30,10 @@
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
+#include "orc/Reader.hh"
+
+// alias to not interfere with nested orc namespace
+namespace liborc = orc;

namespace arrow {
namespace adapters {
@@ -53,6 +57,9 @@ class ARROW_EXPORT ORCFileReader {
public:
~ORCFileReader();

+ /// \brief Get ORC reader from inside.
+ liborc::Reader* GetRawORCReader();
+
/// \brief Creates a new ORC reader
///
/// \param[in] file the data source
@@ -174,6 +181,19 @@ class ARROW_EXPORT ORCFileReader {
Result<std::shared_ptr<RecordBatchReader>> GetRecordBatchReader(
int64_t batch_size, const std::vector<std::string>& include_names);

+ /// \brief Get a stripe level record batch iterator with specified row count
+ /// in each record batch. NextStripeReader serves as a fine grain
+ /// alternative to ReadStripe which may cause OOM issue by loading
+ /// the whole stripes into memory.
+ ///
+ /// \param[in] batch_size Get a stripe level record batch iterator with specified row
+ /// count in each record batch.
+ ///
+ /// \param[in] include_names the selected field names to read
+ /// \return the returned stripe reader
+ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names);
+
/// \brief The number of stripes in the file
int64_t NumberOfStripes();
39 changes: 27 additions & 12 deletions thirdparty/vars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -238,11 +238,24 @@ FLATBUFFERS_NAME=flatbuffers-2.0.0.tar.gz
FLATBUFFERS_SOURCE=flatbuffers-2.0.0
FLATBUFFERS_MD5SUM="a27992324c3cbf86dd888268a23d17bd"

# c-ares
CARES_DOWNLOAD="https://github.com/c-ares/c-ares/releases/download/cares-1_19_1/c-ares-1.19.1.tar.gz"
CARES_NAME="c-ares-1.19.1.tar.gz"
CARES_SOURCE=c-ares-1.19.1
CARES_MD5SUM="dafc5825a92dc907e144570e4e75a908"

# grpc
# grpc v1.55 and above require protobuf >= 22
GRPC_DOWNLOAD="https://github.com/grpc/grpc/archive/refs/tags/v1.54.3.tar.gz"
GRPC_NAME="grpc-v1.54.3.tar.gz"
GRPC_SOURCE=grpc-1.54.3
GRPC_MD5SUM="af00a2edeae0f02bb25917cc3473b7de"

# arrow
ARROW_DOWNLOAD="https://archive.apache.org/dist/arrow/arrow-7.0.0/apache-arrow-7.0.0.tar.gz"
ARROW_NAME="apache-arrow-7.0.0.tar.gz"
ARROW_SOURCE="apache-arrow-7.0.0"
ARROW_MD5SUM="316ade159901646849b3b4760fa52816"
ARROW_DOWNLOAD="https://github.com/apache/arrow/archive/refs/tags/apache-arrow-13.0.0.tar.gz"
ARROW_NAME="apache-arrow-13.0.0.tar.gz"
ARROW_SOURCE="arrow-apache-arrow-13.0.0"
ARROW_MD5SUM="8ec1ec6a119514bcaea1cf7aabc9df1f"

# Abseil
ABSEIL_DOWNLOAD="https://github.com/abseil/abseil-cpp/archive/refs/tags/20220623.1.tar.gz"
Expand Down Expand Up @@ -287,10 +300,10 @@ ORC_SOURCE=orc-1.7.2
ORC_MD5SUM="6cab37935eacdec7d078d327746a8578"

# jemalloc for arrow
JEMALLOC_ARROW_DOWNLOAD="https://github.com/jemalloc/jemalloc/releases/download/5.2.1/jemalloc-5.2.1.tar.bz2"
JEMALLOC_ARROW_NAME="jemalloc-5.2.1.tar.bz2"
JEMALLOC_ARROW_SOURCE="jemalloc-5.2.1"
JEMALLOC_ARROW_MD5SUM="3d41fbf006e6ebffd489bdb304d009ae"
JEMALLOC_ARROW_DOWNLOAD="https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2"
JEMALLOC_ARROW_NAME="jemalloc-5.3.0.tar.bz2"
JEMALLOC_ARROW_SOURCE="jemalloc-5.3.0"
JEMALLOC_ARROW_MD5SUM="09a8328574dab22a7df848eae6dbbf53"

# jemalloc for doris
JEMALLOC_DORIS_DOWNLOAD="https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2"
Expand Down Expand Up @@ -399,10 +412,10 @@ BENCHMARK_MD5SUM="8ddf8571d3f6198d37852bcbd964f817"

# xsimd
# for arrow-13.0.0; if arrow is upgraded, this version may also need to be changed
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

change comment

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

XSIMD_DOWNLOAD="https://github.com/xtensor-stack/xsimd/archive/aeec9c872c8b475dedd7781336710f2dd2666cb2.tar.gz"
XSIMD_NAME=xsimd-aeec9c872c8b475dedd7781336710f2dd2666cb2.tar.gz
XSIMD_SOURCE=xsimd-aeec9c872c8b475dedd7781336710f2dd2666cb2
XSIMD_MD5SUM="d024855f71c0a2837a6918c0f8f66245"
XSIMD_DOWNLOAD="https://github.com/xtensor-stack/xsimd/archive/refs/tags/9.0.1.tar.gz"
XSIMD_NAME="xsimd-9.0.1.tar.gz"
XSIMD_SOURCE=xsimd-9.0.1
XSIMD_MD5SUM="59f38fe3364acd7ed137771258812d6c"

# simdjson
SIMDJSON_DOWNLOAD="https://github.com/simdjson/simdjson/archive/refs/tags/v3.0.1.tar.gz"
Expand Down Expand Up @@ -505,6 +518,8 @@ export TP_ARCHIVES=(
'CYRUS_SASL'
'LIBRDKAFKA'
'FLATBUFFERS'
'CARES'
'GRPC'
'ARROW'
'BROTLI'
'ZSTD'
Expand Down
Loading