-
Notifications
You must be signed in to change notification settings - Fork 3.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ #41494
Changes from 11 commits
a7c33c5
c340833
ec6f02b
fa91a93
f096999
726afec
1410185
63dea5a
7ad1977
c3a2136
f9b2b3c
5f2382e
0390e9e
f65dbc1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -108,25 +108,6 @@ if(UNIX) | |
endif() | ||
endif() | ||
|
||
# Top level cmake dir | ||
if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") | ||
option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF) | ||
option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF) | ||
option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF) | ||
option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF) | ||
option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF) | ||
option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF) | ||
option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) | ||
option(PYARROW_BUILD_PARQUET_ENCRYPTION | ||
"Build the PyArrow Parquet encryption integration" OFF) | ||
option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) | ||
option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) | ||
option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) | ||
set(PYARROW_CXXFLAGS | ||
"" | ||
CACHE STRING "Compiler flags to append when compiling Arrow") | ||
endif() | ||
|
||
find_program(CCACHE_FOUND ccache) | ||
if(CCACHE_FOUND | ||
AND NOT CMAKE_C_COMPILER_LAUNCHER | ||
|
@@ -265,11 +246,77 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}") | |
|
||
include(UseCython) | ||
|
||
# PyArrow C++ | ||
# Arrow C++ and set default PyArrow build options | ||
include(GNUInstallDirs) | ||
|
||
find_package(Arrow REQUIRED) | ||
|
||
macro(define_option name description arrow_option) | ||
set("PYARROW_${name}" | ||
"AUTO" | ||
CACHE STRING ${description}) | ||
|
||
if("${PYARROW_${name}}" STREQUAL "AUTO") | ||
# by default, first check if env variable exists, otherwise use Arrow C++ config | ||
set(env_variable "PYARROW_WITH_${name}") | ||
if(DEFINED ENV{${env_variable}}) | ||
message(STATUS "Env variable is defined: ${env_variable}=$ENV{${env_variable}}") | ||
if($ENV{${env_variable}}) | ||
set("PYARROW_BUILD_${name}" ON) | ||
message(STATUS "Setting ${name} to ON through env variable") | ||
else() | ||
set("PYARROW_BUILD_${name}" OFF) | ||
message(STATUS "Setting ${name} to OFF through env variable") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I made this quite verbose, mostly for testing / debugging purposes, but I can trim most of the messages after everything is working if that is preferred. |
||
endif() | ||
else() | ||
if(${arrow_option}) | ||
set("PYARROW_BUILD_${name}" ON) | ||
message(STATUS "Setting ${name} to ON through Arrow C++ config") | ||
else() | ||
set("PYARROW_BUILD_${name}" OFF) | ||
message(STATUS "Setting ${name} to OFF through Arrow C++ config") | ||
endif() | ||
endif() | ||
else() | ||
if("${PYARROW_${name}}") | ||
set("PYARROW_BUILD_${name}" ON) | ||
message(STATUS "Setting ${name} to ON through CMake option") | ||
else() | ||
set("PYARROW_BUILD_${name}" OFF) | ||
message(STATUS "Setting ${name} to OFF through CMake option") | ||
endif() | ||
endif() | ||
endmacro() | ||
|
||
define_option(ACERO "Build the PyArrow Acero integration" ARROW_ACERO) | ||
define_option(CUDA "Build the PyArrow CUDA support" ARROW_CUDA) | ||
define_option(DATASET "Build the PyArrow Dataset integration" ARROW_DATASET) | ||
define_option(FLIGHT "Build the PyArrow Flight integration" ARROW_FLIGHT) | ||
define_option(GANDIVA "Build the PyArrow Gandiva integration" ARROW_GANDIVA) | ||
define_option(ORC "Build the PyArrow ORC integration" ARROW_ORC) | ||
define_option(PARQUET "Build the PyArrow Parquet integration" ARROW_PARQUET) | ||
define_option(PARQUET_ENCRYPTION "Build the PyArrow Parquet encryption integration" | ||
PARQUET_REQUIRE_ENCRYPTION) | ||
define_option(SUBSTRAIT "Build the PyArrow Substrait integration" ARROW_SUBSTRAIT) | ||
define_option(AZURE "Build the PyArrow Azure integration" ARROW_AZURE) | ||
define_option(GCS "Build the PyArrow GCS integration" ARROW_GCS) | ||
define_option(S3 "Build the PyArrow S3 integration" ARROW_S3) | ||
define_option(HDFS "Build the PyArrow HDFS integration" ARROW_HDFS) | ||
option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) | ||
option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) | ||
option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) | ||
set(PYARROW_CXXFLAGS | ||
"" | ||
CACHE STRING "Compiler flags to append when compiling PyArrow C++") | ||
|
||
# enforce module dependencies | ||
if(PYARROW_BUILD_SUBSTRAIT) | ||
set(PYARROW_BUILD_DATASET ON) | ||
endif() | ||
if(PYARROW_BUILD_DATASET) | ||
set(PYARROW_BUILD_ACERO ON) | ||
endif() | ||
Comment on lines
+304
to
+310
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is to ensure we keep the same logic as we have now in setup.py, but we could in theory also leave this out (and ensure the scripts don't explicitly disable Acero when enabling Dataset). |
||
|
||
# PyArrow C++ | ||
set(PYARROW_CPP_ROOT_DIR pyarrow/src) | ||
set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python) | ||
set(PYARROW_CPP_SRCS | ||
|
@@ -305,6 +352,7 @@ set(PYARROW_CPP_LINK_LIBS "") | |
|
||
# Check all the options from Arrow and PyArrow C++ to be in line | ||
if(PYARROW_BUILD_DATASET) | ||
message(STATUS "Building PyArrow with Dataset") | ||
if(NOT ARROW_DATASET) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_DATASET=ON") | ||
endif() | ||
|
@@ -317,6 +365,7 @@ if(PYARROW_BUILD_DATASET) | |
endif() | ||
|
||
if(PYARROW_BUILD_ACERO) | ||
message(STATUS "Building PyArrow with Acero") | ||
if(NOT ARROW_ACERO) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ACERO=ON") | ||
endif() | ||
|
@@ -329,18 +378,13 @@ if(PYARROW_BUILD_ACERO) | |
endif() | ||
|
||
if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION) | ||
message(STATUS "Building PyArrow with Parquet") | ||
if(NOT ARROW_PARQUET) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_PARQUET=ON") | ||
endif() | ||
find_package(Parquet REQUIRED) | ||
endif() | ||
|
||
if(PYARROW_BUILD_HDFS) | ||
if(NOT ARROW_HDFS) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") | ||
endif() | ||
endif() | ||
|
||
# Check for only Arrow C++ options | ||
if(ARROW_CSV) | ||
list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/csv.cc) | ||
|
@@ -400,6 +444,7 @@ endif() | |
|
||
set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc) | ||
if(PYARROW_BUILD_FLIGHT) | ||
message(STATUS "Building PyArrow with Flight") | ||
if(NOT ARROW_FLIGHT) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_FLIGHT=ON") | ||
endif() | ||
|
@@ -555,23 +600,39 @@ set_source_files_properties(pyarrow/lib.pyx PROPERTIES CYTHON_API TRUE) | |
set(LINK_LIBS arrow_python) | ||
|
||
if(PYARROW_BUILD_AZURE) | ||
message(STATUS "Building PyArrow with Azure") | ||
if(NOT ARROW_AZURE) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_AZURE=ON") | ||
endif() | ||
list(APPEND CYTHON_EXTENSIONS _azurefs) | ||
endif() | ||
|
||
if(PYARROW_BUILD_GCS) | ||
message(STATUS "Building PyArrow with GCS") | ||
if(NOT ARROW_GCS) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_GCS=ON") | ||
endif() | ||
list(APPEND CYTHON_EXTENSIONS _gcsfs) | ||
endif() | ||
|
||
if(PYARROW_BUILD_S3) | ||
message(STATUS "Building PyArrow with S3") | ||
if(NOT ARROW_S3) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_S3=ON") | ||
endif() | ||
list(APPEND CYTHON_EXTENSIONS _s3fs) | ||
endif() | ||
|
||
if(PYARROW_BUILD_HDFS) | ||
message(STATUS "Building PyArrow with HDFS") | ||
if(NOT ARROW_HDFS) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") | ||
endif() | ||
list(APPEND CYTHON_EXTENSIONS _hdfs) | ||
endif() | ||
|
||
if(PYARROW_BUILD_CUDA) | ||
# Arrow CUDA | ||
message(STATUS "Building PyArrow with CUDA") | ||
if(NOT ARROW_CUDA) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_CUDA=ON") | ||
endif() | ||
|
@@ -646,8 +707,9 @@ if(PYARROW_BUILD_PARQUET) | |
endif() | ||
endif() | ||
|
||
# ORC | ||
if(PYARROW_BUILD_ORC) | ||
# ORC | ||
message(STATUS "Building PyArrow with ORC") | ||
if(NOT ARROW_ORC) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ORC=ON") | ||
endif() | ||
|
@@ -679,6 +741,7 @@ endif() | |
|
||
# Substrait | ||
if(PYARROW_BUILD_SUBSTRAIT) | ||
message(STATUS "Building PyArrow with Substrait") | ||
if(NOT ARROW_SUBSTRAIT) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_SUBSTRAIT=ON") | ||
endif() | ||
|
@@ -696,6 +759,7 @@ endif() | |
|
||
# Gandiva | ||
if(PYARROW_BUILD_GANDIVA) | ||
message(STATUS "Building PyArrow with Gandiva") | ||
if(NOT ARROW_GANDIVA) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_GANDIVA=ON") | ||
endif() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -152,48 +152,27 @@ def initialize_options(self): | |
if not hasattr(sys, 'gettotalrefcount'): | ||
self.build_type = 'release' | ||
|
||
self.with_azure = strtobool( | ||
os.environ.get('PYARROW_WITH_AZURE', '0')) | ||
self.with_gcs = strtobool( | ||
os.environ.get('PYARROW_WITH_GCS', '0')) | ||
self.with_s3 = strtobool( | ||
os.environ.get('PYARROW_WITH_S3', '0')) | ||
self.with_hdfs = strtobool( | ||
os.environ.get('PYARROW_WITH_HDFS', '0')) | ||
self.with_cuda = strtobool( | ||
os.environ.get('PYARROW_WITH_CUDA', '0')) | ||
self.with_substrait = strtobool( | ||
os.environ.get('PYARROW_WITH_SUBSTRAIT', '0')) | ||
self.with_flight = strtobool( | ||
os.environ.get('PYARROW_WITH_FLIGHT', '0')) | ||
self.with_acero = strtobool( | ||
os.environ.get('PYARROW_WITH_ACERO', '0')) | ||
self.with_dataset = strtobool( | ||
os.environ.get('PYARROW_WITH_DATASET', '0')) | ||
self.with_parquet = strtobool( | ||
os.environ.get('PYARROW_WITH_PARQUET', '0')) | ||
self.with_parquet_encryption = strtobool( | ||
os.environ.get('PYARROW_WITH_PARQUET_ENCRYPTION', '0')) | ||
self.with_orc = strtobool( | ||
os.environ.get('PYARROW_WITH_ORC', '0')) | ||
self.with_gandiva = strtobool( | ||
os.environ.get('PYARROW_WITH_GANDIVA', '0')) | ||
self.with_azure = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we still need There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, they are still used in case someone is passing that to setup.py. See #41494 (comment) I think that's something we should deprecate, though. But planning to do a separate follow-up PR to that, since it's not actually related to this PR (it's just another way that someone can right now override the default, just as setting an environment variable) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, we need them for |
||
self.with_gcs = None | ||
self.with_s3 = None | ||
self.with_hdfs = None | ||
self.with_cuda = None | ||
self.with_substrait = None | ||
self.with_flight = None | ||
self.with_acero = None | ||
self.with_dataset = None | ||
self.with_parquet = None | ||
self.with_parquet_encryption = None | ||
self.with_orc = None | ||
self.with_gandiva = None | ||
|
||
self.generate_coverage = strtobool( | ||
os.environ.get('PYARROW_GENERATE_COVERAGE', '0')) | ||
self.bundle_arrow_cpp = strtobool( | ||
os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0')) | ||
self.bundle_cython_cpp = strtobool( | ||
os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0')) | ||
|
||
self.with_parquet_encryption = (self.with_parquet_encryption and | ||
self.with_parquet) | ||
|
||
# enforce module dependencies | ||
if self.with_substrait: | ||
self.with_dataset = True | ||
if self.with_dataset: | ||
self.with_acero = True | ||
|
||
CYTHON_MODULE_NAMES = [ | ||
'lib', | ||
'_fs', | ||
|
@@ -270,23 +249,30 @@ def append_cmake_bool(value, varname): | |
cmake_options.append('-D{0}={1}'.format( | ||
varname, 'on' if value else 'off')) | ||
|
||
def append_cmake_component(flag, varname): | ||
# only pass this to cmake if the user passes the --with-component | ||
# flag to setup.py build_ext | ||
if flag is not None: | ||
append_cmake_bool(flag, varname) | ||
|
||
if self.cmake_generator: | ||
cmake_options += ['-G', self.cmake_generator] | ||
|
||
append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA') | ||
append_cmake_bool(self.with_substrait, 'PYARROW_BUILD_SUBSTRAIT') | ||
append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT') | ||
append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA') | ||
append_cmake_bool(self.with_acero, 'PYARROW_BUILD_ACERO') | ||
append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET') | ||
append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC') | ||
append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET') | ||
append_cmake_bool(self.with_parquet_encryption, | ||
'PYARROW_BUILD_PARQUET_ENCRYPTION') | ||
append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE') | ||
append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS') | ||
append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3') | ||
append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS') | ||
append_cmake_component(self.with_cuda, 'PYARROW_CUDA') | ||
append_cmake_component(self.with_substrait, 'PYARROW_SUBSTRAIT') | ||
append_cmake_component(self.with_flight, 'PYARROW_FLIGHT') | ||
append_cmake_component(self.with_gandiva, 'PYARROW_GANDIVA') | ||
append_cmake_component(self.with_acero, 'PYARROW_ACERO') | ||
append_cmake_component(self.with_dataset, 'PYARROW_DATASET') | ||
append_cmake_component(self.with_orc, 'PYARROW_ORC') | ||
append_cmake_component(self.with_parquet, 'PYARROW_PARQUET') | ||
append_cmake_component(self.with_parquet_encryption, | ||
'PYARROW_PARQUET_ENCRYPTION') | ||
append_cmake_component(self.with_azure, 'PYARROW_AZURE') | ||
append_cmake_component(self.with_gcs, 'PYARROW_GCS') | ||
append_cmake_component(self.with_s3, 'PYARROW_S3') | ||
append_cmake_component(self.with_hdfs, 'PYARROW_HDFS') | ||
|
||
append_cmake_bool(self.bundle_arrow_cpp, | ||
'PYARROW_BUNDLE_ARROW_CPP') | ||
append_cmake_bool(self.bundle_cython_cpp, | ||
|
@@ -329,54 +315,8 @@ def append_cmake_bool(value, varname): | |
self._found_names = [] | ||
for name in self.CYTHON_MODULE_NAMES: | ||
built_path = pjoin(install_prefix, name + ext_suffix) | ||
if not os.path.exists(built_path): | ||
print(f'Did not find {built_path}') | ||
if self._failure_permitted(name): | ||
print(f'Cython module {name} failure permitted') | ||
continue | ||
raise RuntimeError('PyArrow C-extension failed to build:', | ||
os.path.abspath(built_path)) | ||
|
||
self._found_names.append(name) | ||
|
||
def _failure_permitted(self, name): | ||
if name == '_parquet' and not self.with_parquet: | ||
return True | ||
if name == '_parquet_encryption' and not self.with_parquet_encryption: | ||
return True | ||
if name == '_orc' and not self.with_orc: | ||
return True | ||
if name == '_flight' and not self.with_flight: | ||
return True | ||
if name == '_substrait' and not self.with_substrait: | ||
return True | ||
if name == '_azurefs' and not self.with_azure: | ||
return True | ||
if name == '_gcsfs' and not self.with_gcs: | ||
return True | ||
if name == '_s3fs' and not self.with_s3: | ||
return True | ||
if name == '_hdfs' and not self.with_hdfs: | ||
return True | ||
if name == '_dataset' and not self.with_dataset: | ||
return True | ||
if name == '_acero' and not self.with_acero: | ||
return True | ||
if name == '_exec_plan' and not self.with_acero: | ||
return True | ||
if name == '_dataset_orc' and not ( | ||
self.with_orc and self.with_dataset | ||
): | ||
return True | ||
if name == '_dataset_parquet' and not ( | ||
self.with_parquet and self.with_dataset | ||
): | ||
return True | ||
if name == '_cuda' and not self.with_cuda: | ||
return True | ||
if name == 'gandiva' and not self.with_gandiva: | ||
return True | ||
return False | ||
if os.path.exists(built_path): | ||
self._found_names.append(name) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need to prepare There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's used in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm. It seems that the setuptools
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It does have
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, sorry. I checked wrong name... |
||
|
||
def _get_build_dir(self): | ||
# Get the package directory from build_py | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh... These patterns are wrong...
Could you use
^XXX/.*CMakeLists\.txt$|
for all existing patterns too? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Opened #41689 specifically for that