From 8d068f1fdde77d0ee0e0b2f91b44cc903289c234 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 16 May 2024 14:15:57 +0200 Subject: [PATCH] GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ (#41494) ### Rationale for this change Currently, when building pyarrow from source, one needs to manually enable the optional components through setting `PYARROW_WITH_...` environment variables. However, we could also make a default choice of components based on which ones where enabled in the Arrow C++ build. ### What changes are included in this PR? Set defaults for the various `PYARROW_BUILD_` based on the `ARROW_` setting. Keep the current `PYARROW_WITH_` environment variables working to allow to override this default. ### Are there any user-facing changes? No * GitHub Issue: #41480 Lead-authored-by: Joris Van den Bossche Co-authored-by: Sutou Kouhei Signed-off-by: Joris Van den Bossche --- ci/appveyor-cpp-build.bat | 1 - python/CMakeLists.txt | 115 +++++++++++++++++++++++--------- python/setup.py | 134 +++++++++++--------------------------- 3 files changed, 123 insertions(+), 127 deletions(-) diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 8cfa67c437264..f688fbb63a9ad 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -129,7 +129,6 @@ set PYARROW_WITH_ORC=%ARROW_ORC% set PYARROW_WITH_PARQUET=ON set PYARROW_WITH_PARQUET_ENCRYPTION=ON set PYARROW_WITH_S3=%ARROW_S3% -set PYARROW_WITH_STATIC_BOOST=ON set PYARROW_WITH_SUBSTRAIT=ON set ARROW_HOME=%CONDA_PREFIX%\Library diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 212862357ace2..07acb9e31a731 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -108,25 +108,6 @@ if(UNIX) endif() endif() -# Top level cmake dir -if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") - option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF) - option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF) - option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF) - option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF) - option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF) - option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF) - option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) - option(PYARROW_BUILD_PARQUET_ENCRYPTION - "Build the PyArrow Parquet encryption integration" OFF) - option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) - option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) - option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) - set(PYARROW_CXXFLAGS - "" - CACHE STRING "Compiler flags to append when compiling Arrow") -endif() - find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND AND NOT CMAKE_C_COMPILER_LAUNCHER @@ -265,11 +246,70 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}") include(UseCython) -# PyArrow C++ +# Arrow C++ and set default PyArrow build options include(GNUInstallDirs) - find_package(Arrow REQUIRED) +macro(define_option name description arrow_option) + set("PYARROW_${name}" + "AUTO" + CACHE STRING ${description}) + + if("${PYARROW_${name}}" STREQUAL "AUTO") + # by default, first check if env variable exists, otherwise use Arrow C++ config + set(env_variable "PYARROW_WITH_${name}") + if(DEFINED ENV{${env_variable}}) + if($ENV{${env_variable}}) + set("PYARROW_BUILD_${name}" ON) + else() + set("PYARROW_BUILD_${name}" OFF) + endif() + else() + if(${arrow_option}) + set("PYARROW_BUILD_${name}" ON) + else() + set("PYARROW_BUILD_${name}" OFF) + endif() + endif() + else() + if("${PYARROW_${name}}") + set("PYARROW_BUILD_${name}" ON) + else() + set("PYARROW_BUILD_${name}" OFF) + endif() + endif() +endmacro() + +define_option(ACERO "Build the PyArrow Acero integration" ARROW_ACERO) +define_option(CUDA "Build the PyArrow CUDA support" ARROW_CUDA) +define_option(DATASET "Build the PyArrow Dataset integration" ARROW_DATASET) +define_option(FLIGHT "Build the PyArrow Flight integration" ARROW_FLIGHT) +define_option(GANDIVA "Build the PyArrow Gandiva integration" ARROW_GANDIVA) +define_option(ORC "Build the PyArrow ORC integration" ARROW_ORC) +define_option(PARQUET "Build the PyArrow Parquet integration" ARROW_PARQUET) +define_option(PARQUET_ENCRYPTION "Build the PyArrow Parquet encryption integration" + PARQUET_REQUIRE_ENCRYPTION) +define_option(SUBSTRAIT "Build the PyArrow Substrait integration" ARROW_SUBSTRAIT) +define_option(AZURE "Build the PyArrow Azure integration" ARROW_AZURE) +define_option(GCS "Build the PyArrow GCS integration" ARROW_GCS) +define_option(S3 "Build the PyArrow S3 integration" ARROW_S3) +define_option(HDFS "Build the PyArrow HDFS integration" ARROW_HDFS) +option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) +option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) +option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) +set(PYARROW_CXXFLAGS + "" + CACHE STRING "Compiler flags to append when compiling PyArrow C++") + +# enforce module dependencies +if(PYARROW_BUILD_SUBSTRAIT) + set(PYARROW_BUILD_DATASET ON) +endif() +if(PYARROW_BUILD_DATASET) + set(PYARROW_BUILD_ACERO ON) +endif() + +# PyArrow C++ set(PYARROW_CPP_ROOT_DIR pyarrow/src) set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python) set(PYARROW_CPP_SRCS @@ -305,6 +345,7 @@ set(PYARROW_CPP_LINK_LIBS "") # Check all the options from Arrow and PyArrow C++ to be in line if(PYARROW_BUILD_DATASET) + message(STATUS "Building PyArrow with Dataset") if(NOT ARROW_DATASET) message(FATAL_ERROR "You must build Arrow C++ with ARROW_DATASET=ON") endif() @@ -317,6 +358,7 @@ if(PYARROW_BUILD_DATASET) endif() if(PYARROW_BUILD_ACERO) + message(STATUS "Building PyArrow with Acero") if(NOT ARROW_ACERO) message(FATAL_ERROR "You must build Arrow C++ with ARROW_ACERO=ON") endif() @@ -329,18 +371,13 @@ if(PYARROW_BUILD_ACERO) endif() if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION) + message(STATUS "Building PyArrow with Parquet") if(NOT ARROW_PARQUET) message(FATAL_ERROR "You must build Arrow C++ with ARROW_PARQUET=ON") endif() find_package(Parquet REQUIRED) endif() -if(PYARROW_BUILD_HDFS) - if(NOT ARROW_HDFS) - message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") - endif() -endif() - # Check for only Arrow C++ options if(ARROW_CSV) list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/csv.cc) @@ -400,6 +437,7 @@ endif() set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc) if(PYARROW_BUILD_FLIGHT) + message(STATUS "Building PyArrow with Flight") if(NOT ARROW_FLIGHT) message(FATAL_ERROR "You must build Arrow C++ with ARROW_FLIGHT=ON") endif() @@ -555,23 +593,39 @@ set_source_files_properties(pyarrow/lib.pyx PROPERTIES CYTHON_API TRUE) set(LINK_LIBS arrow_python) if(PYARROW_BUILD_AZURE) + message(STATUS "Building PyArrow with Azure") + if(NOT ARROW_AZURE) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_AZURE=ON") + endif() list(APPEND CYTHON_EXTENSIONS _azurefs) endif() if(PYARROW_BUILD_GCS) + message(STATUS "Building PyArrow with GCS") + if(NOT ARROW_GCS) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_GCS=ON") + endif() list(APPEND CYTHON_EXTENSIONS _gcsfs) endif() if(PYARROW_BUILD_S3) + message(STATUS "Building PyArrow with S3") + if(NOT ARROW_S3) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_S3=ON") + endif() list(APPEND CYTHON_EXTENSIONS _s3fs) endif() if(PYARROW_BUILD_HDFS) + message(STATUS "Building PyArrow with HDFS") + if(NOT ARROW_HDFS) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") + endif() list(APPEND CYTHON_EXTENSIONS _hdfs) endif() if(PYARROW_BUILD_CUDA) - # Arrow CUDA + message(STATUS "Building PyArrow with CUDA") if(NOT ARROW_CUDA) message(FATAL_ERROR "You must build Arrow C++ with ARROW_CUDA=ON") endif() @@ -646,8 +700,9 @@ if(PYARROW_BUILD_PARQUET) endif() endif() +# ORC if(PYARROW_BUILD_ORC) - # ORC + message(STATUS "Building PyArrow with ORC") if(NOT ARROW_ORC) message(FATAL_ERROR "You must build Arrow C++ with ARROW_ORC=ON") endif() @@ -679,6 +734,7 @@ endif() # Substrait if(PYARROW_BUILD_SUBSTRAIT) + message(STATUS "Building PyArrow with Substrait") if(NOT ARROW_SUBSTRAIT) message(FATAL_ERROR "You must build Arrow C++ with ARROW_SUBSTRAIT=ON") endif() @@ -696,6 +752,7 @@ endif() # Gandiva if(PYARROW_BUILD_GANDIVA) + message(STATUS "Building PyArrow with Gandiva") if(NOT ARROW_GANDIVA) message(FATAL_ERROR "You must build Arrow C++ with ARROW_GANDIVA=ON") endif() diff --git a/python/setup.py b/python/setup.py index 6f3dddb29d248..ed2b7961e5fbb 100755 --- a/python/setup.py +++ b/python/setup.py @@ -152,32 +152,20 @@ def initialize_options(self): if not hasattr(sys, 'gettotalrefcount'): self.build_type = 'release' - self.with_azure = strtobool( - os.environ.get('PYARROW_WITH_AZURE', '0')) - self.with_gcs = strtobool( - os.environ.get('PYARROW_WITH_GCS', '0')) - self.with_s3 = strtobool( - os.environ.get('PYARROW_WITH_S3', '0')) - self.with_hdfs = strtobool( - os.environ.get('PYARROW_WITH_HDFS', '0')) - self.with_cuda = strtobool( - os.environ.get('PYARROW_WITH_CUDA', '0')) - self.with_substrait = strtobool( - os.environ.get('PYARROW_WITH_SUBSTRAIT', '0')) - self.with_flight = strtobool( - os.environ.get('PYARROW_WITH_FLIGHT', '0')) - self.with_acero = strtobool( - os.environ.get('PYARROW_WITH_ACERO', '0')) - self.with_dataset = strtobool( - os.environ.get('PYARROW_WITH_DATASET', '0')) - self.with_parquet = strtobool( - os.environ.get('PYARROW_WITH_PARQUET', '0')) - self.with_parquet_encryption = strtobool( - os.environ.get('PYARROW_WITH_PARQUET_ENCRYPTION', '0')) - self.with_orc = strtobool( - os.environ.get('PYARROW_WITH_ORC', '0')) - self.with_gandiva = strtobool( - os.environ.get('PYARROW_WITH_GANDIVA', '0')) + self.with_azure = None + self.with_gcs = None + self.with_s3 = None + self.with_hdfs = None + self.with_cuda = None + self.with_substrait = None + self.with_flight = None + self.with_acero = None + self.with_dataset = None + self.with_parquet = None + self.with_parquet_encryption = None + self.with_orc = None + self.with_gandiva = None + self.generate_coverage = strtobool( os.environ.get('PYARROW_GENERATE_COVERAGE', '0')) self.bundle_arrow_cpp = strtobool( @@ -185,15 +173,6 @@ def initialize_options(self): self.bundle_cython_cpp = strtobool( os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0')) - self.with_parquet_encryption = (self.with_parquet_encryption and - self.with_parquet) - - # enforce module dependencies - if self.with_substrait: - self.with_dataset = True - if self.with_dataset: - self.with_acero = True - CYTHON_MODULE_NAMES = [ 'lib', '_fs', @@ -270,23 +249,30 @@ def append_cmake_bool(value, varname): cmake_options.append('-D{0}={1}'.format( varname, 'on' if value else 'off')) + def append_cmake_component(flag, varname): + # only pass this to cmake is the user pass the --with-component + # flag to setup.py build_ext + if flag is not None: + append_cmake_bool(flag, varname) + if self.cmake_generator: cmake_options += ['-G', self.cmake_generator] - append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA') - append_cmake_bool(self.with_substrait, 'PYARROW_BUILD_SUBSTRAIT') - append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT') - append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA') - append_cmake_bool(self.with_acero, 'PYARROW_BUILD_ACERO') - append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET') - append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC') - append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET') - append_cmake_bool(self.with_parquet_encryption, - 'PYARROW_BUILD_PARQUET_ENCRYPTION') - append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE') - append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS') - append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3') - append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS') + append_cmake_component(self.with_cuda, 'PYARROW_CUDA') + append_cmake_component(self.with_substrait, 'PYARROW_SUBSTRAIT') + append_cmake_component(self.with_flight, 'PYARROW_FLIGHT') + append_cmake_component(self.with_gandiva, 'PYARROW_GANDIVA') + append_cmake_component(self.with_acero, 'PYARROW_ACERO') + append_cmake_component(self.with_dataset, 'PYARROW_DATASET') + append_cmake_component(self.with_orc, 'PYARROW_ORC') + append_cmake_component(self.with_parquet, 'PYARROW_PARQUET') + append_cmake_component(self.with_parquet_encryption, + 'PYARROW_PARQUET_ENCRYPTION') + append_cmake_component(self.with_azure, 'PYARROW_AZURE') + append_cmake_component(self.with_gcs, 'PYARROW_GCS') + append_cmake_component(self.with_s3, 'PYARROW_S3') + append_cmake_component(self.with_hdfs, 'PYARROW_HDFS') + append_cmake_bool(self.bundle_arrow_cpp, 'PYARROW_BUNDLE_ARROW_CPP') append_cmake_bool(self.bundle_cython_cpp, @@ -329,54 +315,8 @@ def append_cmake_bool(value, varname): self._found_names = [] for name in self.CYTHON_MODULE_NAMES: built_path = pjoin(install_prefix, name + ext_suffix) - if not os.path.exists(built_path): - print(f'Did not find {built_path}') - if self._failure_permitted(name): - print(f'Cython module {name} failure permitted') - continue - raise RuntimeError('PyArrow C-extension failed to build:', - os.path.abspath(built_path)) - - self._found_names.append(name) - - def _failure_permitted(self, name): - if name == '_parquet' and not self.with_parquet: - return True - if name == '_parquet_encryption' and not self.with_parquet_encryption: - return True - if name == '_orc' and not self.with_orc: - return True - if name == '_flight' and not self.with_flight: - return True - if name == '_substrait' and not self.with_substrait: - return True - if name == '_azurefs' and not self.with_azure: - return True - if name == '_gcsfs' and not self.with_gcs: - return True - if name == '_s3fs' and not self.with_s3: - return True - if name == '_hdfs' and not self.with_hdfs: - return True - if name == '_dataset' and not self.with_dataset: - return True - if name == '_acero' and not self.with_acero: - return True - if name == '_exec_plan' and not self.with_acero: - return True - if name == '_dataset_orc' and not ( - self.with_orc and self.with_dataset - ): - return True - if name == '_dataset_parquet' and not ( - self.with_parquet and self.with_dataset - ): - return True - if name == '_cuda' and not self.with_cuda: - return True - if name == 'gandiva' and not self.with_gandiva: - return True - return False + if os.path.exists(built_path): + self._found_names.append(name) def _get_build_dir(self): # Get the package directory from build_py