From a7c33c539f80ce82d0555fcd0c8595ad2077d5cd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 2 May 2024 11:44:29 +0200 Subject: [PATCH] GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ --- .pre-commit-config.yaml | 1 + ci/appveyor-cpp-build.bat | 1 - python/CMakeLists.txt | 81 ++++++++++++++++---------- python/setup.py | 118 +++++++------------------------------- 4 files changed, 74 insertions(+), 127 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bf5ca08d53c32..6bdd862e6be3a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -123,6 +123,7 @@ repos: ?^go/.*/CMakeLists\.txt$| ?^java/.*/CMakeLists\.txt$| ?^matlab/.*/CMakeLists\.txt$| + ?^python/CMakeLists\.txt$| ?^python/.*/CMakeLists\.txt$| ) exclude: >- diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 8cfa67c437264..f688fbb63a9ad 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -129,7 +129,6 @@ set PYARROW_WITH_ORC=%ARROW_ORC% set PYARROW_WITH_PARQUET=ON set PYARROW_WITH_PARQUET_ENCRYPTION=ON set PYARROW_WITH_S3=%ARROW_S3% -set PYARROW_WITH_STATIC_BOOST=ON set PYARROW_WITH_SUBSTRAIT=ON set ARROW_HOME=%CONDA_PREFIX%\Library diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 212862357ace2..38359eaf53064 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -108,25 +108,6 @@ if(UNIX) endif() endif() -# Top level cmake dir -if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") - option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF) - option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF) - option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF) - option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF) - option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF) - option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF) - option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) - option(PYARROW_BUILD_PARQUET_ENCRYPTION - "Build the PyArrow Parquet encryption integration" OFF) - option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) - option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) - option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) - set(PYARROW_CXXFLAGS - "" - CACHE STRING "Compiler flags to append when compiling Arrow") -endif() - find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND AND NOT CMAKE_C_COMPILER_LAUNCHER @@ -265,11 +246,36 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}") include(UseCython) -# PyArrow C++ +# Arrow C++ and set default PyArrow build options include(GNUInstallDirs) - find_package(Arrow REQUIRED) +# Top level cmake dir +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") + option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" ${ARROW_ACERO}) + option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" ${ARROW_CUDA}) + option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" ${ARROW_DATASET}) + option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" ${ARROW_FLIGHT}) + option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" ${ARROW_GANDIVA}) + option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" ${ARROW_ORC}) + option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" ${ARROW_PARQUET}) + option(PYARROW_BUILD_PARQUET_ENCRYPTION + "Build the PyArrow Parquet encryption integration" ${PARQUET_REQUIRE_ENCRYPTION}) + option(PYARROW_BUILD_SUBSTRAIT "Build the PyArrow Substrait integration" + ${ARROW_SUBSTRAIT}) + option(PYARROW_BUILD_AZURE "Build the PyArrow Azure integration" ${ARROW_AZURE}) + option(PYARROW_BUILD_GCS "Build the PyArrow GCS integration" ${ARROW_GCS}) + option(PYARROW_BUILD_S3 "Build the PyArrow S3 integration" ${ARROW_S3}) + option(PYARROW_BUILD_HDFS "Build the PyArrow HDFS integration" ${ARROW_HDFS}) + option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) + option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) + option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) + set(PYARROW_CXXFLAGS + "" + CACHE STRING "Compiler flags to append when compiling Arrow") +endif() + +# PyArrow C++ set(PYARROW_CPP_ROOT_DIR pyarrow/src) set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python) set(PYARROW_CPP_SRCS @@ -305,6 +311,7 @@ set(PYARROW_CPP_LINK_LIBS "") # Check all the options from Arrow and PyArrow C++ to be in line if(PYARROW_BUILD_DATASET) + message(STATUS "Building PyArrow Dataset") if(NOT ARROW_DATASET) message(FATAL_ERROR "You must build Arrow C++ with ARROW_DATASET=ON") endif() @@ -317,6 +324,7 @@ if(PYARROW_BUILD_DATASET) endif() if(PYARROW_BUILD_ACERO) + message(STATUS "Building PyArrow Acero") if(NOT ARROW_ACERO) message(FATAL_ERROR "You must build Arrow C++ with ARROW_ACERO=ON") endif() @@ -329,18 +337,13 @@ if(PYARROW_BUILD_ACERO) endif() if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION) + message(STATUS "Building PyArrow Parquet") if(NOT ARROW_PARQUET) message(FATAL_ERROR "You must build Arrow C++ with ARROW_PARQUET=ON") endif() find_package(Parquet REQUIRED) endif() -if(PYARROW_BUILD_HDFS) - if(NOT ARROW_HDFS) - message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") - endif() -endif() - # Check for only Arrow C++ options if(ARROW_CSV) list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/csv.cc) @@ -400,6 +403,7 @@ endif() set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc) if(PYARROW_BUILD_FLIGHT) + message(STATUS "Building PyArrow Flight") if(NOT ARROW_FLIGHT) message(FATAL_ERROR "You must build Arrow C++ with ARROW_FLIGHT=ON") endif() @@ -555,23 +559,39 @@ set_source_files_properties(pyarrow/lib.pyx PROPERTIES CYTHON_API TRUE) set(LINK_LIBS arrow_python) if(PYARROW_BUILD_AZURE) + message(STATUS "Building PyArrow Azure") + if(NOT ARROW_AZURE) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_AZURE=ON") + endif() list(APPEND CYTHON_EXTENSIONS _azurefs) endif() if(PYARROW_BUILD_GCS) + message(STATUS "Building PyArrow GCS") + if(NOT ARROW_GCS) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_GCS=ON") + endif() list(APPEND CYTHON_EXTENSIONS _gcsfs) endif() if(PYARROW_BUILD_S3) + message(STATUS "Building PyArrow S3") + if(NOT ARROW_S3) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_S3=ON") + endif() list(APPEND CYTHON_EXTENSIONS _s3fs) endif() if(PYARROW_BUILD_HDFS) + message(STATUS "Building PyArrow HDFS") + if(NOT ARROW_HDFS) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") + endif() list(APPEND CYTHON_EXTENSIONS _hdfs) endif() if(PYARROW_BUILD_CUDA) - # Arrow CUDA + message(STATUS "Building PyArrow CUDA") if(NOT ARROW_CUDA) message(FATAL_ERROR "You must build Arrow C++ with ARROW_CUDA=ON") endif() @@ -646,8 +666,9 @@ if(PYARROW_BUILD_PARQUET) endif() endif() +# ORC if(PYARROW_BUILD_ORC) - # ORC + message(STATUS "Building PyArrow ORC") if(NOT ARROW_ORC) message(FATAL_ERROR "You must build Arrow C++ with ARROW_ORC=ON") endif() @@ -679,6 +700,7 @@ endif() # Substrait if(PYARROW_BUILD_SUBSTRAIT) + message(STATUS "Building PyArrow Substrait") if(NOT ARROW_SUBSTRAIT) message(FATAL_ERROR "You must build Arrow C++ with ARROW_SUBSTRAIT=ON") endif() @@ -696,6 +718,7 @@ endif() # Gandiva if(PYARROW_BUILD_GANDIVA) + message(STATUS "Building PyArrow Gandiva") if(NOT ARROW_GANDIVA) message(FATAL_ERROR "You must build Arrow C++ with ARROW_GANDIVA=ON") endif() diff --git a/python/setup.py b/python/setup.py index 6f3dddb29d248..e63db9d5797cd 100755 --- a/python/setup.py +++ b/python/setup.py @@ -152,32 +152,6 @@ def initialize_options(self): if not hasattr(sys, 'gettotalrefcount'): self.build_type = 'release' - self.with_azure = strtobool( - os.environ.get('PYARROW_WITH_AZURE', '0')) - self.with_gcs = strtobool( - os.environ.get('PYARROW_WITH_GCS', '0')) - self.with_s3 = strtobool( - os.environ.get('PYARROW_WITH_S3', '0')) - self.with_hdfs = strtobool( - os.environ.get('PYARROW_WITH_HDFS', '0')) - self.with_cuda = strtobool( - os.environ.get('PYARROW_WITH_CUDA', '0')) - self.with_substrait = strtobool( - os.environ.get('PYARROW_WITH_SUBSTRAIT', '0')) - self.with_flight = strtobool( - os.environ.get('PYARROW_WITH_FLIGHT', '0')) - self.with_acero = strtobool( - os.environ.get('PYARROW_WITH_ACERO', '0')) - self.with_dataset = strtobool( - os.environ.get('PYARROW_WITH_DATASET', '0')) - self.with_parquet = strtobool( - os.environ.get('PYARROW_WITH_PARQUET', '0')) - self.with_parquet_encryption = strtobool( - os.environ.get('PYARROW_WITH_PARQUET_ENCRYPTION', '0')) - self.with_orc = strtobool( - os.environ.get('PYARROW_WITH_ORC', '0')) - self.with_gandiva = strtobool( - os.environ.get('PYARROW_WITH_GANDIVA', '0')) self.generate_coverage = strtobool( os.environ.get('PYARROW_GENERATE_COVERAGE', '0')) self.bundle_arrow_cpp = strtobool( @@ -185,15 +159,6 @@ def initialize_options(self): self.bundle_cython_cpp = strtobool( os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0')) - self.with_parquet_encryption = (self.with_parquet_encryption and - self.with_parquet) - - # enforce module dependencies - if self.with_substrait: - self.with_dataset = True - if self.with_dataset: - self.with_acero = True - CYTHON_MODULE_NAMES = [ 'lib', '_fs', @@ -270,23 +235,28 @@ def append_cmake_bool(value, varname): cmake_options.append('-D{0}={1}'.format( varname, 'on' if value else 'off')) + def maybe_append_cmake_bool(varname): + env_var = varname.replace("PYARROW_BUILD", "PYARROW_WITH") + if env_var in os.environ: + value = strtobool(os.environ.get(env_var)) + append_cmake_bool(value, varname) + if self.cmake_generator: cmake_options += ['-G', self.cmake_generator] - append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA') - append_cmake_bool(self.with_substrait, 'PYARROW_BUILD_SUBSTRAIT') - append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT') - append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA') - append_cmake_bool(self.with_acero, 'PYARROW_BUILD_ACERO') - append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET') - append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC') - append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET') - append_cmake_bool(self.with_parquet_encryption, - 'PYARROW_BUILD_PARQUET_ENCRYPTION') - append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE') - append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS') - append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3') - append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS') + maybe_append_cmake_bool('PYARROW_BUILD_CUDA') + maybe_append_cmake_bool('PYARROW_BUILD_SUBSTRAIT') + maybe_append_cmake_bool('PYARROW_BUILD_FLIGHT') + maybe_append_cmake_bool('PYARROW_BUILD_GANDIVA') + maybe_append_cmake_bool('PYARROW_BUILD_ACERO') + maybe_append_cmake_bool('PYARROW_BUILD_DATASET') + maybe_append_cmake_bool('PYARROW_BUILD_ORC') + maybe_append_cmake_bool('PYARROW_BUILD_PARQUET') + maybe_append_cmake_bool('PYARROW_BUILD_PARQUET_ENCRYPTION') + maybe_append_cmake_bool('PYARROW_BUILD_AZURE') + maybe_append_cmake_bool('PYARROW_BUILD_GCS') + maybe_append_cmake_bool('PYARROW_BUILD_S3') + maybe_append_cmake_bool('PYARROW_BUILD_HDFS') append_cmake_bool(self.bundle_arrow_cpp, 'PYARROW_BUNDLE_ARROW_CPP') append_cmake_bool(self.bundle_cython_cpp, @@ -329,54 +299,8 @@ def append_cmake_bool(value, varname): self._found_names = [] for name in self.CYTHON_MODULE_NAMES: built_path = pjoin(install_prefix, name + ext_suffix) - if not os.path.exists(built_path): - print(f'Did not find {built_path}') - if self._failure_permitted(name): - print(f'Cython module {name} failure permitted') - continue - raise RuntimeError('PyArrow C-extension failed to build:', - os.path.abspath(built_path)) - - self._found_names.append(name) - - def _failure_permitted(self, name): - if name == '_parquet' and not self.with_parquet: - return True - if name == '_parquet_encryption' and not self.with_parquet_encryption: - return True - if name == '_orc' and not self.with_orc: - return True - if name == '_flight' and not self.with_flight: - return True - if name == '_substrait' and not self.with_substrait: - return True - if name == '_azurefs' and not self.with_azure: - return True - if name == '_gcsfs' and not self.with_gcs: - return True - if name == '_s3fs' and not self.with_s3: - return True - if name == '_hdfs' and not self.with_hdfs: - return True - if name == '_dataset' and not self.with_dataset: - return True - if name == '_acero' and not self.with_acero: - return True - if name == '_exec_plan' and not self.with_acero: - return True - if name == '_dataset_orc' and not ( - self.with_orc and self.with_dataset - ): - return True - if name == '_dataset_parquet' and not ( - self.with_parquet and self.with_dataset - ): - return True - if name == '_cuda' and not self.with_cuda: - return True - if name == 'gandiva' and not self.with_gandiva: - return True - return False + if os.path.exists(built_path): + self._found_names.append(name) def _get_build_dir(self): # Get the package directory from build_py