Skip to content

Commit

Permalink
apacheGH-41480: [Python] Building PyArrow: enable/disable python comp…
Browse files Browse the repository at this point in the history
…onents by default based on availability in Arrow C++
  • Loading branch information
jorisvandenbossche committed May 2, 2024
1 parent 14c54bb commit a7c33c5
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 127 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ repos:
?^go/.*/CMakeLists\.txt$|
?^java/.*/CMakeLists\.txt$|
?^matlab/.*/CMakeLists\.txt$|
?^python/CMakeLists\.txt$|
?^python/.*/CMakeLists\.txt$|
)
exclude: >-
Expand Down
1 change: 0 additions & 1 deletion ci/appveyor-cpp-build.bat
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ set PYARROW_WITH_ORC=%ARROW_ORC%
set PYARROW_WITH_PARQUET=ON
set PYARROW_WITH_PARQUET_ENCRYPTION=ON
set PYARROW_WITH_S3=%ARROW_S3%
set PYARROW_WITH_STATIC_BOOST=ON
set PYARROW_WITH_SUBSTRAIT=ON

set ARROW_HOME=%CONDA_PREFIX%\Library
Expand Down
81 changes: 52 additions & 29 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,25 +108,6 @@ if(UNIX)
endif()
endif()

# Top level cmake dir
if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF)
option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF)
option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF)
option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF)
option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF)
option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF)
option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF)
option(PYARROW_BUILD_PARQUET_ENCRYPTION
"Build the PyArrow Parquet encryption integration" OFF)
option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF)
option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF)
option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF)
set(PYARROW_CXXFLAGS
""
CACHE STRING "Compiler flags to append when compiling Arrow")
endif()

find_program(CCACHE_FOUND ccache)
if(CCACHE_FOUND
AND NOT CMAKE_C_COMPILER_LAUNCHER
Expand Down Expand Up @@ -265,11 +246,36 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}")

include(UseCython)

# PyArrow C++
# Arrow C++ and set default PyArrow build options
include(GNUInstallDirs)

find_package(Arrow REQUIRED)

# Top level cmake dir
if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" ${ARROW_ACERO})
option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" ${ARROW_CUDA})
option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" ${ARROW_DATASET})
option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" ${ARROW_FLIGHT})
option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" ${ARROW_GANDIVA})
option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" ${ARROW_ORC})
option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" ${ARROW_PARQUET})
option(PYARROW_BUILD_PARQUET_ENCRYPTION
"Build the PyArrow Parquet encryption integration" ${PARQUET_REQUIRE_ENCRYPTION})
option(PYARROW_BUILD_SUBSTRAIT "Build the PyArrow Substrait integration"
${ARROW_SUBSTRAIT})
option(PYARROW_BUILD_AZURE "Build the PyArrow Azure integration" ${ARROW_AZURE})
option(PYARROW_BUILD_GCS "Build the PyArrow GCS integration" ${ARROW_GCS})
option(PYARROW_BUILD_S3 "Build the PyArrow S3 integration" ${ARROW_S3})
option(PYARROW_BUILD_HDFS "Build the PyArrow HDFS integration" ${ARROW_HDFS})
option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF)
option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF)
option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF)
set(PYARROW_CXXFLAGS
""
CACHE STRING "Compiler flags to append when compiling Arrow")
endif()

# PyArrow C++
set(PYARROW_CPP_ROOT_DIR pyarrow/src)
set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python)
set(PYARROW_CPP_SRCS
Expand Down Expand Up @@ -305,6 +311,7 @@ set(PYARROW_CPP_LINK_LIBS "")

# Check all the options from Arrow and PyArrow C++ to be in line
if(PYARROW_BUILD_DATASET)
message(STATUS "Building PyArrow Dataset")
if(NOT ARROW_DATASET)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_DATASET=ON")
endif()
Expand All @@ -317,6 +324,7 @@ if(PYARROW_BUILD_DATASET)
endif()

if(PYARROW_BUILD_ACERO)
message(STATUS "Building PyArrow Acero")
if(NOT ARROW_ACERO)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ACERO=ON")
endif()
Expand All @@ -329,18 +337,13 @@ if(PYARROW_BUILD_ACERO)
endif()

if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION)
message(STATUS "Building PyArrow Parquet")
if(NOT ARROW_PARQUET)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_PARQUET=ON")
endif()
find_package(Parquet REQUIRED)
endif()

if(PYARROW_BUILD_HDFS)
if(NOT ARROW_HDFS)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON")
endif()
endif()

# Check for only Arrow C++ options
if(ARROW_CSV)
list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/csv.cc)
Expand Down Expand Up @@ -400,6 +403,7 @@ endif()

set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc)
if(PYARROW_BUILD_FLIGHT)
message(STATUS "Building PyArrow Flight")
if(NOT ARROW_FLIGHT)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_FLIGHT=ON")
endif()
Expand Down Expand Up @@ -555,23 +559,39 @@ set_source_files_properties(pyarrow/lib.pyx PROPERTIES CYTHON_API TRUE)
set(LINK_LIBS arrow_python)

if(PYARROW_BUILD_AZURE)
message(STATUS "Building PyArrow Azure")
if(NOT ARROW_AZURE)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_AZURE=ON")
endif()
list(APPEND CYTHON_EXTENSIONS _azurefs)
endif()

if(PYARROW_BUILD_GCS)
message(STATUS "Building PyArrow GCS")
if(NOT ARROW_GCS)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_GCS=ON")
endif()
list(APPEND CYTHON_EXTENSIONS _gcsfs)
endif()

if(PYARROW_BUILD_S3)
message(STATUS "Building PyArrow S3")
if(NOT ARROW_S3)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_S3=ON")
endif()
list(APPEND CYTHON_EXTENSIONS _s3fs)
endif()

if(PYARROW_BUILD_HDFS)
message(STATUS "Building PyArrow HDFS")
if(NOT ARROW_HDFS)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON")
endif()
list(APPEND CYTHON_EXTENSIONS _hdfs)
endif()

if(PYARROW_BUILD_CUDA)
# Arrow CUDA
message(STATUS "Building PyArrow CUDA")
if(NOT ARROW_CUDA)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_CUDA=ON")
endif()
Expand Down Expand Up @@ -646,8 +666,9 @@ if(PYARROW_BUILD_PARQUET)
endif()
endif()

# ORC
if(PYARROW_BUILD_ORC)
# ORC
message(STATUS "Building PyArrow ORC")
if(NOT ARROW_ORC)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ORC=ON")
endif()
Expand Down Expand Up @@ -679,6 +700,7 @@ endif()

# Substrait
if(PYARROW_BUILD_SUBSTRAIT)
message(STATUS "Building PyArrow Substrait")
if(NOT ARROW_SUBSTRAIT)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_SUBSTRAIT=ON")
endif()
Expand All @@ -696,6 +718,7 @@ endif()

# Gandiva
if(PYARROW_BUILD_GANDIVA)
message(STATUS "Building PyArrow Gandiva")
if(NOT ARROW_GANDIVA)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_GANDIVA=ON")
endif()
Expand Down
118 changes: 21 additions & 97 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,48 +152,13 @@ def initialize_options(self):
if not hasattr(sys, 'gettotalrefcount'):
self.build_type = 'release'

self.with_azure = strtobool(
os.environ.get('PYARROW_WITH_AZURE', '0'))
self.with_gcs = strtobool(
os.environ.get('PYARROW_WITH_GCS', '0'))
self.with_s3 = strtobool(
os.environ.get('PYARROW_WITH_S3', '0'))
self.with_hdfs = strtobool(
os.environ.get('PYARROW_WITH_HDFS', '0'))
self.with_cuda = strtobool(
os.environ.get('PYARROW_WITH_CUDA', '0'))
self.with_substrait = strtobool(
os.environ.get('PYARROW_WITH_SUBSTRAIT', '0'))
self.with_flight = strtobool(
os.environ.get('PYARROW_WITH_FLIGHT', '0'))
self.with_acero = strtobool(
os.environ.get('PYARROW_WITH_ACERO', '0'))
self.with_dataset = strtobool(
os.environ.get('PYARROW_WITH_DATASET', '0'))
self.with_parquet = strtobool(
os.environ.get('PYARROW_WITH_PARQUET', '0'))
self.with_parquet_encryption = strtobool(
os.environ.get('PYARROW_WITH_PARQUET_ENCRYPTION', '0'))
self.with_orc = strtobool(
os.environ.get('PYARROW_WITH_ORC', '0'))
self.with_gandiva = strtobool(
os.environ.get('PYARROW_WITH_GANDIVA', '0'))
self.generate_coverage = strtobool(
os.environ.get('PYARROW_GENERATE_COVERAGE', '0'))
self.bundle_arrow_cpp = strtobool(
os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0'))
self.bundle_cython_cpp = strtobool(
os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0'))

self.with_parquet_encryption = (self.with_parquet_encryption and
self.with_parquet)

# enforce module dependencies
if self.with_substrait:
self.with_dataset = True
if self.with_dataset:
self.with_acero = True

CYTHON_MODULE_NAMES = [
'lib',
'_fs',
Expand Down Expand Up @@ -270,23 +235,28 @@ def append_cmake_bool(value, varname):
cmake_options.append('-D{0}={1}'.format(
varname, 'on' if value else 'off'))

def maybe_append_cmake_bool(varname):
env_var = varname.replace("PYARROW_BUILD", "PYARROW_WITH")
if env_var in os.environ:
value = strtobool(os.environ.get(env_var))
append_cmake_bool(value, varname)

if self.cmake_generator:
cmake_options += ['-G', self.cmake_generator]

append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA')
append_cmake_bool(self.with_substrait, 'PYARROW_BUILD_SUBSTRAIT')
append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT')
append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA')
append_cmake_bool(self.with_acero, 'PYARROW_BUILD_ACERO')
append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET')
append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC')
append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET')
append_cmake_bool(self.with_parquet_encryption,
'PYARROW_BUILD_PARQUET_ENCRYPTION')
append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE')
append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS')
append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3')
append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS')
maybe_append_cmake_bool('PYARROW_BUILD_CUDA')
maybe_append_cmake_bool('PYARROW_BUILD_SUBSTRAIT')
maybe_append_cmake_bool('PYARROW_BUILD_FLIGHT')
maybe_append_cmake_bool('PYARROW_BUILD_GANDIVA')
maybe_append_cmake_bool('PYARROW_BUILD_ACERO')
maybe_append_cmake_bool('PYARROW_BUILD_DATASET')
maybe_append_cmake_bool('PYARROW_BUILD_ORC')
maybe_append_cmake_bool('PYARROW_BUILD_PARQUET')
maybe_append_cmake_bool('PYARROW_BUILD_PARQUET_ENCRYPTION')
maybe_append_cmake_bool('PYARROW_BUILD_AZURE')
maybe_append_cmake_bool('PYARROW_BUILD_GCS')
maybe_append_cmake_bool('PYARROW_BUILD_S3')
maybe_append_cmake_bool('PYARROW_BUILD_HDFS')
append_cmake_bool(self.bundle_arrow_cpp,
'PYARROW_BUNDLE_ARROW_CPP')
append_cmake_bool(self.bundle_cython_cpp,
Expand Down Expand Up @@ -329,54 +299,8 @@ def append_cmake_bool(value, varname):
self._found_names = []
for name in self.CYTHON_MODULE_NAMES:
built_path = pjoin(install_prefix, name + ext_suffix)
if not os.path.exists(built_path):
print(f'Did not find {built_path}')
if self._failure_permitted(name):
print(f'Cython module {name} failure permitted')
continue
raise RuntimeError('PyArrow C-extension failed to build:',
os.path.abspath(built_path))

self._found_names.append(name)

def _failure_permitted(self, name):
if name == '_parquet' and not self.with_parquet:
return True
if name == '_parquet_encryption' and not self.with_parquet_encryption:
return True
if name == '_orc' and not self.with_orc:
return True
if name == '_flight' and not self.with_flight:
return True
if name == '_substrait' and not self.with_substrait:
return True
if name == '_azurefs' and not self.with_azure:
return True
if name == '_gcsfs' and not self.with_gcs:
return True
if name == '_s3fs' and not self.with_s3:
return True
if name == '_hdfs' and not self.with_hdfs:
return True
if name == '_dataset' and not self.with_dataset:
return True
if name == '_acero' and not self.with_acero:
return True
if name == '_exec_plan' and not self.with_acero:
return True
if name == '_dataset_orc' and not (
self.with_orc and self.with_dataset
):
return True
if name == '_dataset_parquet' and not (
self.with_parquet and self.with_dataset
):
return True
if name == '_cuda' and not self.with_cuda:
return True
if name == 'gandiva' and not self.with_gandiva:
return True
return False
if os.path.exists(built_path):
self._found_names.append(name)

def _get_build_dir(self):
# Get the package directory from build_py
Expand Down

0 comments on commit a7c33c5

Please sign in to comment.