From 172763133b0c9234dca72db27ce519cde1c01116 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 May 2024 16:42:21 -0300 Subject: [PATCH] GH-39301: [Archery][CI][Integration] Add nanoarrow to archery + integration setup (#39302) ### Rationale for this change The ability to add integration testing was added in nanoarrow; however, the infrastructure for running these tests currently lives in the arrow monorepo. ### What changes are included in this PR? - Added the relevant code to Archery such that these tests can be run - Added the relevant scripts/environment variables to CI such that these tests run in the integration CI job ### Are these changes tested? Yes, via the "Integration" CI job. ### Are there any user-facing changes? No. This PR still needs https://github.com/apache/arrow/pull/41264 for the integration tests to pass. * Closes: #39301 * GitHub Issue: #39301 Lead-authored-by: Dewey Dunnington Co-authored-by: Dewey Dunnington Signed-off-by: Dewey Dunnington --- .github/workflows/integration.yml | 6 + ci/scripts/integration_arrow_build.sh | 2 + ci/scripts/nanoarrow_build.sh | 52 ++++++ dev/archery/archery/cli.py | 5 +- dev/archery/archery/integration/datagen.py | 3 + dev/archery/archery/integration/runner.py | 8 +- .../archery/integration/tester_nanoarrow.py | 148 ++++++++++++++++++ docker-compose.yml | 2 + 8 files changed, 223 insertions(+), 3 deletions(-) create mode 100755 ci/scripts/nanoarrow_build.sh create mode 100644 dev/archery/archery/integration/tester_nanoarrow.py diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 6e09ad61480a6..f53f4aeb505d2 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -75,6 +75,11 @@ jobs: with: repository: apache/arrow-rs path: rust + - name: Checkout Arrow nanoarrow + uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + with: + repository: apache/arrow-nanoarrow + path: nanoarrow - name: Free up disk space run: | 
ci/scripts/util_free_space.sh @@ -97,6 +102,7 @@ jobs: run: > archery docker run \ -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \ + -e ARCHERY_INTEGRATION_WITH_NANOARROW=1 \ -e ARCHERY_INTEGRATION_WITH_RUST=1 \ conda-integration - name: Docker Push diff --git a/ci/scripts/integration_arrow_build.sh b/ci/scripts/integration_arrow_build.sh index e5c31527aedff..9b54049a2b803 100755 --- a/ci/scripts/integration_arrow_build.sh +++ b/ci/scripts/integration_arrow_build.sh @@ -30,6 +30,8 @@ build_dir=${2} ${arrow_dir}/ci/scripts/rust_build.sh ${arrow_dir} ${build_dir} +${arrow_dir}/ci/scripts/nanoarrow_build.sh ${arrow_dir} ${build_dir} + if [ "${ARROW_INTEGRATION_CPP}" == "ON" ]; then ${arrow_dir}/ci/scripts/cpp_build.sh ${arrow_dir} ${build_dir} fi diff --git a/ci/scripts/nanoarrow_build.sh b/ci/scripts/nanoarrow_build.sh new file mode 100755 index 0000000000000..1612b9a2d0102 --- /dev/null +++ b/ci/scripts/nanoarrow_build.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +arrow_dir=${1} +source_dir=${1}/nanoarrow +build_dir=${2}/nanoarrow + +# This file is used to build the nanoarrow binaries needed for the archery +# integration tests. 
Testing of the nanoarrow implementation in normal CI is handled +# by github workflows in the arrow-nanoarrow repository. + +if [ "${ARCHERY_INTEGRATION_WITH_NANOARROW}" -eq "0" ]; then + echo "=====================================================================" + echo "Not building nanoarrow" + echo "=====================================================================" + exit 0; +elif [ ! -d "${source_dir}" ]; then + echo "=====================================================================" + echo "The nanoarrow source is missing. Please clone the arrow-nanoarrow repository" + echo "to arrow/nanoarrow before running the integration tests:" + echo " git clone https://github.com/apache/arrow-nanoarrow.git path/to/arrow/nanoarrow" + echo "=====================================================================" + exit 1; +fi + +set -x + +mkdir -p ${build_dir} +pushd ${build_dir} + +cmake ${source_dir} -DNANOARROW_BUILD_INTEGRATION_TESTS=ON +cmake --build . + +popd diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 8a26d9266f22d..cd746f9c4499a 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -738,6 +738,9 @@ def _set_default(opt, default): help='Include JavaScript in integration tests') @click.option('--with-go', type=bool, default=False, help='Include Go in integration tests') +@click.option('--with-nanoarrow', type=bool, default=False, + help='Include nanoarrow in integration tests', + envvar="ARCHERY_INTEGRATION_WITH_NANOARROW") @click.option('--with-rust', type=bool, default=False, help='Include Rust in integration tests', envvar="ARCHERY_INTEGRATION_WITH_RUST") @@ -776,7 +779,7 @@ def integration(with_all=False, random_seed=12345, **args): gen_path = args['write_generated_json'] - languages = ['cpp', 'csharp', 'java', 'js', 'go', 'rust'] + languages = ['cpp', 'csharp', 'java', 'js', 'go', 'nanoarrow', 'rust'] formats = ['ipc', 'flight', 'c_data'] enabled_languages = 0 diff --git 
a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 5cae907a4aa71..f6302165cd5a0 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1928,17 +1928,20 @@ def _temp_path(): .skip_tester('C#') .skip_tester('Java') .skip_tester('JS') + .skip_tester('nanoarrow') .skip_tester('Rust'), generate_binary_view_case() .skip_tester('Java') .skip_tester('JS') + .skip_tester('nanoarrow') .skip_tester('Rust'), generate_list_view_case() .skip_tester('C#') # Doesn't support large list views .skip_tester('Java') .skip_tester('JS') + .skip_tester('nanoarrow') .skip_tester('Rust'), generate_extension_case() diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index 5b66842b25926..0ea244720cc1d 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -36,6 +36,7 @@ from .tester_java import JavaTester from .tester_js import JSTester from .tester_csharp import CSharpTester +from .tester_nanoarrow import NanoarrowTester from .util import guid, printer from .util import SKIP_C_ARRAY, SKIP_C_SCHEMA, SKIP_FLIGHT, SKIP_IPC from ..utils.source import ARROW_ROOT_DEFAULT @@ -541,8 +542,8 @@ def get_static_json_files(): def run_all_tests(with_cpp=True, with_java=True, with_js=True, with_csharp=True, with_go=True, with_rust=False, - run_ipc=False, run_flight=False, run_c_data=False, - tempdir=None, **kwargs): + with_nanoarrow=False, run_ipc=False, run_flight=False, + run_c_data=False, tempdir=None, **kwargs): tempdir = tempdir or tempfile.mkdtemp(prefix='arrow-integration-') testers: List[Tester] = [] @@ -562,6 +563,9 @@ def run_all_tests(with_cpp=True, with_java=True, with_js=True, if with_go: testers.append(GoTester(**kwargs)) + if with_nanoarrow: + testers.append(NanoarrowTester(**kwargs)) + if with_rust: testers.append(RustTester(**kwargs)) diff --git 
a/dev/archery/archery/integration/tester_nanoarrow.py b/dev/archery/archery/integration/tester_nanoarrow.py new file mode 100644 index 0000000000000..30ff1bb6e50a7 --- /dev/null +++ b/dev/archery/archery/integration/tester_nanoarrow.py @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import functools +import os + +from . 
import cdata +from .tester import Tester, CDataExporter, CDataImporter +from ..utils.source import ARROW_ROOT_DEFAULT + + +_NANOARROW_PATH = os.environ.get( + "ARROW_NANOARROW_PATH", + os.path.join(ARROW_ROOT_DEFAULT, "nanoarrow/cdata"), +) + +_INTEGRATION_DLL = os.path.join( + _NANOARROW_PATH, "libnanoarrow_c_data_integration" + cdata.dll_suffix +) + + +class NanoarrowTester(Tester): + PRODUCER = False + CONSUMER = False + FLIGHT_SERVER = False + FLIGHT_CLIENT = False + C_DATA_SCHEMA_EXPORTER = True + C_DATA_ARRAY_EXPORTER = True + C_DATA_SCHEMA_IMPORTER = True + C_DATA_ARRAY_IMPORTER = True + + name = "nanoarrow" + + def validate(self, json_path, arrow_path, quirks=None): + raise NotImplementedError() + + def json_to_file(self, json_path, arrow_path): + raise NotImplementedError() + + def stream_to_file(self, stream_path, file_path): + raise NotImplementedError() + + def file_to_stream(self, file_path, stream_path): + raise NotImplementedError() + + def make_c_data_exporter(self): + return NanoarrowCDataExporter(self.debug, self.args) + + def make_c_data_importer(self): + return NanoarrowCDataImporter(self.debug, self.args) + + +_nanoarrow_c_data_entrypoints = """ + const char* nanoarrow_CDataIntegration_ExportSchemaFromJson( + const char* json_path, struct ArrowSchema* out); + + const char* nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson( + const char* json_path, struct ArrowSchema* schema); + + const char* nanoarrow_CDataIntegration_ExportBatchFromJson( + const char* json_path, int num_batch, struct ArrowArray* out); + + const char* nanoarrow_CDataIntegration_ImportBatchAndCompareToJson( + const char* json_path, int num_batch, struct ArrowArray* batch); + + int64_t nanoarrow_BytesAllocated(void); + """ + + +@functools.lru_cache +def _load_ffi(ffi, lib_path=_INTEGRATION_DLL): + ffi.cdef(_nanoarrow_c_data_entrypoints) + dll = ffi.dlopen(lib_path) + return dll + + +class _CDataBase: + def __init__(self, debug, args): + self.debug = debug + self.args = args 
+ self.ffi = cdata.ffi() + self.dll = _load_ffi(self.ffi) + + def _check_nanoarrow_error(self, na_error): + """ + Check a `const char*` error return from an integration entrypoint. + + A null means success, a non-empty string is an error message. + The string is statically allocated on the nanoarrow side and does not + need to be released. + """ + assert self.ffi.typeof(na_error) is self.ffi.typeof("const char*") + if na_error != self.ffi.NULL: + error = self.ffi.string(na_error).decode("utf8", errors="replace") + raise RuntimeError(f"nanoarrow C Data Integration call failed: {error}") + + +class NanoarrowCDataExporter(CDataExporter, _CDataBase): + def export_schema_from_json(self, json_path, c_schema_ptr): + na_error = self.dll.nanoarrow_CDataIntegration_ExportSchemaFromJson( + str(json_path).encode(), c_schema_ptr + ) + self._check_nanoarrow_error(na_error) + + def export_batch_from_json(self, json_path, num_batch, c_array_ptr): + na_error = self.dll.nanoarrow_CDataIntegration_ExportBatchFromJson( + str(json_path).encode(), num_batch, c_array_ptr + ) + self._check_nanoarrow_error(na_error) + + @property + def supports_releasing_memory(self): + return True + + def record_allocation_state(self): + return self.dll.nanoarrow_BytesAllocated() + + +class NanoarrowCDataImporter(CDataImporter, _CDataBase): + def import_schema_and_compare_to_json(self, json_path, c_schema_ptr): + na_error = self.dll.nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson( + str(json_path).encode(), c_schema_ptr + ) + self._check_nanoarrow_error(na_error) + + def import_batch_and_compare_to_json(self, json_path, num_batch, c_array_ptr): + na_error = self.dll.nanoarrow_CDataIntegration_ImportBatchAndCompareToJson( + str(json_path).encode(), num_batch, c_array_ptr + ) + self._check_nanoarrow_error(na_error) + + @property + def supports_releasing_memory(self): + return True diff --git a/docker-compose.yml b/docker-compose.yml index 9bedb59a77be8..7a4d455dfe723 100644 --- a/docker-compose.yml 
+++ b/docker-compose.yml @@ -1751,9 +1751,11 @@ services: volumes: *conda-volumes environment: <<: [*common, *ccache] + ARCHERY_INTEGRATION_WITH_NANOARROW: 0 ARCHERY_INTEGRATION_WITH_RUST: 0 # Tell Archery where Arrow binaries are located ARROW_CPP_EXE_PATH: /build/cpp/debug + ARROW_NANOARROW_PATH: /build/nanoarrow ARROW_RUST_EXE_PATH: /build/rust/debug command: ["/arrow/ci/scripts/integration_arrow_build.sh /arrow /build &&