From 172763133b0c9234dca72db27ce519cde1c01116 Mon Sep 17 00:00:00 2001
From: Dewey Dunnington <dewey@fishandwhistle.net>
Date: Fri, 10 May 2024 16:42:21 -0300
Subject: [PATCH] GH-39301: [Archery][CI][Integration] Add nanoarrow to archery
 + integration setup (#39302)

### Rationale for this change

The ability to run integration tests was added to nanoarrow; however, the infrastructure for running these tests currently lives in the arrow monorepo.

### What changes are included in this PR?

- Added the relevant code to Archery such that these tests can be run
- Added the relevant scripts/environment variables to CI such that these tests run in the integration CI job

### Are these changes tested?

Yes, via the "Integration" CI job.

### Are there any user-facing changes?

No.

This PR still needs https://github.com/apache/arrow/pull/41264 for the integration tests to pass.

* Closes: #39301
* GitHub Issue: #39301

Lead-authored-by: Dewey Dunnington <dewey@fishandwhistle.net>
Co-authored-by: Dewey Dunnington <dewey@voltrondata.com>
Signed-off-by: Dewey Dunnington <dewey@fishandwhistle.net>
---
 .github/workflows/integration.yml             |   6 +
 ci/scripts/integration_arrow_build.sh         |   2 +
 ci/scripts/nanoarrow_build.sh                 |  52 ++++++
 dev/archery/archery/cli.py                    |   5 +-
 dev/archery/archery/integration/datagen.py    |   3 +
 dev/archery/archery/integration/runner.py     |   8 +-
 .../archery/integration/tester_nanoarrow.py   | 148 ++++++++++++++++++
 docker-compose.yml                            |   2 +
 8 files changed, 223 insertions(+), 3 deletions(-)
 create mode 100755 ci/scripts/nanoarrow_build.sh
 create mode 100644 dev/archery/archery/integration/tester_nanoarrow.py

diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index 6e09ad61480a6..f53f4aeb505d2 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -75,6 +75,11 @@ jobs:
         with:
           repository: apache/arrow-rs
           path: rust
+      - name: Checkout Arrow nanoarrow
+        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0
+        with:
+          repository: apache/arrow-nanoarrow
+          path: nanoarrow
       - name: Free up disk space
         run: |
           ci/scripts/util_free_space.sh
@@ -97,6 +102,7 @@ jobs:
         run: >
           archery docker run \
             -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \
+            -e ARCHERY_INTEGRATION_WITH_NANOARROW=1 \
             -e ARCHERY_INTEGRATION_WITH_RUST=1 \
             conda-integration
       - name: Docker Push
diff --git a/ci/scripts/integration_arrow_build.sh b/ci/scripts/integration_arrow_build.sh
index e5c31527aedff..9b54049a2b803 100755
--- a/ci/scripts/integration_arrow_build.sh
+++ b/ci/scripts/integration_arrow_build.sh
@@ -30,6 +30,8 @@ build_dir=${2}
 
 ${arrow_dir}/ci/scripts/rust_build.sh ${arrow_dir} ${build_dir}
 
+${arrow_dir}/ci/scripts/nanoarrow_build.sh ${arrow_dir} ${build_dir}
+
 if [ "${ARROW_INTEGRATION_CPP}" == "ON" ]; then
     ${arrow_dir}/ci/scripts/cpp_build.sh ${arrow_dir} ${build_dir}
 fi
diff --git a/ci/scripts/nanoarrow_build.sh b/ci/scripts/nanoarrow_build.sh
new file mode 100755
index 0000000000000..1612b9a2d0102
--- /dev/null
+++ b/ci/scripts/nanoarrow_build.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e  # abort as soon as any command fails
+
+arrow_dir="${1}"  # first argument: root of the arrow checkout
+source_dir="${1}/nanoarrow"  # where the arrow-nanoarrow sources must be cloned
+build_dir="${2}/nanoarrow"  # second argument: parent of the build directory
+
+# This file is used to build the nanoarrow binaries needed for the archery
+# integration tests. Testing of the nanoarrow implementation in normal CI is handled
+# by GitHub workflows in the arrow-nanoarrow repository.
+
+if [ "${ARCHERY_INTEGRATION_WITH_NANOARROW:-0}" -eq "0" ]; then  # unset counts as 0
+  echo "====================================================================="
+  echo "Not building nanoarrow"
+  echo "====================================================================="
+  exit 0;
+elif [ ! -d "${source_dir}" ]; then
+  echo "====================================================================="
+  echo "The nanoarrow source is missing. Please clone the arrow-nanoarrow repository"
+  echo "to arrow/nanoarrow before running the integration tests:"
+  echo "  git clone https://github.com/apache/arrow-nanoarrow.git path/to/arrow/nanoarrow"
+  echo "====================================================================="
+  exit 1;
+fi
+
+set -x  # echo the build commands from here on
+
+mkdir -p "${build_dir}"  # quoted so paths containing spaces survive
+pushd "${build_dir}"
+
+cmake "${source_dir}" -DNANOARROW_BUILD_INTEGRATION_TESTS=ON
+cmake --build .
+
+popd
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 8a26d9266f22d..cd746f9c4499a 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -738,6 +738,9 @@ def _set_default(opt, default):
               help='Include JavaScript in integration tests')
 @click.option('--with-go', type=bool, default=False,
               help='Include Go in integration tests')
+@click.option('--with-nanoarrow', type=bool, default=False,
+              help='Include nanoarrow in integration tests',
+              envvar="ARCHERY_INTEGRATION_WITH_NANOARROW")
 @click.option('--with-rust', type=bool, default=False,
               help='Include Rust in integration tests',
               envvar="ARCHERY_INTEGRATION_WITH_RUST")
@@ -776,7 +779,7 @@ def integration(with_all=False, random_seed=12345, **args):
 
     gen_path = args['write_generated_json']
 
-    languages = ['cpp', 'csharp', 'java', 'js', 'go', 'rust']
+    languages = ['cpp', 'csharp', 'java', 'js', 'go', 'nanoarrow', 'rust']
     formats = ['ipc', 'flight', 'c_data']
 
     enabled_languages = 0
diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py
index 5cae907a4aa71..f6302165cd5a0 100644
--- a/dev/archery/archery/integration/datagen.py
+++ b/dev/archery/archery/integration/datagen.py
@@ -1928,17 +1928,20 @@ def _temp_path():
         .skip_tester('C#')
         .skip_tester('Java')
         .skip_tester('JS')
+        .skip_tester('nanoarrow')
         .skip_tester('Rust'),
 
         generate_binary_view_case()
         .skip_tester('Java')
         .skip_tester('JS')
+        .skip_tester('nanoarrow')
         .skip_tester('Rust'),
 
         generate_list_view_case()
         .skip_tester('C#')     # Doesn't support large list views
         .skip_tester('Java')
         .skip_tester('JS')
+        .skip_tester('nanoarrow')
         .skip_tester('Rust'),
 
         generate_extension_case()
diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py
index 5b66842b25926..0ea244720cc1d 100644
--- a/dev/archery/archery/integration/runner.py
+++ b/dev/archery/archery/integration/runner.py
@@ -36,6 +36,7 @@
 from .tester_java import JavaTester
 from .tester_js import JSTester
 from .tester_csharp import CSharpTester
+from .tester_nanoarrow import NanoarrowTester
 from .util import guid, printer
 from .util import SKIP_C_ARRAY, SKIP_C_SCHEMA, SKIP_FLIGHT, SKIP_IPC
 from ..utils.source import ARROW_ROOT_DEFAULT
@@ -541,8 +542,8 @@ def get_static_json_files():
 
 def run_all_tests(with_cpp=True, with_java=True, with_js=True,
                   with_csharp=True, with_go=True, with_rust=False,
-                  run_ipc=False, run_flight=False, run_c_data=False,
-                  tempdir=None, **kwargs):
+                  with_nanoarrow=False, run_ipc=False, run_flight=False,
+                  run_c_data=False, tempdir=None, **kwargs):
     tempdir = tempdir or tempfile.mkdtemp(prefix='arrow-integration-')
 
     testers: List[Tester] = []
@@ -562,6 +563,9 @@ def run_all_tests(with_cpp=True, with_java=True, with_js=True,
     if with_go:
         testers.append(GoTester(**kwargs))
 
+    if with_nanoarrow:
+        testers.append(NanoarrowTester(**kwargs))
+
     if with_rust:
         testers.append(RustTester(**kwargs))
 
diff --git a/dev/archery/archery/integration/tester_nanoarrow.py b/dev/archery/archery/integration/tester_nanoarrow.py
new file mode 100644
index 0000000000000..30ff1bb6e50a7
--- /dev/null
+++ b/dev/archery/archery/integration/tester_nanoarrow.py
@@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import functools
+import os
+
+from . import cdata
+from .tester import Tester, CDataExporter, CDataImporter
+from ..utils.source import ARROW_ROOT_DEFAULT
+
+# Directory holding the integration library; ARROW_NANOARROW_PATH overrides it.
+_NANOARROW_PATH = os.environ.get(
+    "ARROW_NANOARROW_PATH",
+    os.path.join(ARROW_ROOT_DEFAULT, "nanoarrow/cdata"),
+)
+# Full path of the shared library that _load_ffi() opens by default.
+_INTEGRATION_DLL = os.path.join(
+    _NANOARROW_PATH, "libnanoarrow_c_data_integration" + cdata.dll_suffix
+)
+
+
+class NanoarrowTester(Tester):
+    """Tester that drives nanoarrow through its C Data integration library."""
+
+    # IPC file/stream and Flight flags are all switched off.
+    PRODUCER = CONSUMER = False
+    FLIGHT_SERVER = FLIGHT_CLIENT = False
+    # All four C Data flags (schema/array, exporter/importer) are switched on.
+    C_DATA_SCHEMA_EXPORTER = C_DATA_ARRAY_EXPORTER = True
+    C_DATA_SCHEMA_IMPORTER = C_DATA_ARRAY_IMPORTER = True
+
+    name = "nanoarrow"
+
+    def validate(self, json_path, arrow_path, quirks=None):
+        raise NotImplementedError()  # not provided by this tester
+
+    def json_to_file(self, json_path, arrow_path):
+        raise NotImplementedError()  # not provided by this tester
+
+    def stream_to_file(self, stream_path, file_path):
+        raise NotImplementedError()  # not provided by this tester
+
+    def file_to_stream(self, file_path, stream_path):
+        raise NotImplementedError()  # not provided by this tester
+
+    def make_c_data_exporter(self):
+        return NanoarrowCDataExporter(debug=self.debug, args=self.args)
+
+    def make_c_data_importer(self):
+        return NanoarrowCDataImporter(debug=self.debug, args=self.args)
+
+
+_nanoarrow_c_data_entrypoints = """
+    const char* nanoarrow_CDataIntegration_ExportSchemaFromJson(
+        const char* json_path, struct ArrowSchema* out);
+
+    const char* nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson(
+        const char* json_path, struct ArrowSchema* schema);
+
+    const char* nanoarrow_CDataIntegration_ExportBatchFromJson(
+        const char* json_path, int num_batch, struct ArrowArray* out);
+
+    const char* nanoarrow_CDataIntegration_ImportBatchAndCompareToJson(
+        const char* json_path, int num_batch, struct ArrowArray* batch);
+
+    int64_t nanoarrow_BytesAllocated(void);
+    """  # C declarations that _load_ffi() passes to ffi.cdef()
+
+
+@functools.lru_cache
+def _load_ffi(ffi, lib_path=_INTEGRATION_DLL):
+    """Register the nanoarrow C declarations on `ffi`, then dlopen `lib_path`."""
+    ffi.cdef(_nanoarrow_c_data_entrypoints)
+    return ffi.dlopen(lib_path)
+
+
+class _CDataBase:
+    def __init__(self, debug, args):
+        self.debug, self.args = debug, args
+        self.ffi = cdata.ffi()
+        self.dll = _load_ffi(self.ffi)
+
+    def _check_nanoarrow_error(self, na_error):
+        """
+        Raise RuntimeError if an integration entrypoint reported a failure.
+
+        `na_error` is the `const char*` returned by the entrypoint: NULL means
+        success, anything else is the error message. That string is statically
+        allocated on the nanoarrow side, so it is never released here.
+        """
+        assert self.ffi.typeof(na_error) is self.ffi.typeof("const char*")
+        if na_error == self.ffi.NULL:
+            return
+        message = self.ffi.string(na_error).decode("utf8", errors="replace")
+        raise RuntimeError(f"nanoarrow C Data Integration call failed: {message}")
+
+
+class NanoarrowCDataExporter(CDataExporter, _CDataBase):
+    def export_schema_from_json(self, json_path, c_schema_ptr):
+        """Export the schema read from `json_path` into `c_schema_ptr`."""
+        export = self.dll.nanoarrow_CDataIntegration_ExportSchemaFromJson
+        status = export(str(json_path).encode(), c_schema_ptr)
+        self._check_nanoarrow_error(status)
+
+    def export_batch_from_json(self, json_path, num_batch, c_array_ptr):
+        """Export batch number `num_batch` of `json_path` into `c_array_ptr`."""
+        export = self.dll.nanoarrow_CDataIntegration_ExportBatchFromJson
+        status = export(str(json_path).encode(), num_batch, c_array_ptr)
+        self._check_nanoarrow_error(status)
+
+    @property
+    def supports_releasing_memory(self):
+        return True  # always reported as supported
+
+    def record_allocation_state(self):
+        return self.dll.nanoarrow_BytesAllocated()  # value reported by the library
+
+
+class NanoarrowCDataImporter(CDataImporter, _CDataBase):
+    def import_schema_and_compare_to_json(self, json_path, c_schema_ptr):
+        """Check the imported `c_schema_ptr` against the JSON at `json_path`."""
+        compare = self.dll.nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson
+        status = compare(str(json_path).encode(), c_schema_ptr)
+        self._check_nanoarrow_error(status)
+
+    def import_batch_and_compare_to_json(self, json_path, num_batch, c_array_ptr):
+        """Check `c_array_ptr` against batch `num_batch` of the JSON file."""
+        compare = self.dll.nanoarrow_CDataIntegration_ImportBatchAndCompareToJson
+        status = compare(str(json_path).encode(), num_batch, c_array_ptr)
+        self._check_nanoarrow_error(status)
+
+    @property
+    def supports_releasing_memory(self):
+        return True  # always reported as supported
diff --git a/docker-compose.yml b/docker-compose.yml
index 9bedb59a77be8..7a4d455dfe723 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1751,9 +1751,11 @@ services:
     volumes: *conda-volumes
     environment:
       <<: [*common, *ccache]
+      ARCHERY_INTEGRATION_WITH_NANOARROW: 0
       ARCHERY_INTEGRATION_WITH_RUST: 0
       # Tell Archery where Arrow binaries are located
       ARROW_CPP_EXE_PATH: /build/cpp/debug
+      ARROW_NANOARROW_PATH: /build/nanoarrow
       ARROW_RUST_EXE_PATH: /build/rust/debug
     command:
       ["/arrow/ci/scripts/integration_arrow_build.sh /arrow /build &&