diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index fe49e275d908d..36a0dc014db8d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -32,12 +32,12 @@ env: jobs: complete: - name: AMD64 Ubuntu 22.04 Complete Documentation + name: AMD64 Debian 12 Complete Documentation runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 150 env: - UBUNTU: "22.04" + JDK: 17 steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -50,8 +50,8 @@ jobs: uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 with: path: .docker - key: ubuntu-docs-${{ hashFiles('cpp/**') }} - restore-keys: ubuntu-docs- + key: debian-docs-${{ hashFiles('cpp/**') }} + restore-keys: debian-docs- - name: Setup Python uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: @@ -62,7 +62,8 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: archery docker run ubuntu-docs + JDK: 17 + run: archery docker run debian-docs - name: Docker Push if: >- success() && @@ -73,4 +74,4 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true - run: archery docker push ubuntu-docs + run: archery docker push debian-docs diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index 376c87651d2d0..947e2ac21b83c 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -31,7 +31,7 @@ on: permissions: contents: read - + env: ARCHERY_DEBUG: 1 ARCHERY_USE_DOCKER_CLI: 1 diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 83afa69a653a9..4665a32e24bbe 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -28,6 +28,7 @@ sphinx-design sphinx-copybutton sphinx-lint sphinxcontrib-jquery +sphinxcontrib-mermaid sphinx==6.2 # Requirement for doctest-cython # Needs upper pin of 0.3.0, see: diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index ec424b4e6eaa0..1c916840e071b 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -21,18 +21,34 @@ FROM ${base} ARG r=4.4 ARG jdk=8 -# See R install instructions at https://cloud.r-project.org/bin/linux/ubuntu/ +ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium + +# See R install instructions at https://cloud.r-project.org/bin/linux/ RUN apt-get update -y && \ apt-get install -y \ - dirmngr \ apt-transport-https \ - software-properties-common && \ - wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | \ - tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc && \ - add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu '$(lsb_release -cs)'-cran40/' && \ + dirmngr \ + gpg \ + lsb-release && \ + gpg --keyserver keyserver.ubuntu.com \ + --recv-key 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 && \ + gpg --export 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 | \ + gpg --no-default-keyring \ + --keyring /usr/share/keyrings/cran.gpg \ + --import - && \ + echo "deb [signed-by=/usr/share/keyrings/cran.gpg] https://cloud.r-project.org/bin/linux/$(lsb_release -is | tr 'A-Z' 'a-z') $(lsb_release -cs)-cran40/" | \ + tee /etc/apt/sources.list.d/cran.list && \ + if [ -f /etc/apt/sources.list.d/debian.sources ]; then \ + sed -i \ + -e 's/main$/main contrib non-free non-free-firmware/g' \ + /etc/apt/sources.list.d/debian.sources; \ + fi && \ + apt-get update -y && \ apt-get install -y --no-install-recommends \ autoconf-archive \ automake \ + chromium \ + chromium-sandbox \ curl \ doxygen \ gi-docgen \ @@ -48,6 +64,8 @@ RUN apt-get update -y && \ libxml2-dev \ meson \ ninja-build \ + nodejs \ + npm \ nvidia-cuda-toolkit \ openjdk-${jdk}-jdk-headless \ pandoc \ @@ -55,9 +73,12 @@ RUN apt-get update -y && \ r-base=${r}* \ rsync \ ruby-dev \ + sudo \ wget && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* && \ + PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \ + npm install -g yarn @mermaid-js/mermaid-cli ENV JAVA_HOME=/usr/lib/jvm/java-${jdk}-openjdk-amd64 @@ -68,20 +89,6 @@ RUN /arrow/ci/scripts/util_download_apache.sh \ ENV PATH=/opt/apache-maven-${maven}/bin:$PATH RUN mvn -version -ARG node=16 -RUN apt-get purge -y npm && \ - apt-get autoremove -y --purge && \ - wget -q -O - https://deb.nodesource.com/setup_${node}.x | bash - && \ - apt-get install -y nodejs && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - npm install -g yarn - -COPY docs/requirements.txt /arrow/docs/ -RUN python3 -m venv ${ARROW_PYTHON_VENV} && \ - . ${ARROW_PYTHON_VENV}/bin/activate && \ - pip install -r arrow/docs/requirements.txt - COPY c_glib/Gemfile /arrow/c_glib/ RUN gem install --no-document bundler && \ bundle install --gemfile /arrow/c_glib/Gemfile @@ -98,6 +105,17 @@ COPY r/DESCRIPTION /arrow/r/ RUN /arrow/ci/scripts/r_deps.sh /arrow && \ R -e "install.packages('pkgdown')" +RUN useradd --user-group --create-home --groups audio,video arrow +RUN echo "arrow ALL=(ALL:ALL) NOPASSWD:ALL" | \ + EDITOR=tee visudo -f /etc/sudoers.d/arrow +USER arrow + +COPY docs/requirements.txt /arrow/docs/ +RUN sudo chown -R arrow: ${ARROW_PYTHON_VENV} && \ + python3 -m venv ${ARROW_PYTHON_VENV} && \ + . ${ARROW_PYTHON_VENV}/bin/activate && \ + pip install -r arrow/docs/requirements.txt + ENV ARROW_ACERO=ON \ ARROW_AZURE=OFF \ ARROW_BUILD_STATIC=OFF \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index e28ceae8801f0..ceeab2455bef6 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -229,12 +229,17 @@ find . -name "*.o" -delete popd if [ -x "$(command -v ldconfig)" ]; then - ldconfig ${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib} + if [ -x "$(command -v sudo)" ]; then + SUDO=sudo + else + SUDO= + fi + ${SUDO} ldconfig ${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib} fi if [ "${ARROW_USE_CCACHE}" == "ON" ]; then - echo -e "===\n=== ccache statistics after build\n===" - ccache -sv 2>/dev/null || ccache -s + echo -e "===\n=== ccache statistics after build\n===" + ccache -sv 2>/dev/null || ccache -s fi if command -v sccache &> /dev/null; then @@ -244,6 +249,6 @@ fi if [ "${BUILD_DOCS_CPP}" == "ON" ]; then pushd ${source_dir}/apidoc - doxygen + OUTPUT_DIRECTORY=${build_dir}/apidoc doxygen popd fi diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index a5a012ad2c5c4..2eb58e8dc75ec 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -40,6 +40,8 @@ if [ "${ARROW_INTEGRATION_JAVA}" == "ON" ]; then pip install jpype1 fi +export ARROW_BUILD_ROOT=${build_dir} + # Get more detailed context on crashes export PYTHONFAULTHANDLER=1 diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 2103f0329baec..0fa1edab429c0 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -75,7 +75,16 @@ fi # Use `2 * ncores` threads mvn="${mvn} -T 2C" -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +mkdir -p ${build_dir} +rm -rf ${build_dir}/format +cp -aL ${arrow_dir}/format ${build_dir}/ +rm -rf ${build_dir}/java +cp -aL ${source_dir} ${build_dir}/ +pushd ${build_dir}/java if [ "${ARROW_JAVA_SHADE_FLATBUFFERS}" == "ON" ]; then mvn="${mvn} -Pshade-flatbuffers" @@ -95,7 +104,7 @@ if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 mkdir -p ${build_dir}/docs/java/reference ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false clean install site - rsync -a ${arrow_dir}/java/target/site/apidocs/ ${build_dir}/docs/java/reference + rsync -a target/site/apidocs/ ${build_dir}/docs/java/reference fi popd diff --git a/ci/scripts/java_cdata_integration.sh b/ci/scripts/java_cdata_integration.sh index 86ea7cf155350..0ee5d3026aa09 100755 --- a/ci/scripts/java_cdata_integration.sh +++ b/ci/scripts/java_cdata_integration.sh @@ -20,9 +20,9 @@ set -ex arrow_dir=${1} -export ARROW_SOURCE_DIR=${arrow_dir} +build_dir=${2} -pushd ${arrow_dir}/java/c/src/test/python +pushd ${build_dir}/java/c/src/test/python python integration_tests.py diff --git a/ci/scripts/js_build.sh b/ci/scripts/js_build.sh index d61f74f0b7ca1..196539ee0f101 100755 --- a/ci/scripts/js_build.sh +++ b/ci/scripts/js_build.sh @@ -25,7 +25,16 @@ build_dir=${2} : ${BUILD_DOCS_JS:=OFF} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${build_dir}/js +mkdir -p ${build_dir} +cp -aL ${arrow_dir}/LICENSE.txt ${build_dir}/ +cp -aL ${arrow_dir}/NOTICE.txt ${build_dir}/ +cp -aL ${source_dir} ${build_dir}/js +pushd ${build_dir}/js yarn --immutable yarn lint:ci @@ -34,18 +43,18 @@ yarn build if [ "${BUILD_DOCS_JS}" == "ON" ]; then # If apache or upstream are defined use those as remote. # Otherwise use origin which could be a fork on PRs. - if [ "$(git config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then + if [ "$(git -C ${arrow_dir} config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then yarn doc --gitRemote apache - elif [[ "$(git config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then + elif [[ "$(git -C ${arrow_dir}config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then yarn doc --gitRemote upstream - elif [[ "$(basename -s .git $(git config --get remote.origin.url))" == "arrow" ]]; then + elif [[ "$(basename -s .git $(git -C ${arrow_dir} config --get remote.origin.url))" == "arrow" ]]; then yarn doc else echo "Failed to build docs because the remote is not set correctly. Please set the origin or upstream remote to https://github.com/apache/arrow.git or the apache remote to git@github.com:apache/arrow.git." exit 0 fi mkdir -p ${build_dir}/docs/js - rsync -a ${arrow_dir}/js/doc/ ${build_dir}/docs/js + rsync -a doc/ ${build_dir}/docs/js fi popd diff --git a/ci/scripts/js_test.sh b/ci/scripts/js_test.sh index 40de974ede161..863b1c3d34613 100755 --- a/ci/scripts/js_test.sh +++ b/ci/scripts/js_test.sh @@ -20,8 +20,9 @@ set -ex source_dir=${1}/js +build_dir=${2}/js -pushd ${source_dir} +pushd ${build_dir} yarn lint yarn test diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index 99153cdf75539..9455baf353633 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -78,17 +78,42 @@ export PYARROW_PARALLEL=${n_jobs} export CMAKE_PREFIX_PATH export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${python_build_dir} +cp -aL ${source_dir} ${python_build_dir} +pushd ${python_build_dir} # - Cannot call setup.py as it may install in the wrong directory # on Debian/Ubuntu (ARROW-15243). # - Cannot use build isolation as we want to use specific dependency versions # (e.g. Numpy, Pandas) on some CI jobs. ${PYTHON:-python} -m pip install --no-deps --no-build-isolation -vv . -# Remove build artifacts from source directory -find build/ -user root -delete popd if [ "${BUILD_DOCS_PYTHON}" == "ON" ]; then + # https://github.com/apache/arrow/issues/41429 + # TODO: We want to out-of-source build. This is a workaround. + # + # Copy docs/source because the "autosummary_generate = True" + # configuration generates files to docs/source/python/generated/. + rm -rf ${python_build_dir}/docs/source + mkdir -p ${python_build_dir}/docs + cp -a ${arrow_dir}/docs/source ${python_build_dir}/docs/ + rm -rf ${python_build_dir}/format + cp -a ${arrow_dir}/format ${python_build_dir}/ + rm -rf ${python_build_dir}/cpp/examples + mkdir -p ${python_build_dir}/cpp + cp -a ${arrow_dir}/cpp/examples ${python_build_dir}/cpp/ + rm -rf ${python_build_dir}/ci + cp -a ${arrow_dir}/ci/ ${python_build_dir}/ ncpus=$(python -c "import os; print(os.cpu_count())") - sphinx-build -b html -j ${ncpus} ${arrow_dir}/docs/source ${build_dir}/docs + export ARROW_CPP_DOXYGEN_XML=${build_dir}/cpp/apidoc/xml + pushd ${build_dir} + sphinx-build \ + -b html \ + ${python_build_dir}/docs/source \ + ${build_dir}/docs + popd fi diff --git a/ci/scripts/r_build.sh b/ci/scripts/r_build.sh index 38b54e4434036..f4dc5a5781c6e 100755 --- a/ci/scripts/r_build.sh +++ b/ci/scripts/r_build.sh @@ -24,15 +24,29 @@ build_dir=${2} : ${BUILD_DOCS_R:=OFF} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${build_dir}/r +cp -aL ${source_dir} ${build_dir}/r +pushd ${build_dir}/r # build first so that any stray compiled files in r/src are ignored ${R_BIN} CMD build . -${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz +if [ -x "$(command -v sudo)" ]; then + SUDO=sudo +else + SUDO= +fi +${SUDO} \ + env \ + PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig:${PKG_CONFIG_PATH} \ + ${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz if [ "${BUILD_DOCS_R}" == "ON" ]; then ${R_BIN} -e "pkgdown::build_site(install = FALSE)" - rsync -a ${source_dir}/docs/ ${build_dir}/docs/r + rsync -a docs/ ${build_dir}/docs/r fi popd diff --git a/dev/archery/archery/docker/core.py b/dev/archery/archery/docker/core.py index 7376bb0a3b72d..cb831060022a4 100644 --- a/dev/archery/archery/docker/core.py +++ b/dev/archery/archery/docker/core.py @@ -371,6 +371,10 @@ def run(self, service_name, command=None, *, env=None, volumes=None, v = "{}:{}".format(v['source'], v['target']) args.extend(['-v', v]) + # append capabilities from the compose conf + for c in service.get('cap_add', []): + args.extend([f'--cap-add={c}']) + # infer whether an interactive shell is desired or not if command in ['cmd.exe', 'bash', 'sh', 'powershell']: args.append('-it') diff --git a/dev/archery/archery/integration/tester_java.py b/dev/archery/archery/integration/tester_java.py index 8e7a0bb99f9de..ccc807410a848 100644 --- a/dev/archery/archery/integration/tester_java.py +++ b/dev/archery/archery/integration/tester_java.py @@ -18,17 +18,23 @@ import contextlib import functools import os +from pathlib import Path import subprocess from . import cdata from .tester import Tester, CDataExporter, CDataImporter from .util import run_cmd, log -from ..utils.source import ARROW_ROOT_DEFAULT + + +ARROW_BUILD_ROOT = os.environ.get( + 'ARROW_BUILD_ROOT', + Path(__file__).resolve().parents[5] +) def load_version_from_pom(): import xml.etree.ElementTree as ET - tree = ET.parse(os.path.join(ARROW_ROOT_DEFAULT, 'java', 'pom.xml')) + tree = ET.parse(os.path.join(ARROW_BUILD_ROOT, 'java', 'pom.xml')) tag_pattern = '{http://maven.apache.org/POM/4.0.0}version' version_tag = list(tree.getroot().findall(tag_pattern))[0] return version_tag.text @@ -48,7 +54,7 @@ def load_version_from_pom(): _ARROW_TOOLS_JAR = os.environ.get( "ARROW_JAVA_INTEGRATION_JAR", os.path.join( - ARROW_ROOT_DEFAULT, + ARROW_BUILD_ROOT, "java/tools/target", f"arrow-tools-{_arrow_version}-jar-with-dependencies.jar" ) @@ -56,7 +62,7 @@ def load_version_from_pom(): _ARROW_C_DATA_JAR = os.environ.get( "ARROW_C_DATA_JAVA_INTEGRATION_JAR", os.path.join( - ARROW_ROOT_DEFAULT, + ARROW_BUILD_ROOT, "java/c/target", f"arrow-c-data-{_arrow_version}.jar" ) @@ -64,7 +70,7 @@ def load_version_from_pom(): _ARROW_FLIGHT_JAR = os.environ.get( "ARROW_FLIGHT_JAVA_INTEGRATION_JAR", os.path.join( - ARROW_ROOT_DEFAULT, + ARROW_BUILD_ROOT, "java/flight/flight-integration-tests/target", f"flight-integration-tests-{_arrow_version}-jar-with-dependencies.jar" ) diff --git a/dev/archery/archery/integration/tester_js.py b/dev/archery/archery/integration/tester_js.py index c7f363ba54687..3d1a229931cde 100644 --- a/dev/archery/archery/integration/tester_js.py +++ b/dev/archery/archery/integration/tester_js.py @@ -16,13 +16,17 @@ # under the License. import os +from pathlib import Path from .tester import Tester from .util import run_cmd, log -from ..utils.source import ARROW_ROOT_DEFAULT -ARROW_JS_ROOT = os.path.join(ARROW_ROOT_DEFAULT, 'js') +ARROW_BUILD_ROOT = os.environ.get( + 'ARROW_BUILD_ROOT', + Path(__file__).resolve().parents[5] +) +ARROW_JS_ROOT = os.path.join(ARROW_BUILD_ROOT, 'js') _EXE_PATH = os.path.join(ARROW_JS_ROOT, 'bin') _VALIDATE = os.path.join(_EXE_PATH, 'integration.ts') _JSON_TO_ARROW = os.path.join(_EXE_PATH, 'json-to-arrow.ts') diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 126b0fcb6f76a..146fa52fa958b 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -65,7 +65,7 @@ groups: - r-binary-packages - ubuntu-* - wheel-* - - test-ubuntu-*-docs + - test-debian-*-docs {############################# Testing tasks #################################} @@ -1458,15 +1458,15 @@ tasks: {% endfor %} # be sure to update binary-task.rb when upgrading ubuntu - test-ubuntu-22.04-docs: + test-debian-12-docs: ci: github template: docs/github.linux.yml params: env: - UBUNTU: 22.04 + JDK: 17 pr_number: Unset flags: "-v $PWD/build/:/build/" - image: ubuntu-docs + image: debian-docs publish: false artifacts: - docs.tar.gz @@ -1594,8 +1594,8 @@ tasks: template: docs/github.linux.yml params: env: - UBUNTU: 22.04 + JDK: 17 pr_number: Unset flags: "-v $PWD/build/:/build/" - image: ubuntu-docs + image: debian-docs publish: true diff --git a/docker-compose.yml b/docker-compose.yml index d771fc2d22a35..9bedb59a77be8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -131,7 +131,8 @@ x-hierarchy: - debian-cpp: - debian-c-glib: - debian-ruby - - debian-python + - debian-python: + - debian-docs - debian-go: - debian-go-cgo - debian-go-cgo-python @@ -145,8 +146,7 @@ x-hierarchy: - ubuntu-c-glib: - ubuntu-ruby - ubuntu-lint - - ubuntu-python: - - ubuntu-docs + - ubuntu-python - ubuntu-python-sdist-test - ubuntu-r - ubuntu-r-only-r @@ -1228,6 +1228,8 @@ services: # We should extend the list of enabled rules after adding this build to # the CI pipeline. image: ${REPO}:${ARCH}-conda-python-${PYTHON}-pandas-${PANDAS} + cap_add: + - SYS_ADMIN environment: <<: [*common, *ccache] ARROW_SUBSTRAIT: "ON" @@ -1378,7 +1380,7 @@ services: /arrow/ci/scripts/python_build.sh /arrow /build && /arrow/ci/scripts/java_jni_build.sh /arrow $${ARROW_HOME} /build /tmp/dist/java/ && /arrow/ci/scripts/java_build.sh /arrow /build /tmp/dist/java && - /arrow/ci/scripts/java_cdata_integration.sh /arrow /tmp/dist/java" ] + /arrow/ci/scripts/java_cdata_integration.sh /arrow /build" ] conda-python-cython2: # Usage: @@ -1680,7 +1682,7 @@ services: command: &js-command > /bin/bash -c " /arrow/ci/scripts/js_build.sh /arrow /build && - /arrow/ci/scripts/js_test.sh /arrow" + /arrow/ci/scripts/js_test.sh /arrow /build" #################################### C# ##################################### @@ -1759,29 +1761,34 @@ services: ################################ Docs ####################################### - ubuntu-docs: + debian-docs: # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-python - # docker-compose build ubuntu-docs - # docker-compose run --rm ubuntu-docs - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-docs + # docker-compose build debian-cpp + # docker-compose build debian-python + # docker-compose build debian-docs + # docker-compose run --rm debian-docs + image: ${REPO}:${ARCH}-debian-${DEBIAN}-docs build: context: . dockerfile: ci/docker/linux-apt-docs.dockerfile cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-docs + - ${REPO}:${ARCH}-debian-${DEBIAN}-docs args: r: ${R} jdk: ${JDK} maven: ${MAVEN} node: ${NODE} - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 + base: ${REPO}:${ARCH}-debian-${DEBIAN}-python-3 + # This is for Chromium used by Mermaid. Chromium uses namespace + # isolation for security by default. + cap_add: + - SYS_ADMIN environment: <<: [*common, *ccache] ARROW_CUDA: "ON" ARROW_CXX_FLAGS_DEBUG: "-g1" ARROW_C_FLAGS_DEBUG: "-g1" + ARROW_HOME: "/tmp/local" ARROW_JAVA_SKIP_GIT_PLUGIN: ARROW_SUBSTRAIT: "ON" BUILD_DOCS_C_GLIB: "ON" @@ -1790,9 +1797,11 @@ services: BUILD_DOCS_JS: "ON" BUILD_DOCS_PYTHON: "ON" BUILD_DOCS_R: "ON" - volumes: *ubuntu-volumes - command: &docs-command > + volumes: *debian-volumes + command: > /bin/bash -c " + sudo mkdir -p /build /ccache && + sudo chown -R `id --user --name`: /build /ccache && /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && /arrow/ci/scripts/c_glib_build.sh /arrow /build && diff --git a/docs/requirements.txt b/docs/requirements.txt index 8891680814dff..afb252e17457b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -8,8 +8,9 @@ myst-parser[linkify] numpydoc pydata-sphinx-theme~=0.14 sphinx-autobuild -sphinx-design sphinx-copybutton +sphinx-design sphinx-lint +sphinxcontrib-mermaid sphinx==6.2 pandas diff --git a/docs/source/conf.py b/docs/source/conf.py index 05340dc923c89..b487200555a09 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -125,6 +125,7 @@ 'sphinx.ext.intersphinx', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', + 'sphinxcontrib.mermaid', ] # Show members for classes in .. autosummary @@ -137,7 +138,9 @@ } # Breathe configuration -breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"} +breathe_projects = { + "arrow_cpp": os.environ.get("ARROW_CPP_DOXYGEN_XML", "../../cpp/apidoc/xml"), +} breathe_default_project = "arrow_cpp" # Overridden conditionally below @@ -584,6 +587,9 @@ # # texinfo_no_detailmenu = False +# -- Options for mermaid output ------------------------------------------- + +mermaid_output_format = 'svg' def setup(app): # Use a config value to indicate whether CUDA API docs can be generated. diff --git a/docs/source/format/Flight.rst b/docs/source/format/Flight.rst index 7ee84952b4350..c65a1f70bde7f 100644 --- a/docs/source/format/Flight.rst +++ b/docs/source/format/Flight.rst @@ -68,9 +68,8 @@ Downloading Data A client that wishes to download the data would: -.. figure:: ./Flight/DoGet.mmd.svg - - Retrieving data via ``DoGet``. +.. mermaid:: ./Flight/DoGet.mmd + :caption: Retrieving data via ``DoGet``. #. Construct or acquire a ``FlightDescriptor`` for the data set they are interested in. @@ -168,9 +167,8 @@ data. However, ``GetFlightInfo`` doesn't return until the query completes, so the client is blocked. In this situation, the client can use ``PollFlightInfo`` instead of ``GetFlightInfo``: -.. figure:: ./Flight/PollFlightInfo.mmd.svg - - Polling a long-running query by ``PollFlightInfo``. +.. mermaid:: ./Flight/PollFlightInfo.mmd + :caption: Polling a long-running query by ``PollFlightInfo``. #. Construct or acquire a ``FlightDescriptor``, as before. #. Call ``PollFlightInfo(FlightDescriptor)`` to get a ``PollInfo`` @@ -229,9 +227,8 @@ Uploading Data To upload data, a client would: -.. figure:: ./Flight/DoPut.mmd.svg - - Uploading data via ``DoPut``. +.. mermaid:: ./Flight/DoPut.mmd + :caption: Uploading data via ``DoPut``. #. Construct or acquire a ``FlightDescriptor``, as before. #. Call ``DoPut(FlightData)`` and upload a stream of Arrow record @@ -257,9 +254,8 @@ require being stateful if implemented using ``DoGet`` and ``DoPut``. Instead, ``DoExchange`` allows this to be implemented as a single call. A client would: -.. figure:: ./Flight/DoExchange.mmd.svg - - Complex data flow with ``DoExchange``. +.. mermaid:: ./Flight/DoExchange.mmd + :caption: Complex data flow with ``DoExchange``. #. Construct or acquire a ``FlightDescriptor``, as before. #. Call ``DoExchange(FlightData)``. diff --git a/docs/source/format/Flight/DoExchange.mmd b/docs/source/format/Flight/DoExchange.mmd index 14f1789aeaaa7..f7586bf35eb4f 100644 --- a/docs/source/format/Flight/DoExchange.mmd +++ b/docs/source/format/Flight/DoExchange.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/DoExchange.mmd.svg b/docs/source/format/Flight/DoExchange.mmd.svg deleted file mode 100644 index 204d63d77218d..0000000000000 --- a/docs/source/format/Flight/DoExchange.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerThe first FlightData includes a FlightDescriptorDoExchange(FlightData)1stream of FlightData2stream of FlightData3par[[Client sends data]][[Server sends data]]ClientServer \ No newline at end of file diff --git a/docs/source/format/Flight/DoGet.mmd b/docs/source/format/Flight/DoGet.mmd index c2e3cd034448c..cac59afb8219f 100644 --- a/docs/source/format/Flight/DoGet.mmd +++ b/docs/source/format/Flight/DoGet.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/DoGet.mmd.svg b/docs/source/format/Flight/DoGet.mmd.svg deleted file mode 100644 index 48a50d77ed33f..0000000000000 --- a/docs/source/format/Flight/DoGet.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientMetadata ServerData ServerGetFlightInfo(FlightDescriptor)1FlightInfo{endpoints: [FlightEndpoint{ticket: Ticket}, …]}2This may be parallelizedDoGet(Ticket)3stream of FlightData4loop[for each endpoint in FlightInfo.endpoints]ClientMetadata ServerData Server \ No newline at end of file diff --git a/docs/source/format/Flight/DoPut.mmd b/docs/source/format/Flight/DoPut.mmd index 5845edef1f466..876505da2d300 100644 --- a/docs/source/format/Flight/DoPut.mmd +++ b/docs/source/format/Flight/DoPut.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/DoPut.mmd.svg b/docs/source/format/Flight/DoPut.mmd.svg deleted file mode 100644 index 9e490e152bdb3..0000000000000 --- a/docs/source/format/Flight/DoPut.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerThe first FlightData includes a FlightDescriptorDoPut(FlightData)1stream of FlightData2PutResult{app_metadata}3ClientServer \ No newline at end of file diff --git a/docs/source/format/Flight/PollFlightInfo.mmd b/docs/source/format/Flight/PollFlightInfo.mmd index d062a3a216958..f91c077b655c0 100644 --- a/docs/source/format/Flight/PollFlightInfo.mmd +++ b/docs/source/format/Flight/PollFlightInfo.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd):/data minlag/mermaid-cli -i /data/PollFlightInfo.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/PollFlightInfo.mmd.svg b/docs/source/format/Flight/PollFlightInfo.mmd.svg deleted file mode 100644 index 1890361f88ce4..0000000000000 --- a/docs/source/format/Flight/PollFlightInfo.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientMetadata ServerData ServerThis may be parallelizedSome endpoints may be processed while pollingloop[for each endpoint in FlightInfo.endpoints]PollFlightInfo(FlightDescriptor)1PollInfo{descriptor: FlightDescriptor', ...}2PollFlightInfo(FlightDescriptor')3PollInfo{descriptor: FlightDescriptor'', ...}4PollFlightInfo(FlightDescriptor'')5PollInfo{descriptor: null, info: FlightInfo{endpoints: [FlightEndpoint{ticket: Ticket}, …]}6DoGet(Ticket)7stream of FlightData8ClientMetadata ServerData Server \ No newline at end of file diff --git a/docs/source/format/FlightSql.rst b/docs/source/format/FlightSql.rst index 1a43e4bdff306..181efce286e70 100644 --- a/docs/source/format/FlightSql.rst +++ b/docs/source/format/FlightSql.rst @@ -242,21 +242,17 @@ Close and invalidate the current session context. Sequence Diagrams ================= -.. figure:: ./FlightSql/CommandGetTables.mmd.svg +.. mermaid:: ./FlightSql/CommandGetTables.mmd + :caption: Listing available tables. - Listing available tables. +.. mermaid:: ./FlightSql/CommandStatementQuery.mmd + :caption: Executing an ad-hoc query. -.. figure:: ./FlightSql/CommandStatementQuery.mmd.svg +.. mermaid:: ./FlightSql/CommandPreparedStatementQuery.mmd + :caption: Creating a prepared statement, then executing it. - Executing an ad-hoc query. - -.. figure:: ./FlightSql/CommandPreparedStatementQuery.mmd.svg - - Creating a prepared statement, then executing it. - -.. figure:: ./FlightSql/CommandStatementIngest.mmd.svg - - Executing a bulk ingestion. +.. mermaid:: ./FlightSql/CommandStatementIngest.mmd + :caption: Executing a bulk ingestion. External Resources ================== diff --git a/docs/source/format/FlightSql/CommandGetTables.mmd b/docs/source/format/FlightSql/CommandGetTables.mmd index f151411647f23..e6b18ed7dc08b 100644 --- a/docs/source/format/FlightSql/CommandGetTables.mmd +++ b/docs/source/format/FlightSql/CommandGetTables.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandGetTables.mmd.svg b/docs/source/format/FlightSql/CommandGetTables.mmd.svg deleted file mode 100644 index 4e71c01982289..0000000000000 --- a/docs/source/format/FlightSql/CommandGetTables.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerGetFlightInfo(CommandGetTables)1FlightInfo{…Ticket…}2DoGet(Ticket)3stream of FlightData4ClientServer \ No newline at end of file diff --git a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd b/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd index cbd1eb6014bca..ce18b91eaa33e 100644 --- a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd +++ b/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandPreparedStatementQuery.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg b/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg deleted file mode 100644 index cbf6a78e9a5ce..0000000000000 --- a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ServerClientServerClientoptional response with updated handleloop[for each endpoint in FlightInfo.endpoints]loop[for each invocation of the prepared statement]DoAction(ActionCreatePreparedStatementRequest)1ActionCreatePreparedStatementResult{handle}2DoPut(CommandPreparedStatementQuery)3stream of FlightData4DoPutPreparedStatementResult{handle}5GetFlightInfo(CommandPreparedStatementQuery)6FlightInfo{endpoints: [FlightEndpoint{…}, …]}7DoGet(endpoint.ticket)8stream of FlightData9DoAction(ActionClosePreparedStatementRequest)10ActionClosePreparedStatementRequest{}11 \ No newline at end of file diff --git a/docs/source/format/FlightSql/CommandStatementIngest.mmd b/docs/source/format/FlightSql/CommandStatementIngest.mmd index 781289d77b41a..0578f465d4dda 100644 --- a/docs/source/format/FlightSql/CommandStatementIngest.mmd +++ b/docs/source/format/FlightSql/CommandStatementIngest.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandStatementIngest.mmd.svg b/docs/source/format/FlightSql/CommandStatementIngest.mmd.svg deleted file mode 100644 index e2aa72459afa5..0000000000000 --- a/docs/source/format/FlightSql/CommandStatementIngest.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ServerClientServerClientDoPut(CommandStatementIngest)1stream of FlightData2PutResult{DoPutUpdateResult{RecordCount: int64}}3 \ No newline at end of file diff --git a/docs/source/format/FlightSql/CommandStatementQuery.mmd b/docs/source/format/FlightSql/CommandStatementQuery.mmd index 7b67fecfb75c6..f26aa2f951fcf 100644 --- a/docs/source/format/FlightSql/CommandStatementQuery.mmd +++ b/docs/source/format/FlightSql/CommandStatementQuery.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandStatementQuery.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandStatementQuery.mmd.svg b/docs/source/format/FlightSql/CommandStatementQuery.mmd.svg deleted file mode 100644 index f5e8c79f137ff..0000000000000 --- a/docs/source/format/FlightSql/CommandStatementQuery.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerGetFlightInfo(CommandStatementQuery)1FlightInfo{endpoints: [FlightEndpoint{…}, …]}2DoGet(endpoint.ticket)3stream of FlightData4loop[for each endpoint in FlightInfo.endpoints]ClientServer \ No newline at end of file