diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2e598e0a95064..bf5ca08d53c32 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -132,3 +132,9 @@ repos: ?^cpp/cmake_modules/UseCython\.cmake$| ?^cpp/src/arrow/util/config\.h\.cmake$| ) + - repo: https://github.com/sphinx-contrib/sphinx-lint + rev: v0.9.1 + hooks: + - id: sphinx-lint + files: ^docs/ + args: ['--disable', 'all', '--enable', 'trailing-whitespace,missing-final-newline', 'docs'] diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 0a356d5722c42..83afa69a653a9 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -26,6 +26,7 @@ pydata-sphinx-theme=0.14 sphinx-autobuild sphinx-design sphinx-copybutton +sphinx-lint sphinxcontrib-jquery sphinx==6.2 # Requirement for doctest-cython diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 5fa41e28a3208..8a26d9266f22d 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -261,6 +261,7 @@ def build(ctx, src, build_dir, force, targets, **kwargs): "Check all sources files for license texts via Apache RAT."), LintCheck('r', "Lint R files."), LintCheck('docker', "Lint Dockerfiles with hadolint."), + LintCheck('docs', "Lint docs with sphinx-lint."), ] @@ -285,9 +286,10 @@ def decorate_lint_command(cmd): help="Run IWYU on all C++ files if enabled") @click.option("-a", "--all", is_flag=True, default=False, help="Enable all checks.") +@click.argument("path", required=False) @decorate_lint_command @click.pass_context -def lint(ctx, src, fix, iwyu_all, **checks): +def lint(ctx, src, fix, iwyu_all, path, **checks): if checks.pop('all'): # "--all" is given => enable all non-selected checks for k, v in checks.items(): @@ -297,7 +299,7 @@ def lint(ctx, src, fix, iwyu_all, **checks): raise click.UsageError( "Need to enable at least one lint check (try --help)") try: - linter(src, fix, iwyu_all=iwyu_all, **checks) + linter(src, fix, iwyu_all=iwyu_all, path=path, **checks) except LintValidationException: sys.exit(1) diff --git a/dev/archery/archery/utils/lint.py b/dev/archery/archery/utils/lint.py index 15f22ca2e6e5c..108c9ded361e7 100644 --- a/dev/archery/archery/utils/lint.py +++ b/dev/archery/archery/utils/lint.py @@ -436,10 +436,55 @@ def docker_linter(src): cwd=src.path)) -def linter(src, fix=False, *, clang_format=False, cpplint=False, +class SphinxLint(Command): + def __init__(self, src, path=None, sphinx_lint_bin=None, disable=None, enable=None): + self.src = src + self.path = path + self.bin = default_bin(sphinx_lint_bin, "sphinx-lint") + self.disable = disable or "all" + self.enable = enable + + def lint(self, *args, check=False): + docs_path = os.path.join(self.src.path, "docs") + + args = [] + + if self.disable: + args.extend(["--disable", self.disable]) + + if self.enable: + args.extend(["--enable", self.enable]) + + if self.path is not None: + args.extend([self.path]) + else: + args.extend([docs_path]) + + return self.run(*args, check=check) + + +def docs_linter(src, path=None): + """Run sphinx-lint on docs.""" + logger.info("Running docs linter (sphinx-lint)") + + sphinx_lint = SphinxLint( + src, + path=path, + disable="all", + enable="trailing-whitespace,missing-final-newline" + ) + + if not sphinx_lint.available: + logger.error("sphinx-lint linter requested but sphinx-lint binary not found") + return + + yield LintResult.from_cmd(sphinx_lint.lint()) + + +def linter(src, fix=False, path=None, *, clang_format=False, cpplint=False, clang_tidy=False, iwyu=False, iwyu_all=False, 
python=False, numpydoc=False, cmake_format=False, rat=False, - r=False, docker=False): + r=False, docker=False, docs=False): """Run all linters.""" with tmpdir(prefix="arrow-lint-") as root: build_dir = os.path.join(root, "cpp-build") @@ -481,6 +526,9 @@ def linter(src, fix=False, *, clang_format=False, cpplint=False, if docker: results.extend(docker_linter(src)) + if docs: + results.extend(docs_linter(src, path)) + # Raise error if one linter failed, ensuring calling code can exit with # non-zero. for result in results: diff --git a/dev/archery/setup.py b/dev/archery/setup.py index 23a1600910d04..cd3e2e9ca0834 100755 --- a/dev/archery/setup.py +++ b/dev/archery/setup.py @@ -41,7 +41,7 @@ 'integration': ['cffi'], 'integration-java': ['jpype1'], 'lint': ['numpydoc==1.1.0', 'autopep8', 'flake8==6.1.0', 'cython-lint', - 'cmake_format==0.6.13'], + 'cmake_format==0.6.13', 'sphinx-lint==0.9.1'], 'numpydoc': ['numpydoc==1.1.0'], 'release': ['pygithub', jinja_req, 'jira', 'semver', 'gitpython'], } diff --git a/docs/requirements.txt b/docs/requirements.txt index 252344a74a58f..8891680814dff 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -10,5 +10,6 @@ pydata-sphinx-theme~=0.14 sphinx-autobuild sphinx-design sphinx-copybutton +sphinx-lint sphinx==6.2 pandas diff --git a/docs/source/cpp/acero/developer_guide.rst b/docs/source/cpp/acero/developer_guide.rst index 331cd833b58af..80ca68556fc40 100644 --- a/docs/source/cpp/acero/developer_guide.rst +++ b/docs/source/cpp/acero/developer_guide.rst @@ -187,7 +187,7 @@ Examples task (described below) as completed which allows the plan to finish. * The ``fetch`` node, in ``InputReceived``, may decide that it has all the data it needs. It can then call ``StopProducing`` on its input. - + Initialization / Construction / Destruction ------------------------------------------- @@ -271,7 +271,7 @@ distributed systems. Once that has been done then it should be possible to do a meaning exchanging between multiple exec plan instances on a single system) if desired. .. figure:: dist_plan.svg - + A distributed plan can provide parallelism even if the plans themselves run serially Pipeline Parallelism @@ -472,7 +472,7 @@ Benchmarking The most complete macro benchmarking for Acero is provided by https://github.com/voltrondata-labs/arrowbench These include a set of TPC-H benchmarks, executed from the R-dplyr integration, which are run on every Arrow commit and -reported to Conbench at https://conbench.ursa.dev/ +reported to Conbench at https://conbench.ursa.dev/ In addition to these TPC-H benchmarks there are a number of micro-benchmarks for various nodes (hash-join, asof-join, etc.) Finally, the compute functions themselves should mostly have micro-benchmarks. For more on micro benchmarks you diff --git a/docs/source/cpp/acero/overview.rst b/docs/source/cpp/acero/overview.rst index c569f82b099b6..8be4cbc1b1772 100644 --- a/docs/source/cpp/acero/overview.rst +++ b/docs/source/cpp/acero/overview.rst @@ -206,7 +206,7 @@ is very similar to a RecordBatch. It can have zero or more columns and all of t must have the same length. There are a few key differences from ExecBatch: .. figure:: rb_vs_eb.svg - + Both the record batch and the exec batch have strong ownership of the arrays & buffers * An `ExecBatch` does not have a schema. This is because an `ExecBatch` is assumed to be @@ -217,7 +217,7 @@ must have the same length. There are a few key differences from ExecBatch: also has a length property which describes how many rows are in a batch. 
So another way to view a `Scalar` is a constant array with `length` elements. * An `ExecBatch` contains additional information used by the exec plan. For example, an - `index` can be used to describe a batch's position in an ordered stream. We expect + `index` can be used to describe a batch's position in an ordered stream. We expect that `ExecBatch` will also evolve to contain additional fields such as a selection vector. .. figure:: scalar_vs_array.svg @@ -266,5 +266,5 @@ various query representations (e.g. Substrait). The Declaration objects are the with the DeclarationToXyz methods, are the current public API for Acero. .. figure:: decl_vs_ep.svg - - A declaration is a blueprint that is used to instantiate exec plan instances \ No newline at end of file + + A declaration is a blueprint that is used to instantiate exec plan instances diff --git a/docs/source/cpp/acero/substrait.rst b/docs/source/cpp/acero/substrait.rst index 797b2407f93cd..a5532733627c1 100644 --- a/docs/source/cpp/acero/substrait.rst +++ b/docs/source/cpp/acero/substrait.rst @@ -111,7 +111,7 @@ Aggregate Relations * Each measure's arguments must be direct references. * A measure may not have a filter * A measure may not have sorts -* A measure's invocation must be AGGREGATION_INVOCATION_ALL or +* A measure's invocation must be AGGREGATION_INVOCATION_ALL or AGGREGATION_INVOCATION_UNSPECIFIED * A measure's phase must be AGGREGATION_PHASE_INITIAL_TO_RESULT @@ -146,73 +146,73 @@ Types - Caveat * - boolean - boolean - - + - * - i8 - int8 - - + - * - i16 - int16 - - + - * - i32 - int32 - - + - * - i64 - int64 - - + - * - fp32 - float32 - - + - * - fp64 - float64 - - + - * - string - string - - + - * - binary - binary - - + - * - timestamp - timestamp - - + - * - timestamp_tz - timestamp - - + - * - date - date32 - - + - * - time - time64 - - + - * - interval_year - - + - - Not currently supported * - interval_day - - + - - Not currently supported * - uuid - - + - - Not currently supported * - FIXEDCHAR - - + - - Not currently supported * - VARCHAR - - + - - Not currently supported * - FIXEDBINARY - fixed_size_binary - - + - * - DECIMAL - decimal128 - - + - * - STRUCT - struct - Arrow struct fields will have no name (empty string) * - NSTRUCT - - + - - Not currently supported * - LIST - list - - + - * - MAP - map - K must not be nullable diff --git a/docs/source/cpp/acero/user_guide.rst b/docs/source/cpp/acero/user_guide.rst index eca1a0104708b..adcc17216e5ae 100644 --- a/docs/source/cpp/acero/user_guide.rst +++ b/docs/source/cpp/acero/user_guide.rst @@ -32,14 +32,14 @@ Using Acero The basic workflow for Acero is this: #. First, create a graph of :class:`Declaration` objects describing the plan - + #. Call one of the DeclarationToXyz methods to execute the Declaration. a. A new ExecPlan is created from the graph of Declarations. Each Declaration will correspond to one ExecNode in the plan. In addition, a sink node will be added, depending on which DeclarationToXyz method was used. - b. The ExecPlan is executed. Typically this happens as part of the DeclarationToXyz call but in + b. The ExecPlan is executed. Typically this happens as part of the DeclarationToXyz call but in DeclarationToReader the reader is returned before the plan is finished executing. c. Once the plan is finished it is destroyed @@ -315,7 +315,7 @@ of a specific execution node. ``source`` ---------- -A ``source`` operation can be considered as an entry point to create a streaming execution plan. 
+A ``source`` operation can be considered as an entry point to create a streaming execution plan. :class:`SourceNodeOptions` are used to create the ``source`` operation. The ``source`` operation is the most generic and flexible type of source currently available but it can be quite tricky to configure. First you should review the other source node types to ensure there @@ -326,7 +326,7 @@ function should take no arguments and should return an ``arrow::Future>``. This function might be reading a file, iterating through an in memory structure, or receiving data from a network connection. The arrow library refers to these functions as ``arrow::AsyncGenerator`` -and there are a number of utilities for working with these functions. For this example we use +and there are a number of utilities for working with these functions. For this example we use a vector of record batches that we've already stored in memory. In addition, the schema of the data must be known up front. Acero must know the schema of the data at each stage of the execution graph before any processing has begun. This means we must supply the @@ -368,10 +368,10 @@ Example of using ``source`` (usage of sink is explained in detail in :ref:`sink< In the previous example, :ref:`source node `, a source node was used to input the data. But when developing an application, if the data is already in memory as a table, it is much easier, and more performant to use :class:`TableSourceNodeOptions`. -Here the input data can be passed as a ``std::shared_ptr`` along with a ``max_batch_size``. +Here the input data can be passed as a ``std::shared_ptr`` along with a ``max_batch_size``. The ``max_batch_size`` is to break up large record batches so that they can be processed in parallel. It is important to note that the table batches will not get merged to form larger batches when the source -table has a smaller batch size. +table has a smaller batch size. Example of using ``table_source`` @@ -387,7 +387,7 @@ Example of using ``table_source`` ``filter`` ---------- -``filter`` operation, as the name suggests, provides an option to define data filtering +``filter`` operation, as the name suggests, provides an option to define data filtering criteria. It selects rows where the given expression evaluates to true. Filters can be written using :class:`arrow::compute::Expression`, and the expression should have a return type of boolean. For example, if we wish to keep rows where the value @@ -415,7 +415,7 @@ functions, i.e. elementwise functions that return one value for each input row independent of the value of all other rows). This is exposed via :class:`ProjectNodeOptions` which requires, an :class:`arrow::compute::Expression` and name for each of the output columns (if names are not -provided, the string representations of exprs will be used). +provided, the string representations of exprs will be used). Project example: @@ -456,7 +456,7 @@ can be selected from :ref:`this list of aggregation functions The aggregation can provide results as a group or scalar. For instances, an operation like `hash_count` provides the counts per each unique record -as a grouped result while an operation like `sum` provides a single record. +as a grouped result while an operation like `sum` provides a single record. Scalar Aggregation example: @@ -481,14 +481,14 @@ Group Aggregation example: ``sink`` -------- -``sink`` operation provides output and is the final node of a streaming -execution definition. 
:class:`SinkNodeOptions` interface is used to pass +``sink`` operation provides output and is the final node of a streaming +execution definition. :class:`SinkNodeOptions` interface is used to pass the required options. Similar to the source operator the sink operator exposes the output with a function that returns a record batch future each time it is called. It is expected the caller will repeatedly call this function until the generator function is exhausted (returns ``std::optional::nullopt``). If this function is not called often enough then record batches will accumulate in memory. An execution plan should only have one -"terminal" node (one sink node). An :class:`ExecPlan` can terminate early due to cancellation or +"terminal" node (one sink node). An :class:`ExecPlan` can terminate early due to cancellation or an error, before the output is fully consumed. However, the plan can be safely destroyed independently of the sink, which will hold the unconsumed batches by `exec_plan->finished()`. @@ -526,12 +526,12 @@ Example:: arrow::Future<> finish = arrow::Future<>::Make(); struct CustomSinkNodeConsumer : public cp::SinkNodeConsumer { - CustomSinkNodeConsumer(std::atomic *batches_seen, arrow::Future<>finish): + CustomSinkNodeConsumer(std::atomic *batches_seen, arrow::Future<>finish): batches_seen(batches_seen), finish(std::move(finish)) {} // Consumption logic can be written here arrow::Status Consume(cp::ExecBatch batch) override { // data can be consumed in the expected way - // transfer to another system or just do some work + // transfer to another system or just do some work // and write to disk (*batches_seen)++; return arrow::Status::OK(); @@ -541,9 +541,9 @@ Example:: std::atomic *batches_seen; arrow::Future<> finish; - + }; - + std::shared_ptr consumer = std::make_shared(&batches_seen, finish); @@ -567,14 +567,14 @@ Consuming-Sink example: ``order_by_sink`` ----------------- -``order_by_sink`` operation is an extension to the ``sink`` operation. -This operation provides the ability to guarantee the ordering of the -stream by providing the :class:`OrderBySinkNodeOptions`. -Here the :class:`arrow::compute::SortOptions` are provided to define which columns +``order_by_sink`` operation is an extension to the ``sink`` operation. +This operation provides the ability to guarantee the ordering of the +stream by providing the :class:`OrderBySinkNodeOptions`. +Here the :class:`arrow::compute::SortOptions` are provided to define which columns are used for sorting and whether to sort by ascending or descending values. .. note:: This node is a "pipeline breaker" and will fully materialize the dataset in memory. - In the future, spillover mechanisms will be added which should alleviate this + In the future, spillover mechanisms will be added which should alleviate this constraint. @@ -593,14 +593,14 @@ Order-By-Sink example: ``select_k_sink`` ----------------- -``select_k_sink`` option enables selecting the top/bottom K elements, -similar to a SQL ``ORDER BY ... LIMIT K`` clause. -:class:`SelectKOptions` which is a defined by -using :struct:`OrderBySinkNode` definition. This option returns a sink node that receives +``select_k_sink`` option enables selecting the top/bottom K elements, +similar to a SQL ``ORDER BY ... LIMIT K`` clause. +:class:`SelectKOptions` which is a defined by +using :struct:`OrderBySinkNode` definition. This option returns a sink node that receives inputs and then compute top_k/bottom_k. .. 
note:: This node is a "pipeline breaker" and will fully materialize the input in memory. - In the future, spillover mechanisms will be added which should alleviate this + In the future, spillover mechanisms will be added which should alleviate this constraint. SelectK example: @@ -617,7 +617,7 @@ SelectK example: .. _stream_execution_table_sink_docs: -The ``table_sink`` node provides the ability to receive the output as an in-memory table. +The ``table_sink`` node provides the ability to receive the output as an in-memory table. This is simpler to use than the other sink nodes provided by the streaming execution engine but it only makes sense when the output fits comfortably in memory. The node is created using :class:`TableSinkNodeOptions`. @@ -637,7 +637,7 @@ Example of using ``table_sink`` --------- ``scan`` is an operation used to load and process datasets. It should be preferred over the -more generic ``source`` node when your input is a dataset. The behavior is defined using +more generic ``source`` node when your input is a dataset. The behavior is defined using :class:`arrow::dataset::ScanNodeOptions`. More information on datasets and the various scan options can be found in :doc:`../dataset`. @@ -683,10 +683,10 @@ Write example: ``union`` ------------- -``union`` merges multiple data streams with the same schema into one, similar to +``union`` merges multiple data streams with the same schema into one, similar to a SQL ``UNION ALL`` clause. -The following example demonstrates how this can be achieved using +The following example demonstrates how this can be achieved using two data sources. Union example: @@ -704,15 +704,15 @@ Union example: ------------- ``hash_join`` operation provides the relational algebra operation, join using hash-based -algorithm. :class:`HashJoinNodeOptions` contains the options required in -defining a join. The hash_join supports +algorithm. :class:`HashJoinNodeOptions` contains the options required in +defining a join. The hash_join supports `left/right/full semi/anti/outerjoins -`_. +`_. Also the join-key (i.e. the column(s) to join on), and suffixes (i.e a suffix term like "_x" -which can be appended as a suffix for column names duplicated in both left and right -relations.) can be set via the join options. +which can be appended as a suffix for column names duplicated in both left and right +relations.) can be set via the join options. `Read more on hash-joins -`_. +`_. Hash-Join example: @@ -726,7 +726,7 @@ Hash-Join example: Summary ======= -There are examples of these nodes which can be found in +There are examples of these nodes which can be found in ``cpp/examples/arrow/execution_plan_documentation_examples.cc`` in the Arrow source. Complete Example: diff --git a/docs/source/cpp/api/scalar.rst b/docs/source/cpp/api/scalar.rst index 04e78450d7744..be9f9686bf110 100644 --- a/docs/source/cpp/api/scalar.rst +++ b/docs/source/cpp/api/scalar.rst @@ -44,4 +44,4 @@ Utilities .. doxygenclass:: arrow::ScalarVisitor :project: arrow_cpp :members: - :undoc-members: \ No newline at end of file + :undoc-members: diff --git a/docs/source/cpp/build_system.rst b/docs/source/cpp/build_system.rst index 60df117eb510e..0c94d7e5ce5dc 100644 --- a/docs/source/cpp/build_system.rst +++ b/docs/source/cpp/build_system.rst @@ -47,7 +47,7 @@ file into an executable linked with the Arrow C++ shared library: .. 
code-block:: cmake cmake_minimum_required(VERSION 3.16) - + project(MyExample) find_package(Arrow REQUIRED) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index e7310d2c0c711..546b6e5716df7 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -49,8 +49,8 @@ Computation inputs are represented as a general :class:`Datum` class, which is a tagged union of several shapes of data such as :class:`Scalar`, :class:`Array` and :class:`ChunkedArray`. Many compute functions support both array (chunked or not) and scalar inputs, however some will mandate -particular input types. For example, while ``array_sort_indices`` requires its -first and only input to be an array, the generalized ``sort_indices`` +particular input types. For example, while ``array_sort_indices`` requires its +first and only input to be an array, the generalized ``sort_indices`` function accepts an array, chunked array, record batch or table. .. _invoking-compute-functions: @@ -572,28 +572,28 @@ representation based on the rounding criterion. | trunc | Unary | Numeric | Float32/Float64/Decimal | | | +-------------------+------------+-------------+-------------------------+----------------------------------+--------+ -* \(1) By default rounding functions change a value to the nearest - integer using HALF_TO_EVEN to resolve ties. Options are available to control - the rounding criterion. All ``round`` functions have the +* \(1) By default rounding functions change a value to the nearest + integer using HALF_TO_EVEN to resolve ties. Options are available to control + the rounding criterion. All ``round`` functions have the ``round_mode`` option to set the rounding mode. * \(2) Round to a number of digits where the ``ndigits`` option of :struct:`RoundOptions` specifies the rounding precision in terms of number of digits. A negative value corresponds to digits in the non-fractional part. For example, -2 corresponds to rounding to the nearest multiple of 100 (zeroing the ones and tens digits). Default value of ``ndigits`` is 0 - which rounds to the nearest integer. For integer inputs a non-negative + which rounds to the nearest integer. For integer inputs a non-negative ``ndigits`` value is ignored and the input is returned unchanged. For integer - inputs, if ``-ndigits`` is larger than the maximum number of digits the + inputs, if ``-ndigits`` is larger than the maximum number of digits the input type can hold, an error is returned. * \(3) Round to a multiple where the ``multiple`` option of :struct:`RoundToMultipleOptions` specifies the rounding scale. The rounding - multiple has to be a positive value and can be casted to input type. - For example, 100 corresponds to rounding to the nearest multiple of 100 - (zeroing the ones and tens digits). Default value of ``multiple`` is 1 which + multiple has to be a positive value and can be casted to input type. + For example, 100 corresponds to rounding to the nearest multiple of 100 + (zeroing the ones and tens digits). Default value of ``multiple`` is 1 which rounds to the nearest integer. * \(4) Round the first input to multiple of the second input. The rounding - multiple has to be a positive value and can be casted to the first input type. - For example, 100 corresponds to rounding to the nearest multiple of 100 + multiple has to be a positive value and can be casted to the first input type. + For example, 100 corresponds to rounding to the nearest multiple of 100 (zeroing the ones and tens digits). 
For ``round`` functions, the following rounding modes are available. @@ -634,8 +634,8 @@ The example values are given for default values of ``ndigits`` and ``multiple``. | | | -3.5 -> -3, -4.5 -> -5 | +-----------------------+--------------------------------------------------------------+---------------------------+ -The following table gives examples of how ``ndigits`` (for the ``round`` -and ``round_binary`` functions) and ``multiple`` (for ``round_to_multiple``) +The following table gives examples of how ``ndigits`` (for the ``round`` +and ``round_binary`` functions) and ``multiple`` (for ``round_to_multiple``) influence the operation performed, respectively. +--------------------+-------------------+---------------------------+ @@ -1621,12 +1621,12 @@ Array-wise ("vector") functions Cumulative Functions ~~~~~~~~~~~~~~~~~~~~ -Cumulative functions are vector functions that perform a running accumulation on -their input using a given binary associative operation with an identity element -(a monoid) and output an array containing the corresponding intermediate running -values. The input is expected to be of numeric type. By default these functions -do not detect overflow. They are also available in an overflow-checking variant, -suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when +Cumulative functions are vector functions that perform a running accumulation on +their input using a given binary associative operation with an identity element +(a monoid) and output an array containing the corresponding intermediate running +values. The input is expected to be of numeric type. By default these functions +do not detect overflow. They are also available in an overflow-checking variant, +suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when overflow is detected. +-------------------------+-------+-------------+-------------+--------------------------------+-----------+ @@ -1649,8 +1649,8 @@ overflow is detected. * \(1) CumulativeOptions has two optional parameters. The first parameter :member:`CumulativeOptions::start` is a starting value for the running - accumulation. It has a default value of 0 for `sum`, 1 for `prod`, min of - input type for `max`, and max of input type for `min`. Specified values of + accumulation. It has a default value of 0 for `sum`, 1 for `prod`, min of + input type for `max`, and max of input type for `min`. Specified values of ``start`` must be castable to the input type. The second parameter :member:`CumulativeOptions::skip_nulls` is a boolean. When set to false (the default), the first encountered null is propagated. When set to @@ -1861,9 +1861,9 @@ replaced, based on the remaining inputs. Pairwise functions ~~~~~~~~~~~~~~~~~~~~ -Pairwise functions are unary vector functions that perform a binary operation on +Pairwise functions are unary vector functions that perform a binary operation on a pair of elements in the input array, typically on adjacent elements. The n-th -output is computed by applying the binary operation to the n-th and (n-p)-th inputs, +output is computed by applying the binary operation to the n-th and (n-p)-th inputs, where p is the period. The default period is 1, in which case the binary operation is applied to adjacent pairs of inputs. The period can also be negative, in which case the n-th output is computed by applying the binary @@ -1877,9 +1877,9 @@ operation to the n-th and (n+abs(p))-th inputs. 
| pairwise_diff_checked | Unary | Numeric/Temporal | Numeric/Temporal | :struct:`PairwiseOptions` | \(1)(3) | +------------------------+-------+----------------------+----------------------+--------------------------------+----------+ -* \(1) Computes the first order difference of an array, It internally calls - the scalar function ``Subtract`` (or the checked variant) to compute - differences, so its behavior and supported types are the same as - ``Subtract``. The period can be specified in :struct:`PairwiseOptions`. +* \(1) Computes the first order difference of an array, It internally calls + the scalar function ``Subtract`` (or the checked variant) to compute + differences, so its behavior and supported types are the same as + ``Subtract``. The period can be specified in :struct:`PairwiseOptions`. * \(2) Wraps around the result when overflow is detected. * \(3) Returns an ``Invalid`` :class:`Status` when overflow is detected. diff --git a/docs/source/cpp/dataset.rst b/docs/source/cpp/dataset.rst index 1f5d0476c2889..a64b73b61c05d 100644 --- a/docs/source/cpp/dataset.rst +++ b/docs/source/cpp/dataset.rst @@ -378,28 +378,28 @@ Partitioning performance considerations Partitioning datasets has two aspects that affect performance: it increases the number of files and it creates a directory structure around the files. Both of these have benefits -as well as costs. Depending on the configuration and the size of your dataset, the costs -can outweigh the benefits. +as well as costs. Depending on the configuration and the size of your dataset, the costs +can outweigh the benefits. -Because partitions split up the dataset into multiple files, partitioned datasets can be -read and written with parallelism. However, each additional file adds a little overhead in -processing for filesystem interaction. It also increases the overall dataset size since +Because partitions split up the dataset into multiple files, partitioned datasets can be +read and written with parallelism. However, each additional file adds a little overhead in +processing for filesystem interaction. It also increases the overall dataset size since each file has some shared metadata. For example, each parquet file contains the schema and -group-level statistics. The number of partitions is a floor for the number of files. If -you partition a dataset by date with a year of data, you will have at least 365 files. If -you further partition by another dimension with 1,000 unique values, you will have up to +group-level statistics. The number of partitions is a floor for the number of files. If +you partition a dataset by date with a year of data, you will have at least 365 files. If +you further partition by another dimension with 1,000 unique values, you will have up to 365,000 files. This fine of partitioning often leads to small files that mostly consist of metadata. -Partitioned datasets create nested folder structures, and those allow us to prune which +Partitioned datasets create nested folder structures, and those allow us to prune which files are loaded in a scan. However, this adds overhead to discovering files in the dataset, as we'll need to recursively "list directory" to find the data files. Too fine partitions can cause problems here: Partitioning a dataset by date for a years worth -of data will require 365 list calls to find all the files; adding another column with +of data will require 365 list calls to find all the files; adding another column with cardinality 1,000 will make that 365,365 calls. 
The most optimal partitioning layout will depend on your data, access patterns, and which -systems will be reading the data. Most systems, including Arrow, should work across a +systems will be reading the data. Most systems, including Arrow, should work across a range of file sizes and partitioning layouts, but there are extremes you should avoid. These guidelines can help avoid some known worst cases: diff --git a/docs/source/cpp/datatypes.rst b/docs/source/cpp/datatypes.rst index 4e1fe76b4d6f2..7eb70936f4e1d 100644 --- a/docs/source/cpp/datatypes.rst +++ b/docs/source/cpp/datatypes.rst @@ -72,8 +72,8 @@ To instantiate data types, it is recommended to call the provided Type Traits ----------- -Writing code that can handle concrete :class:`arrow::DataType` subclasses would -be verbose, if it weren't for type traits. Arrow's type traits map the Arrow +Writing code that can handle concrete :class:`arrow::DataType` subclasses would +be verbose, if it weren't for type traits. Arrow's type traits map the Arrow data types to the specialized array, scalar, builder, and other associated types. For example, the Boolean type has traits: @@ -96,7 +96,7 @@ For example, the Boolean type has traits: See the :ref:`type-traits` for an explanation of each of these fields. Using type traits, one can write template functions that can handle a variety -of Arrow types. For example, to write a function that creates an array of +of Arrow types. For example, to write a function that creates an array of Fibonacci values for any Arrow numeric type: .. code-block:: cpp @@ -128,7 +128,7 @@ For some common cases, there are type associations on the classes themselves. Us Similar to the type traits provided in `std::type_traits `_, -Arrow provides type predicates such as ``is_number_type`` as well as +Arrow provides type predicates such as ``is_number_type`` as well as corresponding templates that wrap ``std::enable_if_t`` such as ``enable_if_number``. These can constrain template functions to only compile for relevant types, which is useful if other overloads need to be implemented. 
For example, to write a sum @@ -176,20 +176,20 @@ here is how one might sum across columns of arbitrary numeric types: class TableSummation { double partial = 0.0; public: - + arrow::Result Compute(std::shared_ptr batch) { for (std::shared_ptr array : batch->columns()) { ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array, this)); } return partial; } - + // Default implementation arrow::Status Visit(const arrow::Array& array) { return arrow::Status::NotImplemented("Cannot compute sum for array of type ", array.type()->ToString()); } - + template arrow::enable_if_number Visit(const ArrayType& array) { for (std::optional value : array) { diff --git a/docs/source/cpp/examples/compute_and_write_example.rst b/docs/source/cpp/examples/compute_and_write_example.rst index e66d3ced55d0c..a4b619f7ffff3 100644 --- a/docs/source/cpp/examples/compute_and_write_example.rst +++ b/docs/source/cpp/examples/compute_and_write_example.rst @@ -21,8 +21,8 @@ Compute and Write CSV Example ============================= -The file ``cpp/examples/arrow/compute_and_write_csv_example.cc`` located inside -the source tree contains an example of creating a table of two numerical columns -and then comparing the magnitudes of the entries in the columns and writing out to +The file ``cpp/examples/arrow/compute_and_write_csv_example.cc`` located inside +the source tree contains an example of creating a table of two numerical columns +and then comparing the magnitudes of the entries in the columns and writing out to a CSV file with the column entries and their comparisons. The code in the example is documented. diff --git a/docs/source/cpp/flight.rst b/docs/source/cpp/flight.rst index e07a84e91ee4f..a1e9420bfd34e 100644 --- a/docs/source/cpp/flight.rst +++ b/docs/source/cpp/flight.rst @@ -350,10 +350,10 @@ Closing unresponsive connections calls Cancel() on a timer, with the main thread resetting the timer every time an operation completes successfully. For a fully-worked out example, see the Cookbook. - + .. note:: There is a long standing ticket for a per-write/per-read timeout instead of a per call timeout (ARROW-6062_), but this is not (easily) - possible to implement with the blocking gRPC API. + possible to implement with the blocking gRPC API. .. _best gRPC practices: https://grpc.io/docs/guides/performance/#general .. _gRPC keys: https://grpc.github.io/grpc/cpp/group__grpc__arg__keys.html diff --git a/docs/source/cpp/gandiva.rst b/docs/source/cpp/gandiva.rst index 07b07bee7ac4e..f60d1fc8ac8d9 100644 --- a/docs/source/cpp/gandiva.rst +++ b/docs/source/cpp/gandiva.rst @@ -29,8 +29,8 @@ Gandiva only handles projections and filters; for other transformations, see :ref:`Compute Functions `. Gandiva was designed to take advantage of the Arrow memory format and modern -hardware. From the Arrow memory model, since Arrow arrays have separate buffers for values and -validity bitmaps, values and their null status can often be processed +hardware. From the Arrow memory model, since Arrow arrays have separate buffers for values and +validity bitmaps, values and their null status can often be processed independently, allowing for better instruction pipelining. On modern hardware, compiling expressions using LLVM allows the execution to be optimized to the local runtime environment and hardware, including available SIMD @@ -42,25 +42,25 @@ pre-compiled into LLVM IR (intermediate representation). 
Expression, Projector and Filter ================================ -To effectively utilize Gandiva, you will construct expression trees with ``TreeExprBuilder``, -including the creation of function nodes, if-else logic, and boolean expressions. +To effectively utilize Gandiva, you will construct expression trees with ``TreeExprBuilder``, +including the creation of function nodes, if-else logic, and boolean expressions. Subsequently, leverage ``Projector`` or ``Filter`` execution kernels to efficiently evaluate these expressions. -See :doc:`./gandiva/expr_projector_filter` for more details. +See :doc:`./gandiva/expr_projector_filter` for more details. External Functions Development ============================== -Gandiva offers the capability of integrating external functions, encompassing -both C functions and IR functions. This feature broadens the spectrum of -functions that can be applied within Gandiva expressions. For developers -looking to customize and enhance their computational solutions, -Gandiva provides the opportunity to develop and register their own external -functions, thus allowing for a more tailored and flexible use of the Gandiva +Gandiva offers the capability of integrating external functions, encompassing +both C functions and IR functions. This feature broadens the spectrum of +functions that can be applied within Gandiva expressions. For developers +looking to customize and enhance their computational solutions, +Gandiva provides the opportunity to develop and register their own external +functions, thus allowing for a more tailored and flexible use of the Gandiva environment. -See :doc:`./gandiva/external_func` for more details. +See :doc:`./gandiva/external_func` for more details. .. toctree:: :maxdepth: 2 gandiva/expr_projector_filter - gandiva/external_func \ No newline at end of file + gandiva/external_func diff --git a/docs/source/cpp/gandiva/expr_projector_filter.rst b/docs/source/cpp/gandiva/expr_projector_filter.rst index c960d1d869fe5..9d58b185032e3 100644 --- a/docs/source/cpp/gandiva/expr_projector_filter.rst +++ b/docs/source/cpp/gandiva/expr_projector_filter.rst @@ -30,7 +30,7 @@ literal values, created by :func:`TreeExprBuilder::MakeLiteral`. Nodes can be combined into more complex expression trees using: * :func:`TreeExprBuilder::MakeFunction` to create a function - node. (You can call :func:`GetRegisteredFunctionSignatures` to + node. (You can call :func:`GetRegisteredFunctionSignatures` to get a list of valid function signatures.) * :func:`TreeExprBuilder::MakeIf` to create if-else logic. * :func:`TreeExprBuilder::MakeAnd` and :func:`TreeExprBuilder::MakeOr` @@ -39,7 +39,7 @@ can be combined into more complex expression trees using: functions to create set membership tests. Each of these functions create new composite nodes, which contain the leaf nodes -(literals and field references) or other composite nodes as children. By +(literals and field references) or other composite nodes as children. By composing these, you can create arbitrarily complex expression trees. Once an expression tree is built, they are wrapped in either :class:`Expression` @@ -84,7 +84,7 @@ reused to process distinct record batches in parallel. Evaluating projections ---------------------- -Execution is performed with :func:`Projector::Evaluate`. This outputs +Execution is performed with :func:`Projector::Evaluate`. This outputs a vector of arrays, which can be passed along with the output schema to :func:`arrow::RecordBatch::Make()`. 
@@ -99,14 +99,14 @@ Evaluating filters :func:`Filter::Evaluate` produces :class:`SelectionVector`, a vector of row indices that matched the filter condition. The selection vector -is a wrapper around an arrow integer array, parameterized by bitwidth. When -creating the selection vector (you must initialize it *before* passing to -``Evaluate()``), you must choose the bitwidth, which determines the max index +is a wrapper around an arrow integer array, parameterized by bitwidth. When +creating the selection vector (you must initialize it *before* passing to +``Evaluate()``), you must choose the bitwidth, which determines the max index value it can hold, and the max number of slots, which determines how many indices -it may contain. In general, the max number of slots should be set to your batch -size and the bitwidth the smallest integer size that can represent all integers -less than the batch size. For example, if your batch size is 100k, set the -maximum number of slots to 100k and the bitwidth to 32 (since 2^16 = 64k which +it may contain. In general, the max number of slots should be set to your batch +size and the bitwidth the smallest integer size that can represent all integers +less than the batch size. For example, if your batch size is 100k, set the +maximum number of slots to 100k and the bitwidth to 32 (since 2^16 = 64k which would be too small). Once ``Evaluate()`` has been run and the :class:`SelectionVector` is @@ -123,10 +123,10 @@ output record batch. Evaluating projections and filters ---------------------------------- -Finally, you can also project while apply a selection vector, with +Finally, you can also project while apply a selection vector, with :func:`Projector::Evaluate()`. To do so, first make sure to initialize the :class:`Projector` with :func:`SelectionVector::GetMode()` so that the projector -compiles with the correct bitwidth. Then you can pass the +compiles with the correct bitwidth. Then you can pass the :class:`SelectionVector` into the :func:`Projector::Evaluate()` method. @@ -134,4 +134,4 @@ compiles with the correct bitwidth. Then you can pass the :language: cpp :start-after: (Doc section: Evaluate filter and projection) :end-before: (Doc section: Evaluate filter and projection) - :dedent: 2 \ No newline at end of file + :dedent: 2 diff --git a/docs/source/cpp/gandiva/external_func.rst b/docs/source/cpp/gandiva/external_func.rst index cdd8fc82e59db..f8bdde83d96e6 100644 --- a/docs/source/cpp/gandiva/external_func.rst +++ b/docs/source/cpp/gandiva/external_func.rst @@ -79,7 +79,7 @@ The ``NativeFunction`` class is used to define the metadata for an external func * ``ResultNullableType::kResultNullIfNull``: result validity is an intersection of the validity of the children. * ``ResultNullableType::kResultNullNever``: result is always valid. * ``ResultNullableType::kResultNullInternal``: result validity depends on some internal logic. -* ``pc_name``: The name of the corresponding precompiled function. +* ``pc_name``: The name of the corresponding precompiled function. * Typically, this name follows the convention ``{base_name}`` + ``_{param1_type}`` + ``{param2_type}`` + ... + ``{paramN_type}``. For example, if the base name is ``add`` and the function takes two ``int32`` parameters and returns an ``int32``, the precompiled function name would be ``add_int32_int32``, but this convention is not mandatory as long as you can guarantee its uniqueness. * ``flags``: Optional flags for additional function attributes (default is 0). 
Please check out ``NativeFunction::kNeedsContext``, ``NativeFunction::kNeedsFunctionHolder``, and ``NativeFunction::kCanReturnErrors`` for more details. @@ -153,10 +153,10 @@ Not all Arrow data types are supported in Gandiva. The following table lists the | utf8 (as return type) | int64_t context, | | | const char*, | | | uint32_t* | -| | [see next section]| +| | [see next section]| +-------------------------------------+-------------------+ -Handling arrow::StringType (utf8 type) and arrow::BinaryType +Handling arrow::StringType (utf8 type) and arrow::BinaryType ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Both ``arrow::StringType`` and ``arrow::BinaryType`` are variable-length types. And they are handled similarly in external functions. Since ``arrow::StringType`` (utf8 type) is more commonly used, we will use it below as the example to explain how to handle variable-length types in external functions. @@ -179,7 +179,7 @@ When ``arrow::StringType`` (``utf8`` type) is used as the return type in a funct 2. **Function Parameters:** * **Context Parameter**: The C function should begin with an additional parameter, ``int64_t context``. This parameter is crucial for context management within the function. * **String Length Output Parameter**: The function should also include a ``uint32_t*`` parameter at the end. This output parameter will store the length of the returned string data. -3. **Return Value**: The function should return a ``const char*`` pointer, pointing to the string data. +3. **Return Value**: The function should return a ``const char*`` pointer, pointing to the string data. 4. **Function Implementation:** * **Memory Allocation and Error Messaging:** Within the function's implementation, use ``gdv_fn_context_arena_malloc`` and ``gdv_fn_context_set_error_msg`` for memory allocation and error messaging, respectively. Both functions take ``int64_t context`` as their first parameter, facilitating efficient context utilization. @@ -200,10 +200,10 @@ You can use ``gandiva::FunctionRegistry``'s APIs to register external C function NativeFunction func, void* c_function_ptr, std::optional function_holder_maker = std::nullopt); -The above API allows you to register an external C function. +The above API allows you to register an external C function. -* The ``NativeFunction`` object describes the metadata of the external C function. -* The ``c_function_ptr`` is the function pointer to the external C function's implementation. +* The ``NativeFunction`` object describes the metadata of the external C function. +* The ``c_function_ptr`` is the function pointer to the external C function's implementation. * The optional ``function_holder_maker`` is used to create a function holder for the external C function if the external C function requires a function holder. Check out the ``gandiva::FunctionHolder`` class and its several sub-classes for more details. External IR functions diff --git a/docs/source/cpp/getting_started.rst b/docs/source/cpp/getting_started.rst index 89bd4559ef1e6..2cab5d1581c1c 100644 --- a/docs/source/cpp/getting_started.rst +++ b/docs/source/cpp/getting_started.rst @@ -24,17 +24,17 @@ Getting Started The following articles demonstrate installation, use, and a basic understanding of Arrow. These articles will get you setup quickly using Arrow and give you a taste of what the library is capable of. 
-Specifically, it contains: an installation and linking guide; documentation of conventions used -in the codebase and suggested for users; and tutorials, including: +Specifically, it contains: an installation and linking guide; documentation of conventions used +in the codebase and suggested for users; and tutorials, including: -* Building Arrow arrays and tabular structures +* Building Arrow arrays and tabular structures * Reading and writing Parquet, Arrow, and CSV files * Executing compute kernels on arrays * Reading and writing multi-file partitioned datasets Start here to gain a basic understanding of Arrow, and move on to the :doc:`/cpp/user_guide` to -explore more specific topics and underlying concepts, or the :doc:`/cpp/api` to explore Arrow's -API. +explore more specific topics and underlying concepts, or the :doc:`/cpp/api` to explore Arrow's +API. .. toctree:: @@ -44,5 +44,3 @@ API. tutorials/io_tutorial.rst tutorials/compute_tutorial.rst tutorials/datasets_tutorial.rst - - diff --git a/docs/source/cpp/memory.rst b/docs/source/cpp/memory.rst index ad8276e3728a2..33907b5580f61 100644 --- a/docs/source/cpp/memory.rst +++ b/docs/source/cpp/memory.rst @@ -205,7 +205,7 @@ simply do:: Memory Profiling ================ -On Linux, detailed profiles of memory allocations can be generated using +On Linux, detailed profiles of memory allocations can be generated using ``perf record``, without any need to modify the binaries. These profiles can show the traceback in addition to allocation size. This does require debug symbols, from either a debug build or a release with debug symbols build. @@ -234,14 +234,14 @@ recorded allocations, so we can correlate them with the call to free/de-allocate .. tab-set:: .. tab-item:: jemalloc - + .. code-block:: shell - perf probe -x libarrow.so je_arrow_mallocx '$params' - perf probe -x libarrow.so je_arrow_mallocx%return '$retval' - perf probe -x libarrow.so je_arrow_rallocx '$params' - perf probe -x libarrow.so je_arrow_rallocx%return '$retval' - perf probe -x libarrow.so je_arrow_dallocx '$params' + perf probe -x libarrow.so je_arrow_mallocx '$params' + perf probe -x libarrow.so je_arrow_mallocx%return '$retval' + perf probe -x libarrow.so je_arrow_rallocx '$params' + perf probe -x libarrow.so je_arrow_rallocx%return '$retval' + perf probe -x libarrow.so je_arrow_dallocx '$params' PROBE_ARGS="-e probe_libarrow:je_arrow_mallocx \ -e probe_libarrow:je_arrow_mallocx__return \ -e probe_libarrow:je_arrow_rallocx \ @@ -249,13 +249,13 @@ recorded allocations, so we can correlate them with the call to free/de-allocate -e probe_libarrow:je_arrow_dallocx" .. tab-item:: mimalloc - + .. code-block:: shell - perf probe -x libarrow.so mi_malloc_aligned '$params' - perf probe -x libarrow.so mi_malloc_aligned%return '$retval' - perf probe -x libarrow.so mi_realloc_aligned '$params' - perf probe -x libarrow.so mi_realloc_aligned%return '$retval' + perf probe -x libarrow.so mi_malloc_aligned '$params' + perf probe -x libarrow.so mi_malloc_aligned%return '$retval' + perf probe -x libarrow.so mi_realloc_aligned '$params' + perf probe -x libarrow.so mi_realloc_aligned%return '$retval' perf probe -x libarrow.so mi_free '$params' PROBE_ARGS="-e probe_libarrow:mi_malloc_aligned \ -e probe_libarrow:mi_malloc_aligned__return \ @@ -277,9 +277,9 @@ If you want to profile a running process, you can run ``perf record -p `` and it will record until you interrupt with CTRL+C. Alternatively, you can do ``perf record -P sleep 10`` to record for 10 seconds. 
-The resulting data can be processed with standard tools to work with perf or +The resulting data can be processed with standard tools to work with perf or ``perf script`` can be used to pipe a text format of the data to custom scripts. -The following script parses ``perf script`` output and prints the output in +The following script parses ``perf script`` output and prints the output in new lines delimited JSON for easier processing. .. code-block:: python @@ -354,7 +354,7 @@ Here's an example invocation of that script, with a preview of output data: From there one can answer a number of questions. For example, the following -script will find which allocations were never freed, and print the associated +script will find which allocations were never freed, and print the associated tracebacks along with the count of dangling allocations: .. code-block:: python diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst index 3e06352f5dde3..96897d139b351 100644 --- a/docs/source/cpp/parquet.rst +++ b/docs/source/cpp/parquet.rst @@ -51,8 +51,8 @@ FileReader ---------- To read Parquet data into Arrow structures, use :class:`arrow::FileReader`. -To construct, it requires a :class:`::arrow::io::RandomAccessFile` instance -representing the input file. To read the whole file at once, +To construct, it requires a :class:`::arrow::io::RandomAccessFile` instance +representing the input file. To read the whole file at once, use :func:`arrow::FileReader::ReadTable`: .. literalinclude:: ../../../cpp/examples/arrow/parquet_read_write.cc @@ -67,7 +67,7 @@ Finer-grained options are available through the and :class:`ArrowReaderProperties` classes. For reading as a stream of batches, use the :func:`arrow::FileReader::GetRecordBatchReader` -method to retrieve a :class:`arrow::RecordBatchReader`. It will use the batch +method to retrieve a :class:`arrow::RecordBatchReader`. It will use the batch size set in :class:`ArrowReaderProperties`. .. literalinclude:: ../../../cpp/examples/arrow/parquet_read_write.cc @@ -106,8 +106,8 @@ If memory efficiency is more important than performance, then: #. Turn on ``enable_buffered_stream`` in :class:`parquet::ReaderProperties`. In addition, if you know certain columns contain many repeated values, you can -read them as :term:`dictionary encoded` columns. This is -enabled with the ``set_read_dictionary`` setting on :class:`ArrowReaderProperties`. +read them as :term:`dictionary encoded` columns. This is +enabled with the ``set_read_dictionary`` setting on :class:`ArrowReaderProperties`. If the files were written with Arrow C++ and the ``store_schema`` was activated, then the original Arrow schema will be automatically read and will override this setting. @@ -174,7 +174,7 @@ The :func:`arrow::WriteTable` function writes an entire .. note:: - Column compression is off by default in C++. See :ref:`below ` + Column compression is off by default in C++. See :ref:`below ` for how to choose a compression codec in the writer properties. To write out data batch-by-batch, use :class:`arrow::FileWriter`. @@ -191,9 +191,9 @@ StreamWriter The :class:`StreamWriter` allows for Parquet files to be written using standard C++ output operators, similar to reading with the :class:`StreamReader` -class. This type-safe approach also ensures that rows are written without -omitting fields and allows for new row groups to be created automatically -(after certain volume of data) or explicitly by using the :type:`EndRowGroup` +class. 
This type-safe approach also ensures that rows are written without +omitting fields and allows for new row groups to be created automatically +(after certain volume of data) or explicitly by using the :type:`EndRowGroup` stream modifier. Exceptions are used to signal errors. A :class:`ParquetException` is @@ -266,20 +266,20 @@ group that takes precedent over the ``chunk_size`` passed in the write methods. You can set the version of Parquet to write with ``version``, which determines which logical types are available. In addition, you can set the data page version with ``data_page_version``. It's V1 by default; setting to V2 will allow more -optimal compression (skipping compressing pages where there isn't a space +optimal compression (skipping compressing pages where there isn't a space benefit), but not all readers support this data page version. -Compression is off by default, but to get the most out of Parquet, you should -also choose a compression codec. You can choose one for the whole file or +Compression is off by default, but to get the most out of Parquet, you should +also choose a compression codec. You can choose one for the whole file or choose one for individual columns. If you choose a mix, the file-level option -will apply to columns that don't have a specific compression codec. See +will apply to columns that don't have a specific compression codec. See :class:`::arrow::Compression` for options. -Column data encodings can likewise be applied at the file-level or at the -column level. By default, the writer will attempt to dictionary encode all +Column data encodings can likewise be applied at the file-level or at the +column level. By default, the writer will attempt to dictionary encode all supported columns, unless the dictionary grows too large. This behavior can be changed at file-level or at the column level with ``disable_dictionary()``. -When not using dictionary encoding, it will fallback to the encoding set for +When not using dictionary encoding, it will fallback to the encoding set for the column or the overall file; by default ``Encoding::PLAIN``, but this can be changed with ``encoding()``. @@ -559,7 +559,7 @@ Encryption Parquet C++ implements all features specified in the `encryption specification `__, -except for encryption of column index and bloom filter modules. +except for encryption of column index and bloom filter modules. More specifically, Parquet C++ supports: diff --git a/docs/source/cpp/tables.rst b/docs/source/cpp/tables.rst index b28a9fc1e13a5..d98a2acde6620 100644 --- a/docs/source/cpp/tables.rst +++ b/docs/source/cpp/tables.rst @@ -81,13 +81,13 @@ and computation functions, possibly incremental. :alt: A graphical representation of an Arrow Table and a Record Batch, with structure as described in text above. -Record batches can be sent between implementations, such as via +Record batches can be sent between implementations, such as via :ref:`IPC ` or -via the :doc:`C Data Interface <../format/CDataInterface>`. Tables and +via the :doc:`C Data Interface <../format/CDataInterface>`. Tables and chunked arrays, on the other hand, are concepts in the C++ implementation, not in the Arrow format itself, so they aren't directly portable. -However, a table can be converted to and built from a sequence of record +However, a table can be converted to and built from a sequence of record batches easily without needing to copy the underlying array buffers. 
A table can be streamed as an arbitrary number of record batches using a :class:`arrow::TableBatchReader`. Conversely, a logical sequence of diff --git a/docs/source/cpp/threading.rst b/docs/source/cpp/threading.rst index 24ad25b5a028a..4a1a65ffe012d 100644 --- a/docs/source/cpp/threading.rst +++ b/docs/source/cpp/threading.rst @@ -99,4 +99,4 @@ Arrow C++ uses :class:`arrow::Future` to communicate results between threads. T an :class:`arrow::Future` will be created when an operation needs to perform some kind of long running task that will block for some period of time. :class:`arrow::Future` objects are mainly meant for internal use and any method that returns an -:class:`arrow::Future` will usually have a synchronous variant as well. \ No newline at end of file +:class:`arrow::Future` will usually have a synchronous variant as well. diff --git a/docs/source/cpp/tutorials/compute_tutorial.rst b/docs/source/cpp/tutorials/compute_tutorial.rst index bcb87e6a8f992..a650865d75ce4 100644 --- a/docs/source/cpp/tutorials/compute_tutorial.rst +++ b/docs/source/cpp/tutorials/compute_tutorial.rst @@ -34,7 +34,7 @@ functionality to: 3. Search for a value in a column -Pre-requisites +Pre-requisites --------------- Before continuing, make sure you have: @@ -49,16 +49,16 @@ Setup Before running some computations, we need to fill in a couple gaps: 1. We need to include necessary headers. - + 2. ``A main()`` is needed to glue things together. 3. We need data to play with. - + Includes ^^^^^^^^ -Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's -compute functionality: +Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's +compute functionality: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/compute_example.cc :language: cpp @@ -340,4 +340,4 @@ Refer to the below for a copy of the complete code: :start-after: (Doc section: Compute Example) :end-before: (Doc section: Compute Example) :linenos: - :lineno-match: \ No newline at end of file + :lineno-match: diff --git a/docs/source/cpp/tutorials/datasets_tutorial.rst b/docs/source/cpp/tutorials/datasets_tutorial.rst index 285fc24d8d599..f60e1e52170ae 100644 --- a/docs/source/cpp/tutorials/datasets_tutorial.rst +++ b/docs/source/cpp/tutorials/datasets_tutorial.rst @@ -33,7 +33,7 @@ file on disk. In this article, you will: 2. write out a partitioned dataset from a Table. -Pre-requisites +Pre-requisites --------------- Before continuing, make sure you have: @@ -50,7 +50,7 @@ Setup Before running some computations, we need to fill in a couple gaps: 1. We need to include necessary headers. - + 2. A ``main()`` is needed to glue things together. 3. We need data on disk to play with. @@ -58,8 +58,8 @@ Before running some computations, we need to fill in a couple gaps: Includes ^^^^^^^^ -Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's -compute functionality for each file type we'll work with in this article: +Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's +compute functionality for each file type we'll work with in this article: .. 
literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc :language: cpp @@ -206,7 +206,7 @@ Build Dataset using Factory ^^^^^^^^^^^^^^^^^^^^^^^^^^^ With a :class:`dataset::FileSystemDatasetFactory` set up, we can actually build our -:class:`dataset::Dataset` with :func:`dataset::FileSystemDatasetFactory::Finish`, just +:class:`dataset::Dataset` with :func:`dataset::FileSystemDatasetFactory::Finish`, just like with an :class:`ArrayBuilder` back in the basic tutorial: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc @@ -228,14 +228,14 @@ dataset, and print those out, along with some small info: Move Dataset into Table ^^^^^^^^^^^^^^^^^^^^^^^ -One way we can do something with :class:`Datasets ` is getting -them into a :class:`Table`, where we can do anything we’ve learned we can do to -:class:`Tables ` to that :class:`Table`. +One way we can do something with :class:`Datasets ` is getting +them into a :class:`Table`, where we can do anything we’ve learned we can do to +:class:`Tables
` to that :class:`Table`. .. seealso:: :doc:`/cpp/streaming_execution` for execution that avoids manifesting the entire dataset in memory. -In order to move a :class:`Dataset’s ` contents into a :class:`Table`, -we need a :class:`dataset::Scanner`, which scans the data and outputs it to the :class:`Table`. +In order to move a :class:`Dataset’s ` contents into a :class:`Table`, +we need a :class:`dataset::Scanner`, which scans the data and outputs it to the :class:`Table`. First, we get a :class:`dataset::ScannerBuilder` from the :class:`dataset::Dataset`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc @@ -305,7 +305,7 @@ Create Scanner for Moving Table Data The process for writing a :class:`dataset::Dataset`, once a source of data is available, is similar to the reverse of reading it. Before, we used a :class:`dataset::Scanner` in order to scan into a :class:`Table` – now, we need one to read out of our -:class:`TableBatchReader`. To get that :class:`dataset::Scanner`, we’ll make a :class:`dataset::ScannerBuilder` +:class:`TableBatchReader`. To get that :class:`dataset::Scanner`, we’ll make a :class:`dataset::ScannerBuilder` based on our :class:`TableBatchReader`, then use that Builder to build a :class:`dataset::Scanner`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc @@ -343,7 +343,7 @@ Arrow, so we’ll write back out to that: :start-after: (Doc section: Write Format) :end-before: (Doc section: Write Format) -Configure FileSystemDatasetWriteOptions +Configure FileSystemDatasetWriteOptions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In order to write to disk, we need some configuration. We’ll do so via @@ -435,11 +435,11 @@ tutorials. With that, you’ve read and written partitioned datasets! This method, with some configuration, will work for any supported dataset format. For an example of such a dataset, the NYC Taxi dataset is a well-known -one, which you can find `here `_. +one, which you can find `here `_. Now you can get larger-than-memory data mapped for use! Which means that now we have to be able to process this data without -pulling it all into memory at once. For this, try Acero. +pulling it all into memory at once. For this, try Acero. .. seealso:: :doc:`/cpp/streaming_execution` for more information on Acero. @@ -450,4 +450,4 @@ Refer to the below for a copy of the complete code: :start-after: (Doc section: Dataset Example) :end-before: (Doc section: Dataset Example) :linenos: - :lineno-match: \ No newline at end of file + :lineno-match: diff --git a/docs/source/cpp/tutorials/io_tutorial.rst b/docs/source/cpp/tutorials/io_tutorial.rst index f981c94b83e32..309f10a350aa3 100644 --- a/docs/source/cpp/tutorials/io_tutorial.rst +++ b/docs/source/cpp/tutorials/io_tutorial.rst @@ -33,7 +33,7 @@ the start to end of an application. In this article, you will: 3. Read a Parquet file into a :class:`Table` and write it back out afterwards -Pre-requisites +Pre-requisites --------------- Before continuing, make sure you have: @@ -50,7 +50,7 @@ Setup Before writing out some file I/O, we need to fill in a couple gaps: 1. We need to include necessary headers. - + 2. A ``main()`` is needed to glue things together. 3. We need files to play with. @@ -58,8 +58,8 @@ Before writing out some file I/O, we need to fill in a couple gaps: Includes ^^^^^^^^ -Before writing C++ code, we need some includes. 
We'll get ``iostream`` for output, then import Arrow's -I/O functionality for each file type we'll work with in this article: +Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's +I/O functionality for each file type we'll work with in this article: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc :language: cpp @@ -156,8 +156,8 @@ Opening an Arrow file Reader ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ An :class:`io::ReadableFile` is too generic to offer all functionality to read an Arrow file. -We need to use it to get an :class:`ipc::RecordBatchFileReader` object. This object implements -all the logic needed to read an Arrow file with correct formatting. We get one through +We need to use it to get an :class:`ipc::RecordBatchFileReader` object. This object implements +all the logic needed to read an Arrow file with correct formatting. We get one through :func:`ipc::RecordBatchFileReader::Open`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc @@ -294,8 +294,8 @@ Write a CSV File from Table CSV writing to :class:`Table` looks exactly like IPC writing to :class:`RecordBatch`, except with our :class:`Table`, and using :func:`ipc::RecordBatchWriter::WriteTable` instead of -:func:`ipc::RecordBatchWriter::WriteRecordBatch`. Note that the same writer class is used -- -we're writing with :func:`ipc::RecordBatchWriter::WriteTable` because we have a :class:`Table`. We’ll target +:func:`ipc::RecordBatchWriter::WriteRecordBatch`. Note that the same writer class is used -- +we're writing with :func:`ipc::RecordBatchWriter::WriteTable` because we have a :class:`Table`. We’ll target a file, use our :class:`Table’s
` :class:`Schema`, and then write the :class:`Table`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc @@ -358,7 +358,7 @@ even though we used :func:`io::ReadableFile::Open`. Note that we pass our Reading a Parquet File to Table ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -With a prepared :class:`parquet::arrow::FileReader` in hand, we can read to a +With a prepared :class:`parquet::arrow::FileReader` in hand, we can read to a :class:`Table`, except we must pass the :class:`Table` by reference instead of outputting to it: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc @@ -401,4 +401,4 @@ Refer to the below for a copy of the complete code: :start-after: (Doc section: File I/O) :end-before: (Doc section: File I/O) :linenos: - :lineno-match: \ No newline at end of file + :lineno-match: diff --git a/docs/source/developers/continuous_integration/index.rst b/docs/source/developers/continuous_integration/index.rst index f988b5ab69d50..cfca14e10e48c 100644 --- a/docs/source/developers/continuous_integration/index.rst +++ b/docs/source/developers/continuous_integration/index.rst @@ -27,4 +27,4 @@ Continuous Integration overview docker archery - crossbow \ No newline at end of file + crossbow diff --git a/docs/source/developers/cpp/building.rst b/docs/source/developers/cpp/building.rst index 5fab745679e93..040a046c5153d 100644 --- a/docs/source/developers/cpp/building.rst +++ b/docs/source/developers/cpp/building.rst @@ -67,7 +67,7 @@ On Alpine Linux: gcc \ ninja \ make - + On Fedora Linux: .. code-block:: shell @@ -99,7 +99,7 @@ On macOS, you can use `Homebrew `_: With `vcpkg `_: .. code-block:: shell - + git clone https://github.com/apache/arrow.git cd arrow vcpkg install \ @@ -362,7 +362,7 @@ boolean flags to ``cmake``. * ``-DARROW_GCS=ON``: Build Arrow with GCS support (requires the GCloud SDK for C++) * ``-DARROW_HDFS=ON``: Arrow integration with libhdfs for accessing the Hadoop Filesystem -* ``-DARROW_JEMALLOC=ON``: Build the Arrow jemalloc-based allocator, on by default +* ``-DARROW_JEMALLOC=ON``: Build the Arrow jemalloc-based allocator, on by default * ``-DARROW_JSON=ON``: JSON reader module * ``-DARROW_MIMALLOC=ON``: Build the Arrow mimalloc-based allocator * ``-DARROW_ORC=ON``: Arrow integration with Apache ORC @@ -375,7 +375,7 @@ boolean flags to ``cmake``. instead. 
* ``-DARROW_S3=ON``: Support for Amazon S3-compatible filesystems * ``-DARROW_SUBSTRAIT=ON``: Build with support for Substrait -* ``-DARROW_WITH_RE2=ON``: Build with support for regular expressions using the re2 +* ``-DARROW_WITH_RE2=ON``: Build with support for regular expressions using the re2 library, on by default and used when ``ARROW_COMPUTE`` or ``ARROW_GANDIVA`` is ``ON`` * ``-DARROW_WITH_UTF8PROC=ON``: Build with support for Unicode properties using the utf8proc library, on by default and used when ``ARROW_COMPUTE`` or ``ARROW_GANDIVA`` @@ -472,7 +472,7 @@ The build system supports a number of third-party dependencies * ``c-ares``: a dependency of gRPC * ``gflags``: for command line utilities (formerly Googleflags) * ``GLOG``: for logging - * ``google_cloud_cpp_storage``: for Google Cloud Storage support, requires + * ``google_cloud_cpp_storage``: for Google Cloud Storage support, requires system cURL and can use the ``BUNDLED`` method described below * ``gRPC``: for remote procedure calls * ``GTest``: Googletest, for testing diff --git a/docs/source/developers/cpp/windows.rst b/docs/source/developers/cpp/windows.rst index 251a45325fe0b..60ac949e81663 100644 --- a/docs/source/developers/cpp/windows.rst +++ b/docs/source/developers/cpp/windows.rst @@ -379,9 +379,9 @@ Downloading the Timezone Database ================================= To run some of the compute unit tests on Windows, the IANA timezone database -and the Windows timezone mapping need to be downloaded first. See +and the Windows timezone mapping need to be downloaded first. See :ref:`download-timezone-database` for download instructions. To set a non-default -path for the timezone database while running the unit tests, set the +path for the timezone database while running the unit tests, set the ``ARROW_TIMEZONE_DATABASE`` environment variable. Replicating Appveyor Builds diff --git a/docs/source/developers/guide/architectural_overview.rst b/docs/source/developers/guide/architectural_overview.rst index 58e05c85f457e..085a814453c84 100644 --- a/docs/source/developers/guide/architectural_overview.rst +++ b/docs/source/developers/guide/architectural_overview.rst @@ -29,8 +29,8 @@ Architectural Overview ********************** -A general overview of Apache Arrow project can be found on the -`front page `_ and in the +A general overview of Apache Arrow project can be found on the +`front page `_ and in the `Apache Arrow Overview `_. You can also have a look at the `Frequently Asked Questions `_. diff --git a/docs/source/developers/guide/communication.rst b/docs/source/developers/guide/communication.rst index a8659f83ac04d..749c94f9419b2 100644 --- a/docs/source/developers/guide/communication.rst +++ b/docs/source/developers/guide/communication.rst @@ -27,7 +27,7 @@ .. _communication: ************* -Communication +Communication ************* **About the contributors** @@ -50,7 +50,7 @@ tags ([C++], [R], [Ruby] etc.) so it gets noticed by the right people. Where to get help 👋 ==================== -For any question you may have or problems you are facing you can write to +For any question you may have or problems you are facing you can write to user or development :ref:`mailing_list` or you can create an issue on :ref:`github`. Also use GitHub to search through the issues, report bugs and create feature requests or proposals. 
diff --git a/docs/source/developers/guide/documentation.rst b/docs/source/developers/guide/documentation.rst index 3bb3bebef5098..8f9d7311e765f 100644 --- a/docs/source/developers/guide/documentation.rst +++ b/docs/source/developers/guide/documentation.rst @@ -49,7 +49,7 @@ documentation itself, you can search for an issue in GitHub. Documentation improvements are also a great way to gain some experience with our submission and review process without -requiring a lot of local development environment setup. +requiring a lot of local development environment setup. .. note:: Many documentation-only changes can be made directly in the @@ -114,4 +114,3 @@ library. Source folder includes: **Cookbooks** have their own repository ``_ and can be separately cloned and built. - diff --git a/docs/source/developers/guide/index.rst b/docs/source/developers/guide/index.rst index 353c8332ff0b5..0ed27a0ddc54e 100644 --- a/docs/source/developers/guide/index.rst +++ b/docs/source/developers/guide/index.rst @@ -83,17 +83,17 @@ of adding a basic feature. the installation of third-party packages, depending on which build options and components you enable. The C++ build guide has suggestions for commonly encountered issues - you can find it - :ref:`here `. + :ref:`here `. Anytime you are stuck, feel free to reach out via appropriate :ref:`communication` channel. - See a short description about the building process of + See a short description about the building process of :ref:`PyArrow or the R package` or go straight to detailed instructions on how to build one of Arrow libraries in the `documentation `_ . - + #. **Run the tests** - + We should run the tests to check if everything is working correctly. For example, you can run the tests from a terminal for Python @@ -155,7 +155,7 @@ There are lots of ways to contribute to the project besides writing code! * Improving the **documentation** is a great way to start contributing! For more information visit :ref:`documentation` section of the guide. -* **Apache Arrow Cookbooks** are a collection of recipes for solving various problems +* **Apache Arrow Cookbooks** are a collection of recipes for solving various problems and completing different tasks using Apache Arrow. They are also a great way to start contributing. For more information visit `How to contribute to Apache Arrow Cookbook `_ diff --git a/docs/source/developers/guide/resources.rst b/docs/source/developers/guide/resources.rst index f350f469af403..b5905af65499b 100644 --- a/docs/source/developers/guide/resources.rst +++ b/docs/source/developers/guide/resources.rst @@ -78,7 +78,7 @@ Reproducible examples: - `Tidyverse: Make a reprex `_ - `Craft Minimal Bug Reports by Matthew Rocklin `_ -Recommended references +Recommended references ---------------------- - Slatkin, Brett, *Effective Python: 90 Specific Ways to Write Better Python*, Addison-Wesley Professional, 2019 diff --git a/docs/source/developers/guide/step_by_step/finding_issues.rst b/docs/source/developers/guide/step_by_step/finding_issues.rst index 390c56a81c73f..a76b15e917e9a 100644 --- a/docs/source/developers/guide/step_by_step/finding_issues.rst +++ b/docs/source/developers/guide/step_by_step/finding_issues.rst @@ -65,7 +65,7 @@ person who triaged the ticket expected it to be. Don't hesitate to write that in the comments. .. note:: - + When you find a GitHub issue you would like to work on, please mention your interest in the comment section of that issue; that way we will know you are working on it. 
diff --git a/docs/source/developers/guide/step_by_step/set_up.rst b/docs/source/developers/guide/step_by_step/set_up.rst index 60b472637badb..9a2177568d6f5 100644 --- a/docs/source/developers/guide/step_by_step/set_up.rst +++ b/docs/source/developers/guide/step_by_step/set_up.rst @@ -60,7 +60,7 @@ a username and password each time you execute a git command. RStudio project and will create a ``.Rproj`` file in the root directory. For this reason it is *highly recommended* to clone the repository using the command line or a Git client. - + Get the source code =================== diff --git a/docs/source/developers/guide/step_by_step/styling.rst b/docs/source/developers/guide/step_by_step/styling.rst index bb428b0b6ab40..c155acb389512 100644 --- a/docs/source/developers/guide/step_by_step/styling.rst +++ b/docs/source/developers/guide/step_by_step/styling.rst @@ -59,4 +59,4 @@ check your code and will stop the commit process, described in the following section, if there are any errors. - `Pre-commit installation instructions `_ -- `Pre-commit hooks `_ \ No newline at end of file +- `Pre-commit hooks `_ diff --git a/docs/source/developers/guide/tutorials/index.rst b/docs/source/developers/guide/tutorials/index.rst index dcefab23230f9..5f44231afc9c2 100644 --- a/docs/source/developers/guide/tutorials/index.rst +++ b/docs/source/developers/guide/tutorials/index.rst @@ -25,4 +25,4 @@ Tutorials :maxdepth: 1 python_tutorial - r_tutorial \ No newline at end of file + r_tutorial diff --git a/docs/source/developers/guide/tutorials/python_tutorial.rst b/docs/source/developers/guide/tutorials/python_tutorial.rst index 7f004160b0e75..c12c4489aee95 100644 --- a/docs/source/developers/guide/tutorials/python_tutorial.rst +++ b/docs/source/developers/guide/tutorials/python_tutorial.rst @@ -137,7 +137,7 @@ function is defined in the ``compute.py`` file. After examining the ``compute.py`` file we can see that together with ``_compute.pyx`` the functions from C++ get wrapped into Python. -We will define the new feature at the end of the ``compute.py`` file. +We will define the new feature at the end of the ``compute.py`` file. Lets run some code in the Python console from ``arrow/python`` directory in order to learn more about ``pc.min_max``. @@ -147,10 +147,10 @@ directory in order to learn more about ``pc.min_max``. $ cd python $ python - Python 3.9.7 (default, Oct 22 2021, 13:24:00) + Python 3.9.7 (default, Oct 22 2021, 13:24:00) [Clang 13.0.0 (clang-1300.0.29.3)] on darwin Type "help", "copyright", "credits" or "license" for more information. - + We have entered into the Python console from the shell and we can do some research: @@ -278,7 +278,7 @@ options for the ``pc.min_max`` function we can finish the work. return pa.scalar([('min-', min_t), ('max+', max_t)], type=ty) .. TODO seealso - .. For more information about the Arrow codebase visit + .. For more information about the Arrow codebase visit .. :ref:``. (link to working on the Arrow codebase section) Adding a test @@ -303,24 +303,24 @@ a specific unit test, pass in the test name to the ``-k`` parameter. .. 
code:: console $ cd python - $ python -m pytest pyarrow/tests/test_compute.py -k test_tutorial_min_max + $ python -m pytest pyarrow/tests/test_compute.py -k test_tutorial_min_max ======================== test session starts ========================== platform darwin -- Python 3.9.7, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 rootdir: /Users/alenkafrim/repos/arrow/python, configfile: setup.cfg plugins: hypothesis-6.24.1, lazy-fixture-0.6.3 - collected 204 items / 203 deselected / 1 selected + collected 204 items / 203 deselected / 1 selected pyarrow/tests/test_compute.py . [100%] ======================== 1 passed, 203 deselected in 0.16s ============ - - $ python -m pytest pyarrow/tests/test_compute.py + + $ python -m pytest pyarrow/tests/test_compute.py ======================== test session starts =========================== platform darwin -- Python 3.9.7, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 rootdir: /Users/alenkafrim/repos/arrow/python, configfile: setup.cfg plugins: hypothesis-6.24.1, lazy-fixture-0.6.3 - collected 204 items + collected 204 items pyarrow/tests/test_compute.py ................................... [ 46%] ................................................. [100%] @@ -339,7 +339,7 @@ utility called `Archery ` to check if code is in line with PEP 8 style guide. .. code:: console - + $ archery lint --python --fix INFO:archery:Running Python formatter (autopep8) INFO:archery:Running Python linter (flake8) @@ -430,7 +430,7 @@ to the branch history): $ git commit -am "Adding a new compute feature for tutorial purposes" [ARROW-14977 170ef85be] Adding a new compute feature for tutorial purposes 2 files changed, 51 insertions(+) - + We can use ``git log`` to check the history of commits: @@ -448,12 +448,12 @@ We can use ``git log`` to check the history of commits: Date: Sun Dec 5 15:19:46 2021 +0900 ARROW-14981: [CI][Docs] Upload built documents - + We can use this in release process instead of building on release manager's local environment. - + Closes #11856 from kou/ci-docs-upload - + Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei ... @@ -478,10 +478,10 @@ called ``origin``. Writing objects: 100% (7/7), 1.19 KiB | 1.19 MiB/s, done. Total 7 (delta 6), reused 0 (delta 0), pack-reused 0 remote: Resolving deltas: 100% (6/6), completed with 6 local objects. - remote: + remote: remote: Create a pull request for 'ARROW-14977' on GitHub by visiting: remote: https://github.com/AlenkaF/arrow/pull/new/ARROW-14977 - remote: + remote: To https://github.com/AlenkaF/arrow.git * [new branch] ARROW-14977 -> ARROW-14977 @@ -490,7 +490,7 @@ to create a Pull Request. On the GitHub Arrow page (main or forked) we will see a yellow notice bar with a note that we made recent pushes to the branch ARROW-14977. That’s great, now we can make the Pull Request -by clicking on **Compare & pull request**. +by clicking on **Compare & pull request**. .. figure:: ../../images/python_tutorial_github_pr_notice.jpeg :scale: 50 % @@ -527,5 +527,5 @@ the code, comment, resolve conversations and so on. The Pull Request we made can be viewed `here `_. .. seealso:: - + For more information about Pull Request workflow see :ref:`pr_lifecycle`. 
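As a quick, standalone illustration of the compute function the Python tutorial above wraps (this snippet is not part of the patch and assumes any recent ``pyarrow``), ``pc.min_max`` can be exercised directly:

.. code-block:: python

   import pyarrow as pa
   import pyarrow.compute as pc

   arr = pa.array([4, 1, 7, None, 3])

   # min_max returns a StructScalar with "min" and "max" fields;
   # nulls are skipped by default.
   result = pc.min_max(arr)
   print(result["min"].as_py(), result["max"].as_py())  # 1 7

The wrapper added in the tutorial post-processes this same output before returning it to the user.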
diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index c059ff676efb2..82053e901186c 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -350,7 +350,7 @@ Arrow repository, and update the following settings: * To enable debugging JNI-based modules like ``dataset``, activate specific profiles in the Maven tab under "Profiles". Ensure the profiles ``arrow-c-data``, ``arrow-jni``, ``generate-libs-cdata-all-os``, - ``generate-libs-jni-macos-linux``, and ``jdk11+`` are enabled, so that the + ``generate-libs-jni-macos-linux``, and ``jdk11+`` are enabled, so that the IDE can build them and enable debugging. You may not need to update all of these settings if you build/test with the diff --git a/docs/source/developers/overview.rst b/docs/source/developers/overview.rst index c7bc4273313bc..5a18b1e4eb8db 100644 --- a/docs/source/developers/overview.rst +++ b/docs/source/developers/overview.rst @@ -75,7 +75,7 @@ checklist for using ``git``: locally, for example if additional commits have been made by a colleague. By using ``--force-with-lease`` instead of ``--force``, you ensure those commits are not overwritten and can fetch those changes if desired. - + .. dropdown:: Setting rebase to be default :animate: fade-in-slide-down :class-container: sd-shadow-none @@ -202,4 +202,3 @@ Implementations that do not intend to implement cross endian support: For other libraries, a discussion to gather consensus on the mailing-list should be had before submitting PRs. - diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index e7431ce0fb7b9..0b3a83dc5aabe 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -80,10 +80,10 @@ Ensure local tags are removed, gpg-agent is set and JIRA tickets are correctly a # Delete the local tag for RC1 or later git tag -d apache-arrow- - + # Setup gpg agent for signing artifacts source dev/release/setup-gpg-agent.sh - + # Curate the release # The end of the generated report shows the JIRA tickets with wrong version number assigned. archery release curate @@ -180,7 +180,7 @@ Create the Release Candidate branch from the updated maintenance branch # Start from the updated maintenance branch. 
git checkout maint-X.Y.Z - + # The following script will create a branch for the Release Candidate, # place the necessary commits updating the version number and then create a git tag # on OSX use gnu-sed with homebrew: brew install gnu-sed (and export to $PATH) @@ -188,7 +188,7 @@ Create the Release Candidate branch from the updated maintenance branch # starts at 0 and increments every time the Release Candidate is burned # so for the first RC this would be: dev/release/01-prepare.sh 4.0.0 5.0.0 0 dev/release/01-prepare.sh - + # Push the release tag (for RC1 or later the --force flag is required) git push -u apache apache-arrow- # Push the release candidate branch in order to trigger verification jobs later @@ -201,23 +201,23 @@ Build source and binaries and submit them # Build the source release tarball and create Pull Request with verification tasks dev/release/02-source.sh - + # Submit binary tasks using crossbow, the command will output the crossbow build id dev/release/03-binary-submit.sh - + # Wait for the crossbow jobs to finish archery crossbow status - + # Download the produced binaries # This will download packages to a directory called packages/release--rc dev/release/04-binary-download.sh - + # Sign and upload the binaries # # On macOS the only way I could get this to work was running "echo "UPDATESTARTUPTTY" | gpg-connect-agent" before running this comment # otherwise I got errors referencing "ioctl" errors. dev/release/05-binary-upload.sh - + # Sign and upload the Java artifacts # # Note that you need to press the "Close" button manually by Web interface diff --git a/docs/source/developers/release_verification.rst b/docs/source/developers/release_verification.rst index 8c301b44a3c42..afd220db6010d 100644 --- a/docs/source/developers/release_verification.rst +++ b/docs/source/developers/release_verification.rst @@ -55,7 +55,7 @@ and test the result on their own platform in order to cast a +1 vote. # this will create and automatically clean up a temporary directory for the verification environment and will run the source verification TEST_DEFAULT=0 TEST_SOURCE=1 verify-release-candidate.sh $VERSION $RC_NUM - + # to verify only certain implementations use the TEST_DEFAULT=0 and TEST_* variables # here are a couple of examples, but see the source code for the available options TEST_DEFAULT=0 TEST_CPP=1 verify-release-candidate.sh $VERSION $RC_NUM # only C++ tests diff --git a/docs/source/developers/reviewing.rst b/docs/source/developers/reviewing.rst index b6e0c1f4023bd..1550d6aa7ce61 100644 --- a/docs/source/developers/reviewing.rst +++ b/docs/source/developers/reviewing.rst @@ -260,14 +260,14 @@ Social aspects Labelling ========= -While reviewing PRs, we should try to identify whether the corresponding issue +While reviewing PRs, we should try to identify whether the corresponding issue needs to be marked with one or both of the following issue labels: * **Critical Fix**: The change fixes either: (a) a security vulnerability; (b) a bug that causes incorrect or invalid data to be produced; or (c) a bug that causes a crash (while the API contract is upheld). This is intended to mark fixes to issues that may affect users without their - knowledge. For this reason, fixing bugs that cause errors don't count, since + knowledge. For this reason, fixing bugs that cause errors don't count, since those bugs are usually obvious. Bugs that cause crashes are considered critical because they are a possible vector of Denial-of-Service attacks. 
* **Breaking Change**: The change breaks backwards compatibility in a public API. @@ -275,7 +275,7 @@ needs to be marked with one or both of the following issue labels: compatibility, except for the few places where we do guarantee ABI compatibility (such as C Data Interface). Experimental APIs are *not* exempt from this; they are just more likely to be associated with this tag. - + Breaking changes and critical fixes are separate: breaking changes alter the API contract, while critical fixes make the implementation align with the existing API contract. For example, fixing a bug that caused a Parquet reader diff --git a/docs/source/format/CDataInterface/PyCapsuleInterface.rst b/docs/source/format/CDataInterface/PyCapsuleInterface.rst index 03095aa2e9356..67f77f53f012b 100644 --- a/docs/source/format/CDataInterface/PyCapsuleInterface.rst +++ b/docs/source/format/CDataInterface/PyCapsuleInterface.rst @@ -64,7 +64,7 @@ structures should be wrapped in capsules. Capsules avoid invalid access by attaching a name to the pointer and avoid memory leaks by attaching a destructor. Thus, they are much safer than passing pointers as integers. -`PyCapsule`_ allows for a ``name`` to be associated with the capsule, allowing +`PyCapsule`_ allows for a ``name`` to be associated with the capsule, allowing consumers to verify that the capsule contains the expected kind of data. To make sure Arrow structures are recognized, the following names must be used: @@ -133,8 +133,8 @@ Arrays and record batches (contiguous tables) can implement the method Export the object as a pair of ArrowSchema and ArrowArray structures. - :param requested_schema: A PyCapsule containing a C ArrowSchema representation - of a requested schema. Conversion to this schema is best-effort. See + :param requested_schema: A PyCapsule containing a C ArrowSchema representation + of a requested schema. Conversion to this schema is best-effort. See `Schema Requests`_. :type requested_schema: PyCapsule or None @@ -152,8 +152,8 @@ Tables / DataFrames and streams can implement the method ``__arrow_c_stream__``. Export the object as an ArrowArrayStream. - :param requested_schema: A PyCapsule containing a C ArrowSchema representation - of a requested schema. Conversion to this schema is best-effort. See + :param requested_schema: A PyCapsule containing a C ArrowSchema representation + of a requested schema. Conversion to this schema is best-effort. See `Schema Requests`_. :type requested_schema: PyCapsule or None @@ -192,7 +192,7 @@ schema transformations. Protocol Typehints ------------------ -The following typehints can be copied into your library to annotate that a +The following typehints can be copied into your library to annotate that a function accepts an object implementing one of these protocols. .. code-block:: python @@ -248,7 +248,7 @@ Below is the code to create a PyCapsule for an ``ArrowSchema``. The code for } free(schema); } - + PyObject* ExportArrowSchemaPyCapsule() { struct ArrowSchema* schema = (struct ArrowSchema*)malloc(sizeof(struct ArrowSchema)); @@ -270,9 +270,9 @@ Below is the code to create a PyCapsule for an ``ArrowSchema``. The code for ) if schema.release != NULL: schema.release(schema) - + free(schema) - + cdef object export_arrow_schema_py_capsule(): cdef ArrowSchema* schema = malloc(sizeof(ArrowSchema)) # It's recommended to immediately wrap the struct in a capsule, so @@ -305,7 +305,7 @@ code for ``ArrowArray`` and ``ArrowArrayStream`` is similar. .. 
code-block:: c #include - + // If the capsule is not an ArrowSchema, will return NULL and set an exception. struct ArrowSchema* GetArrowSchemaPyCapsule(PyObject* capsule) { return PyCapsule_GetPointer(capsule, "arrow_schema"); @@ -316,7 +316,7 @@ code for ``ArrowArray`` and ``ArrowArrayStream`` is similar. .. code-block:: cython cimport cpython - + cdef ArrowSchema* get_arrow_schema_py_capsule(object capsule) except NULL: return cpython.PyCapsule_GetPointer(capsule, 'arrow_schema') @@ -429,7 +429,7 @@ implementing the DataFrame Interchange Protocol. Comparison to ``__arrow_array__`` protocol ------------------------------------------ -The :ref:`arrow_array_protocol` protocol is a dunder method that +The :ref:`arrow_array_protocol` protocol is a dunder method that defines how PyArrow should import an object as an Arrow array. Unlike this protocol, it is specific to PyArrow and isn't used by other libraries. It is -also limited to arrays and does not support schemas, tabular structures, or streams. \ No newline at end of file +also limited to arrays and does not support schemas, tabular structures, or streams. diff --git a/docs/source/format/Glossary.rst b/docs/source/format/Glossary.rst index 3f2f118a95d6d..11c19c5fa70e9 100644 --- a/docs/source/format/Glossary.rst +++ b/docs/source/format/Glossary.rst @@ -211,7 +211,7 @@ Glossary its bindings, and Go). .. image:: ../cpp/tables-versus-record-batches.svg - :alt: A graphical representation of an Arrow Table and a + :alt: A graphical representation of an Arrow Table and a Record Batch, with structure as described in text above. .. seealso:: :term:`chunked array`, :term:`record batch` diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index 1a9b1b97f07ee..c800255687796 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -501,14 +501,14 @@ integration testing actually tests. There are two types of integration test cases: the ones populated on the fly by the data generator in the Archery utility, and *gold* files that exist -in the `arrow-testing ` +in the `arrow-testing ` repository. Data Generator Tests ~~~~~~~~~~~~~~~~~~~~ This is the high-level description of the cases which are generated and -tested using the ``archery integration`` command (see ``get_generated_json_files`` +tested using the ``archery integration`` command (see ``get_generated_json_files`` in ``datagen.py``): * Primitive Types @@ -549,7 +549,7 @@ Gold File Integration Tests Pre-generated json and arrow IPC files (both file and stream format) exist in the `arrow-testing `__ repository in the ``data/arrow-ipc-stream/integration`` directory. These serve as -*gold* files that are assumed to be correct for use in testing. They are +*gold* files that are assumed to be correct for use in testing. They are referenced by ``runner.py`` in the code for the :ref:`Archery ` utility. Below are the test cases which are covered by them: @@ -563,7 +563,7 @@ utility. Below are the test cases which are covered by them: + intervals + maps + nested types (list, struct) - + primitives + + primitives + primitive with no batches + primitive with zero length batches diff --git a/docs/source/java/algorithm.rst b/docs/source/java/algorithm.rst index 316fd38fa0990..06ed32bd48cf7 100644 --- a/docs/source/java/algorithm.rst +++ b/docs/source/java/algorithm.rst @@ -20,12 +20,12 @@ Java Algorithms Arrow's Java library provides algorithms for some commonly-used functionalities. 
The algorithms are provided in the ``org.apache.arrow.algorithm`` -package of the ``algorithm`` module. +package of the ``algorithm`` module. Comparing Vector Elements ------------------------- -Comparing vector elements is the basic for many algorithms. Vector +Comparing vector elements is the basic for many algorithms. Vector elements can be compared in one of the two ways: 1. **Equality comparison**: there are two possible results for this type of comparisons: ``equal`` and ``unequal``. @@ -36,30 +36,30 @@ interface. and ``greater than``. This comparison is supported by the abstract class ``org.apache.arrow.algorithm.sort.VectorValueComparator``. We provide default implementations to compare vector elements. However, users can also define ways -for customized comparisons. +for customized comparisons. Vector Element Search --------------------- -A search algorithm tries to find a particular value in a vector. When successful, a vector index is +A search algorithm tries to find a particular value in a vector. When successful, a vector index is returned; otherwise, a ``-1`` is returned. The following search algorithms are provided: -1. **Linear search**: this algorithm simply traverses the vector from the beginning, until a match is +1. **Linear search**: this algorithm simply traverses the vector from the beginning, until a match is found, or the end of the vector is reached. So it takes ``O(n)`` time, where ``n`` is the number of elements in the vector. This algorithm is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#linearSearch``. -2. **Binary search**: this represents a more efficient search algorithm, as it runs in ``O(log(n))`` time. +2. **Binary search**: this represents a more efficient search algorithm, as it runs in ``O(log(n))`` time. However, it is only applicable to sorted vectors. To get a sorted vector, one can use one of our sorting algorithms, which will be discussed in the next section. This algorithm is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#binarySearch``. 3. **Parallel search**: when the vector is large, it takes a long time to traverse the elements to search -for a value. To make this process faster, one can split the vector into multiple partitions, and perform the +for a value. To make this process faster, one can split the vector into multiple partitions, and perform the search for each partition in parallel. This is supported by ``org.apache.arrow.algorithm.search.ParallelSearcher``. -4. **Range search**: for many scenarios, there can be multiple matching values in the vector. +4. **Range search**: for many scenarios, there can be multiple matching values in the vector. If the vector is sorted, the matching values reside in a contiguous region in the vector. The -range search algorithm tries to find the upper/lower bound of the region in ``O(log(n))`` time. +range search algorithm tries to find the upper/lower bound of the region in ``O(log(n))`` time. An implementation is provided in ``org.apache.arrow.algorithm.search.VectorRangeSearcher``. Vector Sorting @@ -72,19 +72,19 @@ classified into the following categories: 1. **In-place sorter**: an in-place sorter performs the sorting by manipulating the original vector, without creating any new vector. So it just returns the original vector after the sorting operations. Currently, we have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter`` for in-place -sorting in ``O(nlog(n))`` time. As the name suggests, it only supports fixed width vectors. 
+sorting in ``O(nlog(n))`` time. As the name suggests, it only supports fixed width vectors. 2. **Out-of-place sorter**: an out-of-place sorter does not mutate the original vector. Instead, it copies vector elements to a new vector in sorted order, and returns the new vector. -We have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.FixedWidthOutOfPlaceVectorSorter`` +We have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.FixedWidthOutOfPlaceVectorSorter`` and ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.VariableWidthOutOfPlaceVectorSorter`` -for fixed width and variable width vectors, respectively. Both algorithms run in ``O(nlog(n))`` time. +for fixed width and variable width vectors, respectively. Both algorithms run in ``O(nlog(n))`` time. 3. **Index sorter**: this sorter does not actually sort the vector. Instead, it returns an integer vector, which correspond to indices of vector elements in sorted order. With the index vector, one can easily construct a sorted vector. In addition, some other tasks can be easily achieved, like finding the ``k``th -smallest value in the vector. Index sorting is supported by ``org.apache.arrow.algorithm.sort.IndexSorter``, -which runs in ``O(nlog(n))`` time. It is applicable to vectors of any type. +smallest value in the vector. Index sorting is supported by ``org.apache.arrow.algorithm.sort.IndexSorter``, +which runs in ``O(nlog(n))`` time. It is applicable to vectors of any type. Other Algorithms ---------------- diff --git a/docs/source/java/flight.rst b/docs/source/java/flight.rst index e009998be4f4e..6d26583aeefa6 100644 --- a/docs/source/java/flight.rst +++ b/docs/source/java/flight.rst @@ -184,7 +184,7 @@ Handshake-based authentication can be enabled by implementing ``ServerAuthHandler``. Authentication consists of two parts: on initial client connection, the server and client authentication implementations can perform any negotiation needed. The client authentication -handler then provides a token that will be attached to future calls. +handler then provides a token that will be attached to future calls. The client send data to be validated through ``ClientAuthHandler.authenticate`` The server validate data received through ``ServerAuthHandler.authenticate``. diff --git a/docs/source/java/flight_sql_jdbc_driver.rst b/docs/source/java/flight_sql_jdbc_driver.rst index 0ace2185983a9..cc8822247b007 100644 --- a/docs/source/java/flight_sql_jdbc_driver.rst +++ b/docs/source/java/flight_sql_jdbc_driver.rst @@ -169,8 +169,8 @@ when using the JDBC Driver Manager to connect. When supplying using the Properties object, values should *not* be URI-encoded. Parameters specified by the URI supercede parameters supplied by the -Properties object. When calling the `user/password overload of -DriverManager#getConnection() +Properties object. When calling the `user/password overload of +DriverManager#getConnection() `_, the username and password supplied on the URI supercede the username and password arguments to the function call. diff --git a/docs/source/java/memory.rst b/docs/source/java/memory.rst index 036befa148692..8014a27444ac9 100644 --- a/docs/source/java/memory.rst +++ b/docs/source/java/memory.rst @@ -20,7 +20,7 @@ Memory Management ================= The memory modules contain all the functionality that Arrow uses to allocate and deallocate memory. This document is divided in two parts: -The first part, *Memory Basics*, provides a high-level introduction. 
The following section, *Arrow Memory In-Depth*, fills in the details. +The first part, *Memory Basics*, provides a high-level introduction. The following section, *Arrow Memory In-Depth*, fills in the details. .. contents:: @@ -39,7 +39,7 @@ Getting Started Arrow's memory management is built around the needs of the columnar format and using off-heap memory. Arrow Java has its own independent implementation. It does not wrap the C++ implementation, although the framework is flexible enough -to be used with memory allocated in C++ that is used by Java code. +to be used with memory allocated in C++ that is used by Java code. Arrow provides multiple modules: the core interfaces, and implementations of the interfaces. Users need the core interfaces, and exactly one of the implementations. @@ -67,9 +67,9 @@ Why Arrow Uses Direct Memory BufferAllocator --------------- -The `BufferAllocator`_ is primarily an arena or nursery used for accounting of buffers (ArrowBuf instances). -As the name suggests, it can allocate new buffers associated with itself, but it can also -handle the accounting for buffers allocated elsewhere. For example, it handles the Java-side accounting for +The `BufferAllocator`_ is primarily an arena or nursery used for accounting of buffers (ArrowBuf instances). +As the name suggests, it can allocate new buffers associated with itself, but it can also +handle the accounting for buffers allocated elsewhere. For example, it handles the Java-side accounting for memory allocated in C++ and shared with Java using the C-Data Interface. In the code below it performs an allocation: .. code-block:: Java @@ -100,21 +100,21 @@ memory from a child allocator, those allocations are also reflected in all paren effectively sets the program-wide memory limit, and serves as the master bookkeeper for all memory allocations. Child allocators are not strictly required, but can help better organize code. For instance, a lower memory limit can -be set for a particular section of code. The child allocator can be closed when that section completes, -at which point it checks that that section didn't leak any memory. +be set for a particular section of code. The child allocator can be closed when that section completes, +at which point it checks that that section didn't leak any memory. Child allocators can also be named, which makes it easier to tell where an ArrowBuf came from during debugging. Reference counting ------------------ -Because direct memory is expensive to allocate and deallocate, allocators may share direct buffers. To managed shared buffers -deterministically, we use manual reference counting instead of the garbage collector. +Because direct memory is expensive to allocate and deallocate, allocators may share direct buffers. To managed shared buffers +deterministically, we use manual reference counting instead of the garbage collector. This simply means that each buffer has a counter keeping track of the number of references to the buffer, and the user is responsible for properly incrementing/decrementing the counter as the buffer is used. In Arrow, each ArrowBuf has an associated `ReferenceManager`_ that tracks the reference count. You can retrieve -it with ArrowBuf.getReferenceManager(). The reference count is updated using `ReferenceManager.release`_ to decrement the count, -and `ReferenceManager.retain`_ to increment it. +it with ArrowBuf.getReferenceManager(). 
The reference count is updated using `ReferenceManager.release`_ to decrement the count, +and `ReferenceManager.retain`_ to increment it. Of course, this is tedious and error-prone, so instead of directly working with buffers, we typically use higher-level APIs like ValueVector. Such classes generally implement Closeable/AutoCloseable and will automatically @@ -289,7 +289,7 @@ Finally, enabling the ``TRACE`` logging level will automatically provide this st | at (#8:1) Sometimes, explicitly passing allocators around is difficult. For example, it -can be hard to pass around extra state, like an allocator, through layers of +can be hard to pass around extra state, like an allocator, through layers of existing application or framework code. A global or singleton allocator instance can be useful here, though it should not be your first choice. @@ -370,7 +370,7 @@ Arrow’s memory model is based on the following basic concepts: leaks. - The same physical memory can be shared by multiple allocators and the allocator must provide an accounting paradigm for this purpose. - + Reserving Memory ---------------- @@ -384,17 +384,17 @@ Arrow provides two different ways to reserve memory: - ``AllocationReservation`` via BufferAllocator.newReservation(): Allows a short-term preallocation strategy so that a particular subsystem can ensure future memory is available to support a - particular request. - + particular request. + Reference Counting Details -------------------------- -Typically, the ReferenceManager implementation used is an instance of `BufferLedger`_. -A BufferLedger is a ReferenceManager that also maintains the relationship between an ``AllocationManager``, +Typically, the ReferenceManager implementation used is an instance of `BufferLedger`_. +A BufferLedger is a ReferenceManager that also maintains the relationship between an ``AllocationManager``, a ``BufferAllocator`` and one or more individual ``ArrowBuf``\ s -All ArrowBufs (direct or sliced) related to a single BufferLedger/BufferAllocator combination -share the same reference count and either all will be valid or all will be invalid. +All ArrowBufs (direct or sliced) related to a single BufferLedger/BufferAllocator combination +share the same reference count and either all will be valid or all will be invalid. For simplicity of accounting, we treat that memory as being used by one of the BufferAllocators associated with the memory. When that allocator releases its claim on that memory, the memory ownership is then moved to @@ -411,7 +411,7 @@ There are several Allocator types in Arrow Java: - ``ChildAllocator`` - A child allocator that derives from the root allocator Many BufferAllocators can reference the same piece of physical memory at the same -time. It is the AllocationManager’s responsibility to ensure that in this situation, +time. It is the AllocationManager’s responsibility to ensure that in this situation, all memory is accurately accounted for from the Root’s perspective and also to ensure that the memory is correctly released once all BufferAllocators have stopped using that memory. diff --git a/docs/source/java/quickstartguide.rst b/docs/source/java/quickstartguide.rst index e358681c57830..a71ddc5b5e55f 100644 --- a/docs/source/java/quickstartguide.rst +++ b/docs/source/java/quickstartguide.rst @@ -313,4 +313,4 @@ Example: Read the dataset from the previous example from an Arrow IPC file (rand More examples available at `Arrow Java Cookbook`_. -.. 
_`Arrow Java Cookbook`: https://arrow.apache.org/cookbook/java
\ No newline at end of file
+.. _`Arrow Java Cookbook`: https://arrow.apache.org/cookbook/java
diff --git a/docs/source/java/vector.rst b/docs/source/java/vector.rst
index abbbd1a236d6d..1c3e123cf50fb 100644
--- a/docs/source/java/vector.rst
+++ b/docs/source/java/vector.rst
@@ -226,7 +226,7 @@ A :class:`ListVector` is a vector that holds a list of values for each index. Wo
 For example, the code below shows how to build a :class:`ListVector` of int's using the writer :class:`UnionListWriter`. We build a vector from 0 to 9 and each index contains a list with values [[0, 0, 0, 0, 0], [0, 1, 2, 3, 4], [0, 2, 4, 6, 8], …, [0, 9, 18, 27, 36]]. List values can be added in any order so writing a list such as [3, 1, 2] would be just as valid.
 .. code-block:: Java
-
+
     try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
          ListVector listVector = ListVector.empty("vector", allocator)) {
         UnionListWriter writer = listVector.getWriter();
@@ -240,7 +240,7 @@ For example, the code below shows how to build a :class:`ListVector` of int's us
             writer.endList();
         }
         listVector.setValueCount(10);
-    }
+    }
 :class:`ListVector` values can be accessed either through the get API or through the reader class :class:`UnionListReader`. To read all the values, first enumerate through the indexes, and then enumerate through the inner list values.
diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
index 928c607d139ce..ae48578a1bd61 100644
--- a/docs/source/python/api/compute.rst
+++ b/docs/source/python/api/compute.rst
@@ -52,10 +52,10 @@ Aggregations
 Cumulative Functions
 --------------------
-Cumulative functions are vector functions that perform a running accumulation on
-their input using a given binary associative operation with an identity element
-(a monoid) and output an array containing the corresponding intermediate running
-values. The input is expected to be of numeric type. By default these functions
+Cumulative functions are vector functions that perform a running accumulation on
+their input using a given binary associative operation with an identity element
+(a monoid) and output an array containing the corresponding intermediate running
+values. The input is expected to be of numeric type. By default these functions
 do not detect overflow. They are also available in an overflow-checking variant,
 suffixed ``_checked``, which throws an ``ArrowInvalid`` exception when overflow
 is detected.
diff --git a/docs/source/python/api/substrait.rst b/docs/source/python/api/substrait.rst
index 66e88fcd279ae..1556be9dbd011 100644
--- a/docs/source/python/api/substrait.rst
+++ b/docs/source/python/api/substrait.rst
@@ -50,4 +50,4 @@ Utility
 .. autosummary::
    :toctree: ../generated/
-   get_supported_functions
\ No newline at end of file
+   get_supported_functions
diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst
index c02059a4f8faa..ce3dfabb0e689 100644
--- a/docs/source/python/compute.rst
+++ b/docs/source/python/compute.rst
@@ -23,7 +23,7 @@ Compute Functions
 =================
 Arrow supports logical compute operations over inputs of possibly
-varying types.
+varying types.
 The standard compute operations are provided by the :mod:`pyarrow.compute`
 module and can be used directly::
@@ -91,7 +91,7 @@ Grouped Aggregations
 ====================
 PyArrow supports grouped aggregations over :class:`pyarrow.Table` through the
-:meth:`pyarrow.Table.group_by` method.
+:meth:`pyarrow.Table.group_by` method.
 The method will return a grouping declaration
 to which the hash aggregation functions can be applied::
@@ -300,7 +300,7 @@ Filtering by Expressions
 :class:`.Table` and :class:`.Dataset` can both be filtered using a boolean
 :class:`.Expression`.
-The expression can be built starting from a
+The expression can be built starting from a
 :func:`pyarrow.compute.field`. Comparisons and transformations can then be
 applied to one or more fields to build the filter expression you care about.
@@ -325,7 +325,7 @@ in column ``"nums"`` by the ``bit_wise_and`` operation equals ``0``.
 Only the numbers where the last bit was ``0`` will return a ``0`` as the result of
 ``num & 1`` and as all numbers where the last bit is ``0`` are multiples of ``2``
 we will be filtering for the even numbers only.
-
+
 Once we have our filter, we can provide it to the :meth:`.Table.filter` method
 to filter our table only for the matching rows:
@@ -392,7 +392,7 @@ User-Defined Functions
 PyArrow allows defining and registering custom compute functions.
 These functions can then be called from Python as well as C++ (and potentially
 any other implementation wrapping Arrow C++, such as the R ``arrow`` package)
-using their registered function name.
+using their registered function name.
 UDF support is limited to scalar functions. A scalar function is a function which
 executes elementwise operations on arrays or scalars. In general, the output of a
@@ -441,7 +441,7 @@ output type need to be defined. Using :func:`pyarrow.compute.register_scalar_fun
                                function_docs,
                                input_types,
                                output_type)
-
+
 The implementation of a user-defined function always takes a first *context*
 parameter (named ``ctx`` in the example above) which is an instance of
@@ -497,9 +497,9 @@ the GCD of one column with the scalar value 30. We will be re-using the
   category: [["A","B","C","D"]]
 Note that ``ds.field('')._call(...)`` returns a :func:`pyarrow.compute.Expression`.
-The arguments passed to this function call are expressions, not scalar values
+The arguments passed to this function call are expressions, not scalar values
 (notice the difference between :func:`pyarrow.scalar` and :func:`pyarrow.compute.scalar`,
-the latter produces an expression).
+the latter produces an expression).
 This expression is evaluated when the projection operator executes it.
 Projection Expressions
diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst
index daab36f9a7be9..00469fd57becf 100644
--- a/docs/source/python/dataset.rst
+++ b/docs/source/python/dataset.rst
@@ -575,28 +575,28 @@ Partitioning performance considerations
 Partitioning datasets has two aspects that affect performance: it increases the number of
 files and it creates a directory structure around the files. Both of these have benefits
-as well as costs. Depending on the configuration and the size of your dataset, the costs
-can outweigh the benefits.
+as well as costs. Depending on the configuration and the size of your dataset, the costs
+can outweigh the benefits.
-Because partitions split up the dataset into multiple files, partitioned datasets can be
-read and written with parallelism. However, each additional file adds a little overhead in
-processing for filesystem interaction. It also increases the overall dataset size since
 each file has some shared metadata. For example, each parquet file contains the schema and
-group-level statistics. The number of partitions is a floor for the number of files. If
-you partition a dataset by date with a year of data, you will have at least 365 files. If
-you further partition by another dimension with 1,000 unique values, you will have up to
+group-level statistics. The number of partitions is a floor for the number of files. If
+you partition a dataset by date with a year of data, you will have at least 365 files. If
+you further partition by another dimension with 1,000 unique values, you will have up to
 365,000 files. This fine of partitioning often leads to small files that mostly consist of
 metadata.
-Partitioned datasets create nested folder structures, and those allow us to prune which
+Partitioned datasets create nested folder structures, and those allow us to prune which
 files are loaded in a scan. However, this adds overhead to discovering files in the dataset,
 as we'll need to recursively "list directory" to find the data files. Too fine
 partitions can cause problems here: Partitioning a dataset by date for a years worth
-of data will require 365 list calls to find all the files; adding another column with
+of data will require 365 list calls to find all the files; adding another column with
 cardinality 1,000 will make that 365,365 calls.
 The most optimal partitioning layout will depend on your data, access patterns, and which
-systems will be reading the data. Most systems, including Arrow, should work across a
+systems will be reading the data. Most systems, including Arrow, should work across a
 range of file sizes and partitioning layouts, but there are extremes you should avoid.
 These guidelines can help avoid some known worst cases:
@@ -611,35 +611,35 @@ of file size. Arrow's file writer provides sensible defaults for group sizing in
 Configuring files open during a write
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When writing data to the disk, there are a few parameters that can be
+When writing data to the disk, there are a few parameters that can be
 important to optimize the writes, such as the number of rows per file and
 the maximum number of open files allowed during the write.
 Set the maximum number of files opened with the ``max_open_files`` parameter of
 :meth:`write_dataset`.
-If ``max_open_files`` is set greater than 0 then this will limit the maximum
+If ``max_open_files`` is set greater than 0 then this will limit the maximum
 number of files that can be left open. This only applies to writing partitioned
 datasets, where rows are dispatched to the appropriate file depending on their
 partition values. If an attempt is made to open too many files then the least
 recently used file will be closed. If this setting is set too low you may end
 up fragmenting your data into many small files.
-If your process is concurrently using other file handlers, either with a
-dataset scanner or otherwise, you may hit a system file handler limit. For
+If your process is concurrently using other file handlers, either with a
+dataset scanner or otherwise, you may hit a system file handler limit. For
 example, if you are scanning a dataset with 300 files and writing out to 900
 files, the total of 1200 files may be over a system limit. (On Linux, this
 might be a "Too Many Open Files" error.) You can either reduce this
 ``max_open_files`` setting or increase the file handler limit on your
 system. The default value is 900 which allows some number of files
-to be open by the scanner before hitting the default Linux limit of 1024.
+to be open by the scanner before hitting the default Linux limit of 1024.
-Another important configuration used in :meth:`write_dataset` is ``max_rows_per_file``.
+Another important configuration used in :meth:`write_dataset` is ``max_rows_per_file``.
 Set the maximum number of rows written in each file with the ``max_rows_per_files``
 parameter of :meth:`write_dataset`.
-If ``max_rows_per_file`` is set greater than 0 then this will limit how many
+If ``max_rows_per_file`` is set greater than 0 then this will limit how many
 rows are placed in any single file. Otherwise there will be no limit and one
 file will be created in each output directory unless files need to be closed to respect
 ``max_open_files``. This setting is the primary way to control file size.
@@ -653,22 +653,22 @@ Configuring rows per group during a write
 The volume of data written to the disk per each group can be configured.
 This configuration includes a lower and an upper bound.
-The minimum number of rows required to form a row group is
+The minimum number of rows required to form a row group is
 defined with the ``min_rows_per_group`` parameter of :meth:`write_dataset`.
 .. note::
-   If ``min_rows_per_group`` is set greater than 0 then this will cause the
-   dataset writer to batch incoming data and only write the row groups to the
-   disk when sufficient rows have accumulated. The final row group size may be
-   less than this value if other options such as ``max_open_files`` or
+   If ``min_rows_per_group`` is set greater than 0 then this will cause the
+   dataset writer to batch incoming data and only write the row groups to the
+   disk when sufficient rows have accumulated. The final row group size may be
+   less than this value if other options such as ``max_open_files`` or
    ``max_rows_per_file`` force smaller row group sizes.
 The maximum number of rows allowed per group is defined with the
 ``max_rows_per_group`` parameter of :meth:`write_dataset`.
-If ``max_rows_per_group`` is set greater than 0 then the dataset writer may split
-up large incoming batches into multiple row groups. If this value is set then
-``min_rows_per_group`` should also be set or else you may end up with very small
+If ``max_rows_per_group`` is set greater than 0 then the dataset writer may split
+up large incoming batches into multiple row groups. If this value is set then
+``min_rows_per_group`` should also be set or else you may end up with very small
 row groups (e.g. if the incoming row group size is just barely larger than this value).
 Row groups are built into the Parquet and IPC/Feather formats but don't affect
 JSON or CSV.
@@ -719,7 +719,7 @@ Customizing & inspecting written files
 By default the dataset API will create files named "part-i.format" where "i" is a integer
 generated during the write and "format" is the file format specified in the write_dataset
 call. For simple datasets it may be possible to know which files will be created but for
-larger or partitioned datasets it is not so easy. The ``file_visitor`` keyword can be used
+larger or partitioned datasets it is not so easy. The ``file_visitor`` keyword can be used
 to supply a visitor that will be called as each file is created:
 .. ipython:: python
diff --git a/docs/source/python/dlpack.rst b/docs/source/python/dlpack.rst
index f612ebabde5c9..024c2800e1107 100644
--- a/docs/source/python/dlpack.rst
+++ b/docs/source/python/dlpack.rst
@@ -90,4 +90,4 @@ Convert a PyArrow CPU array to PyTorch tensor:
   >>> import torch
   >>> torch.from_dlpack(array)
-  tensor([2, 0, 2, 4])
+  tensor([2, 0, 2, 4])
diff --git a/docs/source/python/filesystems.rst b/docs/source/python/filesystems.rst
index 5309250351d8e..22f983a60c349 100644
--- a/docs/source/python/filesystems.rst
+++ b/docs/source/python/filesystems.rst
@@ -233,7 +233,7 @@ generate a credentials file in the default location::
 To connect to a public bucket without using any credentials, you must pass
 ``anonymous=True`` to :class:`GcsFileSystem`. Otherwise, the filesystem
-will report ``Couldn't resolve host name`` since there are different host
+will report ``Couldn't resolve host name`` since there are different host
 names for authenticated and public access.
 Example showing how you can read contents from a GCS bucket::
@@ -314,7 +314,7 @@ For example::
     # using this to read a partitioned dataset
     import pyarrow.dataset as ds
     ds.dataset("data/", filesystem=fs)
-
+
 Similarly for Azure Blob Storage::
     import adlfs
diff --git a/docs/source/python/getstarted.rst b/docs/source/python/getstarted.rst
index d38fcadab288f..42e415c40b835 100644
--- a/docs/source/python/getstarted.rst
+++ b/docs/source/python/getstarted.rst
@@ -37,7 +37,7 @@ in tabular data.
 Arrow also provides support for various formats to get those tabular
 data in and out of disk and networks. Most commonly used formats are
-Parquet (:ref:`parquet`) and the IPC format (:ref:`ipc`).
+Parquet (:ref:`parquet`) and the IPC format (:ref:`ipc`).
 Creating Arrays and Tables
 --------------------------
@@ -63,7 +63,7 @@ in tabular data when attached to a column name
     birthdays_table = pa.table([days, months, years],
                                names=["days", "months", "years"])
-
+
     birthdays_table
 See :ref:`data` for more details.
@@ -75,7 +75,7 @@ Once you have tabular data, Arrow provides out of the box
 the features to save and restore that data for common formats
 like Parquet:
-.. ipython:: python
+.. ipython:: python
     import pyarrow.parquet as pq
@@ -92,14 +92,14 @@ data will be as quick as possible
     reloaded_birthdays
 Saving and loading back data in arrow is usually done through
-:ref:`Parquet `, :ref:`IPC format ` (:ref:`feather`),
+:ref:`Parquet `, :ref:`IPC format ` (:ref:`feather`),
 :ref:`CSV ` or :ref:`Line-Delimited JSON ` formats.
 Performing Computations
 -----------------------
 Arrow ships with a bunch of compute functions that can be applied
-to its arrays and tables, so through the compute functions
+to its arrays and tables, so through the compute functions
 it's possible to apply transformations to the data
 .. ipython:: python
@@ -122,7 +122,7 @@ smaller chunks
     import pyarrow.dataset as ds
-    ds.write_dataset(birthdays_table, "savedir", format="parquet",
+    ds.write_dataset(birthdays_table, "savedir", format="parquet",
                      partitioning=ds.partitioning(
                          pa.schema([birthdays_table.schema.field("years")])
                      ))
@@ -151,8 +151,8 @@ how to project them, etc., refer to :ref:`dataset` documentation.
 Continuing from here
 --------------------
-For digging further into Arrow, you might want to read the
-:doc:`PyArrow Documentation <./index>` itself or the
+For digging further into Arrow, you might want to read the
+:doc:`PyArrow Documentation <./index>` itself or the
 `Arrow Python Cookbook `_
diff --git a/docs/source/python/getting_involved.rst b/docs/source/python/getting_involved.rst
index 7b3bcf2ac527a..9fda3c7c78488 100644
--- a/docs/source/python/getting_involved.rst
+++ b/docs/source/python/getting_involved.rst
@@ -54,7 +54,7 @@ used as foundations to build easier to use entities.
   exposed to the user are declared. In some cases, those files might directly
   import the entities from inner implementation if they want to expose it
  as is without modification.
-* The ``lib.pyx`` file is where the majority of the core C++ libarrow
+* The ``lib.pyx`` file is where the majority of the core C++ libarrow
   capabilities are exposed to Python. Most of the implementation of this
  module relies on included ``*.pxi`` files where the specific pieces are
  built. While being exposed to Python as ``pyarrow.lib`` its content
@@ -73,4 +73,4 @@ used as foundations to build easier to use entities.
   PyArrow is also based on PyArrow C++, dedicated pieces of code that live
  in ``python/pyarrow/src/arrow/python`` directory and provide the low level
  code for capabilities like converting to and from numpy or pandas and the classes
-  that allow to use Python objects and callbacks in C++.
\ No newline at end of file
+  that allow to use Python objects and callbacks in C++.
diff --git a/docs/source/python/integration/python_r.rst b/docs/source/python/integration/python_r.rst
index 20627c3782d3c..ec5dfc366fdf9 100644
--- a/docs/source/python/integration/python_r.rst
+++ b/docs/source/python/integration/python_r.rst
@@ -29,7 +29,7 @@ marshaling and unmarshaling data.
     The article takes for granted that you have a ``Python`` environment
     with ``pyarrow`` correctly installed and an ``R`` environment with
-    ``arrow`` library correctly installed.
+    ``arrow`` library correctly installed.
     See `Python Install Instructions `_
     and `R Install instructions `_
     for further details.
@@ -52,7 +52,7 @@ We could save such a function in a ``addthree.R`` file so that we can make it
 available for reuse.
 Once the ``addthree.R`` file is created we can invoke any of its functions
-from Python using the
+from Python using the
 `rpy2 `_
 library which enables a R runtime within the Python interpreter.
@@ -91,12 +91,12 @@ to access the ``R`` function and print the expected result:
 .. code-block:: bash
-    $ python addthree.py
+    $ python addthree.py
     6
 If instead of passing around basic data types we want to pass around
 Arrow Arrays, we can do so relying on the
-`rpy2-arrow `_
+`rpy2-arrow `_
 module which implements ``rpy2`` support for Arrow types.
 ``rpy2-arrow`` can be installed through ``pip``:
@@ -189,7 +189,7 @@ Invoking the ``addthree.R`` script will print the outcome of adding
 .. code-block:: bash
-    $ R --silent -f addthree.R
+    $ R --silent -f addthree.R
     Array
     [
@@ -219,7 +219,7 @@ necessary to import an Arrow Array in R from the C Data interface.
 That work will be done by the ``addthree_cdata`` function which invokes the
 ``addthree`` function once the Array is imported.
-Our ``addthree.R`` will thus have both the ``addthree_cdata`` and the
+Our ``addthree.R`` will thus have both the ``addthree_cdata`` and the
 ``addthree`` functions:
 .. code-block:: R
@@ -261,7 +261,7 @@ Our ``addthree.py`` will thus become:
     # Import the pyarrow module that provides access to the C Data interface
     from pyarrow.cffi import ffi as arrow_c
-    # Allocate structures where we will export the Array data
+    # Allocate structures where we will export the Array data
     # and the Array schema. They will be released when we exit the with block.
     with arrow_c.new("struct ArrowArray*") as c_array, \
          arrow_c.new("struct ArrowSchema*") as c_schema:
@@ -274,7 +274,7 @@ Our ``addthree.py`` will thus become:
         array.type._export_to_c(c_schema_ptr)
         # Invoke the R addthree_cdata function passing the references
-        # to the array and schema C Data structures.
+        # to the array and schema C Data structures.
         # Those references are passed as strings as R doesn't have
         # native support for 64bit integers, so the integers are
         # converted to their string representation for R to convert it back.
@@ -289,19 +289,19 @@ Our ``addthree.py`` will thus become:
         # Once the returned array is exported to a C Data infrastructure
         # we can import it back into pyarrow using Array._import_from_c
         py_array = pyarrow.Array._import_from_c(c_array_ptr, c_schema_ptr)
-
+
     print("RESULT", py_array)
 Running the newly changed ``addthree.py`` will now print the Array resulting
-from adding ``3`` to all the elements of the original
+from adding ``3`` to all the elements of the original
 ``pyarrow.array((1, 2, 3))`` array:
 .. code-block:: bash
-    $ python addthree.py
+    $ python addthree.py
     R[write to console]: Attaching package: ‘arrow’
     RESULT [
       4,
      5,
      6
-    ]
\ No newline at end of file
+    ]
diff --git a/docs/source/python/ipc.rst b/docs/source/python/ipc.rst
index 27cd14a68853d..f55e8f8bc5dc3 100644
--- a/docs/source/python/ipc.rst
+++ b/docs/source/python/ipc.rst
@@ -76,12 +76,12 @@ this one can be created with :func:`~pyarrow.ipc.new_stream`:
 .. ipython:: python
     sink = pa.BufferOutputStream()
-
+
     with pa.ipc.new_stream(sink, batch.schema) as writer:
        for i in range(5):
           writer.write_batch(batch)
-Here we used an in-memory Arrow buffer stream (``sink``),
+Here we used an in-memory Arrow buffer stream (``sink``),
 but this could have been a socket or some other IO sink.
 When creating the ``StreamWriter``, we pass the schema, since the schema
@@ -102,7 +102,7 @@ convenience function ``pyarrow.ipc.open_stream``:
     with pa.ipc.open_stream(buf) as reader:
        schema = reader.schema
        batches = [b for b in reader]
-
+
     schema
     len(batches)
@@ -126,7 +126,7 @@ The :class:`~pyarrow.RecordBatchFileWriter` has the same API as
 .. ipython:: python
     sink = pa.BufferOutputStream()
-
+
     with pa.ipc.new_file(sink, batch.schema) as writer:
        for i in range(10):
           writer.write_batch(batch)
@@ -164,7 +164,7 @@ DataFrame output:
     with pa.ipc.open_file(buf) as reader:
        df = reader.read_pandas()
-
+
     df[:5]
 Efficiently Writing and Reading Arrow Data
diff --git a/docs/source/python/json.rst b/docs/source/python/json.rst
index 99ecbc19a1230..eff6135d895a7 100644
--- a/docs/source/python/json.rst
+++ b/docs/source/python/json.rst
@@ -21,7 +21,7 @@ Reading JSON files
 ==================
-Arrow supports reading columnar data from line-delimited JSON files.
+Arrow supports reading columnar data from line-delimited JSON files.
 In this context, a JSON file consists of multiple JSON objects, one per line,
 representing individual data rows.
 For example, this file represents two rows of data with four columns "a", "b", "c", "d":
diff --git a/docs/source/python/orc.rst b/docs/source/python/orc.rst
index bfa68fc34d895..76c293d742010 100644
--- a/docs/source/python/orc.rst
+++ b/docs/source/python/orc.rst
@@ -112,7 +112,7 @@ control various settings when writing an ORC file.
 * ``file_version``, the ORC format version to use. ``'0.11'`` ensures
   compatibility with older readers, while ``'0.12'`` is the newer one.
-* ``stripe_size``, to control the approximate size of data within a column
+* ``stripe_size``, to control the approximate size of data within a column
   stripe. This currently defaults to 64MB.
 See the :func:`~pyarrow.orc.write_table()` docstring for more details.
diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst
index d4717897660b6..029ed4f1a3e15 100644
--- a/docs/source/python/parquet.rst
+++ b/docs/source/python/parquet.rst
@@ -32,7 +32,7 @@ performance data IO.
 Apache Arrow is an ideal in-memory transport layer for data that is being read
 or written with Parquet files. We have been concurrently developing the `C++
-implementation of
+implementation of
 Apache Parquet `_, which includes a native, multithreaded C++
 adapter to and from in-memory Arrow data. PyArrow includes Python bindings to
 this code, which thus enables reading
diff --git a/docs/source/python/timestamps.rst b/docs/source/python/timestamps.rst
index 64a2a354dddef..cecbd5b595bc7 100644
--- a/docs/source/python/timestamps.rst
+++ b/docs/source/python/timestamps.rst
@@ -51,8 +51,8 @@ This implies a few things when round-tripping timestamps:
 #. Timezone information is lost (all timestamps that result from
    converting from spark to arrow/pandas are "time zone naive").
 #. Timestamps are truncated to microseconds.
-#. The session time zone might have unintuitive impacts on
-   translation of timestamp values.
+#. The session time zone might have unintuitive impacts on
+   translation of timestamp values.
 Spark to Pandas (through Apache Arrow)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -62,8 +62,8 @@ The following cases assume the Spark configuration
 ::
-    >>> pdf = pd.DataFrame({'naive': [datetime(2019, 1, 1, 0)],
-    ...                     'aware': [Timestamp(year=2019, month=1, day=1,
+    >>> pdf = pd.DataFrame({'naive': [datetime(2019, 1, 1, 0)],
+    ...                     'aware': [Timestamp(year=2019, month=1, day=1,
    ...                     nanosecond=500, tz=timezone(timedelta(hours=-8)))]})
    >>> pdf
       naive                     aware
@@ -77,7 +77,7 @@ The following cases assume the Spark configuration
    +-------------------+-------------------+
    |2019-01-01 00:00:00|2019-01-01 08:00:00|
    +-------------------+-------------------+
-
+
 Note that conversion of the aware timestamp is shifted to reflect the time
 assuming UTC (it represents the same instant in time). For naive timestamps,
 Spark treats them as being in the system local
@@ -129,7 +129,7 @@ session time zone is still PST:
    |2019-01-01 00:00:00|2019-01-01 00:00:00|
    +-------------------+-------------------+
-
+
    >>> pst_df.toPandas()
        naive      aware
    0 2019-01-01 2019-01-01
@@ -141,7 +141,7 @@ session time zone is still PST:
    aware    1 non-null datetime64[ns]
    dtypes: datetime64[ns](2)
    memory usage: 96.0 bytes
-
+
 Notice that, in addition to being a "time zone naive" timestamp, the 'aware'
 value will now differ when converting to an epoch offset. Spark does the
 conversion by first converting to the session time zone (or system local time zone if
@@ -158,9 +158,9 @@ time:
    >>> (pst_df.toPandas()['aware'][0].timestamp()-pdf['aware'][0].timestamp())/3600
    -8.0
-The same type of conversion happens with the data frame converted while
-the session time zone was UTC. In this case both naive and aware
-represent different instants in time (the naive instant is due to
+The same type of conversion happens with the data frame converted while
+the session time zone was UTC. In this case both naive and aware
+represent different instants in time (the naive instant is due to
 the change in session time zone between creating data frames):
 ::
@@ -179,9 +179,9 @@ the change in session time zone between creating data frames):
 Note that the surprising shift for aware doesn't happen when the
 session time zone is UTC (but the timestamps
 still become "time zone naive"):
-
+
 ::
-
+
    >>> spark.conf.set("spark.sql.session.timeZone", "UTC")
    >>> pst_df.show()
    +-------------------+-------------------+
@@ -189,7 +189,7 @@ still become "time zone naive"):
    +-------------------+-------------------+
    |2019-01-01 08:00:00|2019-01-01 08:00:00|
    +-------------------+-------------------+
-
+
    >>> pst_df.toPandas()['aware'][0]
    Timestamp('2019-01-01 08:00:00')
    >>> pdf['aware'][0]