Skip to content

Commit

Permalink
apacheGH-39990: [Docs][CI] Add sphinx-lint for docs linting (apache#4…
Browse files Browse the repository at this point in the history
…0022)

### What changes are included in this PR?

This adds developer tooling to the repo for linting the docs by adding the sphinx-lint tool to archery and our pre-commit hooks. In both locations, only two rules are enabled at the moment (Discussed in apache#40006): `trailing-whitespace` and `missing-final-newline`.

This PR also fixes the individual issues covered by the new tooling. 

### Are these changes tested?

Yes, though manually. I tested this works by running `archery lint --docs` and `pre-commit` without and without changes that should get caught by the rules. It works as expected.

### Are there any user-facing changes?

Yes,

1. Developers that use pre-commit hooks will see a change in behavior when they modify docs
2. Developers using archery will see a new --docs option in `archery lint`
3. Developers working on the docs may see CI failures related to the new checks

* Closes: apache#39990
* GitHub Issue: apache#39990

Authored-by: Bryce Mecum <[email protected]>
Signed-off-by: Bryce Mecum <[email protected]>
  • Loading branch information
amoeba authored and tolleybot committed May 2, 2024
1 parent f9c3802 commit abc300e
Show file tree
Hide file tree
Showing 69 changed files with 488 additions and 434 deletions.
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,9 @@ repos:
?^cpp/cmake_modules/UseCython\.cmake$|
?^cpp/src/arrow/util/config\.h\.cmake$|
)
- repo: https://github.com/sphinx-contrib/sphinx-lint
rev: v0.9.1
hooks:
- id: sphinx-lint
files: ^docs/
args: ['--disable', 'all', '--enable', 'trailing-whitespace,missing-final-newline', 'docs']
1 change: 1 addition & 0 deletions ci/conda_env_sphinx.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ pydata-sphinx-theme=0.14
sphinx-autobuild
sphinx-design
sphinx-copybutton
sphinx-lint
sphinxcontrib-jquery
sphinx==6.2
# Requirement for doctest-cython
Expand Down
6 changes: 4 additions & 2 deletions dev/archery/archery/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ def build(ctx, src, build_dir, force, targets, **kwargs):
"Check all sources files for license texts via Apache RAT."),
LintCheck('r', "Lint R files."),
LintCheck('docker', "Lint Dockerfiles with hadolint."),
LintCheck('docs', "Lint docs with sphinx-lint."),
]


Expand All @@ -285,9 +286,10 @@ def decorate_lint_command(cmd):
help="Run IWYU on all C++ files if enabled")
@click.option("-a", "--all", is_flag=True, default=False,
help="Enable all checks.")
@click.argument("path", required=False)
@decorate_lint_command
@click.pass_context
def lint(ctx, src, fix, iwyu_all, **checks):
def lint(ctx, src, fix, iwyu_all, path, **checks):
if checks.pop('all'):
# "--all" is given => enable all non-selected checks
for k, v in checks.items():
Expand All @@ -297,7 +299,7 @@ def lint(ctx, src, fix, iwyu_all, **checks):
raise click.UsageError(
"Need to enable at least one lint check (try --help)")
try:
linter(src, fix, iwyu_all=iwyu_all, **checks)
linter(src, fix, iwyu_all=iwyu_all, path=path, **checks)
except LintValidationException:
sys.exit(1)

Expand Down
52 changes: 50 additions & 2 deletions dev/archery/archery/utils/lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,10 +436,55 @@ def docker_linter(src):
cwd=src.path))


def linter(src, fix=False, *, clang_format=False, cpplint=False,
class SphinxLint(Command):
def __init__(self, src, path=None, sphinx_lint_bin=None, disable=None, enable=None):
self.src = src
self.path = path
self.bin = default_bin(sphinx_lint_bin, "sphinx-lint")
self.disable = disable or "all"
self.enable = enable

def lint(self, *args, check=False):
docs_path = os.path.join(self.src.path, "docs")

args = []

if self.disable:
args.extend(["--disable", self.disable])

if self.enable:
args.extend(["--enable", self.enable])

if self.path is not None:
args.extend([self.path])
else:
args.extend([docs_path])

return self.run(*args, check=check)


def docs_linter(src, path=None):
"""Run sphinx-lint on docs."""
logger.info("Running docs linter (sphinx-lint)")

sphinx_lint = SphinxLint(
src,
path=path,
disable="all",
enable="trailing-whitespace,missing-final-newline"
)

if not sphinx_lint.available:
logger.error("sphinx-lint linter requested but sphinx-lint binary not found")
return

yield LintResult.from_cmd(sphinx_lint.lint())


def linter(src, fix=False, path=None, *, clang_format=False, cpplint=False,
clang_tidy=False, iwyu=False, iwyu_all=False,
python=False, numpydoc=False, cmake_format=False, rat=False,
r=False, docker=False):
r=False, docker=False, docs=False):
"""Run all linters."""
with tmpdir(prefix="arrow-lint-") as root:
build_dir = os.path.join(root, "cpp-build")
Expand Down Expand Up @@ -481,6 +526,9 @@ def linter(src, fix=False, *, clang_format=False, cpplint=False,
if docker:
results.extend(docker_linter(src))

if docs:
results.extend(docs_linter(src, path))

# Raise error if one linter failed, ensuring calling code can exit with
# non-zero.
for result in results:
Expand Down
2 changes: 1 addition & 1 deletion dev/archery/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
'integration': ['cffi'],
'integration-java': ['jpype1'],
'lint': ['numpydoc==1.1.0', 'autopep8', 'flake8==6.1.0', 'cython-lint',
'cmake_format==0.6.13'],
'cmake_format==0.6.13', 'sphinx-lint==0.9.1'],
'numpydoc': ['numpydoc==1.1.0'],
'release': ['pygithub', jinja_req, 'jira', 'semver', 'gitpython'],
}
Expand Down
1 change: 1 addition & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ pydata-sphinx-theme~=0.14
sphinx-autobuild
sphinx-design
sphinx-copybutton
sphinx-lint
sphinx==6.2
pandas
6 changes: 3 additions & 3 deletions docs/source/cpp/acero/developer_guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ Examples
task (described below) as completed which allows the plan to finish.
* The ``fetch`` node, in ``InputReceived``, may decide that it has all the data it needs. It can then call
``StopProducing`` on its input.

Initialization / Construction / Destruction
-------------------------------------------

Expand Down Expand Up @@ -271,7 +271,7 @@ distributed systems. Once that has been done then it should be possible to do a
meaning exchanging between multiple exec plan instances on a single system) if desired.

.. figure:: dist_plan.svg

A distributed plan can provide parallelism even if the plans themselves run serially

Pipeline Parallelism
Expand Down Expand Up @@ -472,7 +472,7 @@ Benchmarking

The most complete macro benchmarking for Acero is provided by https://github.com/voltrondata-labs/arrowbench
These include a set of TPC-H benchmarks, executed from the R-dplyr integration, which are run on every Arrow commit and
reported to Conbench at https://conbench.ursa.dev/
reported to Conbench at https://conbench.ursa.dev/

In addition to these TPC-H benchmarks there are a number of micro-benchmarks for various nodes (hash-join, asof-join,
etc.) Finally, the compute functions themselves should mostly have micro-benchmarks. For more on micro benchmarks you
Expand Down
8 changes: 4 additions & 4 deletions docs/source/cpp/acero/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ is very similar to a RecordBatch. It can have zero or more columns and all of t
must have the same length. There are a few key differences from ExecBatch:

.. figure:: rb_vs_eb.svg

Both the record batch and the exec batch have strong ownership of the arrays & buffers

* An `ExecBatch` does not have a schema. This is because an `ExecBatch` is assumed to be
Expand All @@ -217,7 +217,7 @@ must have the same length. There are a few key differences from ExecBatch:
also has a length property which describes how many rows are in a batch. So another way to
view a `Scalar` is a constant array with `length` elements.
* An `ExecBatch` contains additional information used by the exec plan. For example, an
`index` can be used to describe a batch's position in an ordered stream. We expect
`index` can be used to describe a batch's position in an ordered stream. We expect
that `ExecBatch` will also evolve to contain additional fields such as a selection vector.

.. figure:: scalar_vs_array.svg
Expand Down Expand Up @@ -266,5 +266,5 @@ various query representations (e.g. Substrait). The Declaration objects are the
with the DeclarationToXyz methods, are the current public API for Acero.

.. figure:: decl_vs_ep.svg
A declaration is a blueprint that is used to instantiate exec plan instances

A declaration is a blueprint that is used to instantiate exec plan instances
46 changes: 23 additions & 23 deletions docs/source/cpp/acero/substrait.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ Aggregate Relations
* Each measure's arguments must be direct references.
* A measure may not have a filter
* A measure may not have sorts
* A measure's invocation must be AGGREGATION_INVOCATION_ALL or
* A measure's invocation must be AGGREGATION_INVOCATION_ALL or
AGGREGATION_INVOCATION_UNSPECIFIED
* A measure's phase must be AGGREGATION_PHASE_INITIAL_TO_RESULT

Expand Down Expand Up @@ -146,73 +146,73 @@ Types
- Caveat
* - boolean
- boolean
-
-
* - i8
- int8
-
-
* - i16
- int16
-
-
* - i32
- int32
-
-
* - i64
- int64
-
-
* - fp32
- float32
-
-
* - fp64
- float64
-
-
* - string
- string
-
-
* - binary
- binary
-
-
* - timestamp
- timestamp<MICRO,"">
-
-
* - timestamp_tz
- timestamp<MICRO,"UTC">
-
-
* - date
- date32<DAY>
-
-
* - time
- time64<MICRO>
-
-
* - interval_year
-
-
- Not currently supported
* - interval_day
-
-
- Not currently supported
* - uuid
-
-
- Not currently supported
* - FIXEDCHAR<L>
-
-
- Not currently supported
* - VARCHAR<L>
-
-
- Not currently supported
* - FIXEDBINARY<L>
- fixed_size_binary<L>
-
-
* - DECIMAL<P,S>
- decimal128<P,S>
-
-
* - STRUCT<T1...TN>
- struct<T1...TN>
- Arrow struct fields will have no name (empty string)
* - NSTRUCT<N:T1...N:Tn>
-
-
- Not currently supported
* - LIST<T>
- list<T>
-
-
* - MAP<K,V>
- map<K,V>
- K must not be nullable
Expand Down
Loading

0 comments on commit abc300e

Please sign in to comment.