Skip to content

Commit

Permalink
Parse math environments (#108)
Browse files Browse the repository at this point in the history
* new MATH_ENVS const, remove from SKIP_ENVS

* added support for parse modes (math v. not math)

* read math envs + test, change constant names

* remove f-string for py3.4 compat
  • Loading branch information
alvinwan authored Aug 1, 2020
1 parent a7976e7 commit 5133486
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 40 deletions.
75 changes: 47 additions & 28 deletions TexSoup/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,23 @@
from TexSoup.tokens import (
TC,
tokenize,
SKIP_ENVS,
SKIP_ENV_NAMES,
MATH_ENV_NAMES,
)
import functools
import string
import sys


MATH_ENVS = (
MODE_MATH = 'mode:math'
MODE_NON_MATH = 'mode:non-math'
MATH_SIMPLE_ENVS = (
TexDisplayMathModeEnv,
TexMathModeEnv,
TexDisplayMathEnv,
TexMathEnv
)
MATH_TOKEN_TO_ENV = {env.token_begin: env for env in MATH_ENVS}
MATH_TOKEN_TO_ENV = {env.token_begin: env for env in MATH_SIMPLE_ENVS}
ARG_BEGIN_TO_ENV = {arg.token_begin: arg for arg in arg_type}

SIGNATURES = {
Expand All @@ -44,7 +47,7 @@ def read_tex(buf, skip_envs=(), tolerance=0):
"""
while buf.hasNext():
yield read_expr(buf,
skip_envs=SKIP_ENVS + skip_envs,
skip_envs=SKIP_ENV_NAMES + skip_envs,
tolerance=tolerance)


Expand Down Expand Up @@ -74,32 +77,36 @@ def wrapper(buf, *args, **kwargs):
return wrapper


def read_expr(src, skip_envs=(), tolerance=0):
def read_expr(src, skip_envs=(), tolerance=0, mode=MODE_NON_MATH):
r"""Read next expression from buffer
:param Buffer src: a buffer of tokens
:param Tuple[str] skip_envs: environments to skip parsing
:param int tolerance: error tolerance level (only supports 0 or 1)
:param str mode: math or not math mode
:return: parsed expression
:rtype: [TexExpr, Token]
"""
c = next(src)
if c.category in MATH_TOKEN_TO_ENV.keys():
expr = MATH_TOKEN_TO_ENV[c.category]([], position=c.position)
return read_math_env(src, expr)
return read_math_env(src, expr, tolerance=tolerance)
elif c.category == TC.Escape:
name, args = read_command(src, tolerance=tolerance)
name, args = read_command(src, tolerance=tolerance, mode=mode)
if name == 'item':
assert mode != MODE_MATH, 'Command \item invalid in math mode.'
contents = read_item(src)
expr = TexCmd(name, contents, args, position=c.position)
elif name == 'begin':
assert args, 'Begin command must be followed by an env name.'
expr = TexNamedEnv(
args[0].string, args=args[1:], position=c.position)
if expr.name in MATH_ENV_NAMES:
mode = MODE_MATH
if expr.name in skip_envs:
read_skip_env(src, expr)
else:
read_env(src, expr, tolerance=tolerance)
read_env(src, expr, tolerance=tolerance, mode=mode)
else:
expr = TexCmd(name, args=args, position=c.position)
return expr
Expand Down Expand Up @@ -170,6 +177,7 @@ def unclosed_env_handler(src, expr, end):
:param Buffer src: a buffer of tokens
:param TexExpr expr: expression for the environment
:param int tolerance: error tolerance level (only supports 0 or 1)
:param str end: Actual end token (as opposed to expected)
"""
clo = CharToLineOffset(str(src))
Expand All @@ -179,7 +187,7 @@ def unclosed_env_handler(src, expr, end):
line, offset, expr.name, expr.end, explanation))


def read_math_env(src, expr):
def read_math_env(src, expr, tolerance=0):
r"""Read the environment from buffer.
Advances the buffer until right after the end of the environment. Adds
Expand All @@ -197,11 +205,13 @@ def read_math_env(src, expr):
...
EOFError: [Line: 0, Offset: 7] "$" env expecting $. Reached end of file.
"""
content = src.forward_until(lambda c: c.category == expr.token_end)
contents = []
while src.hasNext() and src.peek().category != expr.token_end:
contents.append(read_expr(src, tolerance=tolerance, mode=MODE_MATH))
if not src.hasNext() or src.peek().category != expr.token_end:
unclosed_env_handler(src, expr, src.peek())
next(src)
expr.append(content)
expr.append(*contents)
return expr


Expand Down Expand Up @@ -235,7 +245,7 @@ def condition(s): return s.startswith('\\end{%s}' % expr.name)
return expr


def read_env(src, expr, tolerance=0):
def read_env(src, expr, tolerance=0, mode=MODE_NON_MATH):
r"""Read the environment from buffer.
Advances the buffer until right after the end of the environment. Adds
Expand All @@ -244,6 +254,7 @@ def read_env(src, expr, tolerance=0):
:param Buffer src: a buffer of tokens
:param TexExpr expr: expression for the environment
:param int tolerance: error tolerance level (only supports 0 or 1)
:param str mode: math or not math mode
:rtype: TexExpr
>>> from TexSoup.category import categorize
Expand All @@ -264,10 +275,10 @@ def read_env(src, expr, tolerance=0):
while src.hasNext():
if src.peek().category == TC.Escape:
name, args = make_read_peek(read_command)(
src, 1, skip=1, tolerance=tolerance)
src, 1, skip=1, tolerance=tolerance, mode=mode)
if name == 'end':
break
contents.append(read_expr(src, tolerance=tolerance))
contents.append(read_expr(src, tolerance=tolerance, mode=mode))
error = not src.hasNext() or not args or args[0].string != expr.name
if error and tolerance == 0:
unclosed_env_handler(src, expr, src.peek((0, 6)))
Expand All @@ -284,7 +295,8 @@ def read_env(src, expr, tolerance=0):

# TODO: handle macro-weirdness e.g., \def\blah[#1][[[[[[[[#2{"#1 . #2"}
# TODO: add newcommand macro
def read_args(src, n_required=-1, n_optional=-1, args=None, tolerance=0):
def read_args(src, n_required=-1, n_optional=-1, args=None, tolerance=0,
mode=MODE_NON_MATH):
r"""Read all arguments from buffer.
This function assumes that the command name has already been parsed. By
Expand All @@ -300,6 +312,7 @@ def read_args(src, n_required=-1, n_optional=-1, args=None, tolerance=0):
:param int n_optional: Number of optional arguments. If < 0, all valid
bracket groups will be captured.
:param int tolerance: error tolerance level (only supports 0 or 1)
:param str mode: math or not math mode
:return: parsed arguments
:rtype: TexArgs
Expand All @@ -325,17 +338,18 @@ def read_args(src, n_required=-1, n_optional=-1, args=None, tolerance=0):
if n_required == 0 and n_optional == 0:
return args

n_optional = read_arg_optional(src, args, n_optional, tolerance)
n_required = read_arg_required(src, args, n_required, tolerance)
n_optional = read_arg_optional(src, args, n_optional, tolerance, mode)
n_required = read_arg_required(src, args, n_required, tolerance, mode)

if src.hasNext() and src.peek().category == TC.BracketBegin:
n_optional = read_arg_optional(src, args, n_optional, tolerance)
n_optional = read_arg_optional(src, args, n_optional, tolerance, mode)
if src.hasNext() and src.peek().category == TC.GroupBegin:
n_required = read_arg_required(src, args, n_required, tolerance)
n_required = read_arg_required(src, args, n_required, tolerance, mode)
return args


def read_arg_optional(src, args, n_optional=-1, tolerance=0):
def read_arg_optional(
src, args, n_optional=-1, tolerance=0, mode=MODE_NON_MATH):
"""Read next optional argument from buffer.
If the command has remaining optional arguments, look for:
Expand All @@ -349,6 +363,7 @@ def read_arg_optional(src, args, n_optional=-1, tolerance=0):
:param int n_optional: Number of optional arguments. If < 0, all valid
bracket groups will be captured.
:param int tolerance: error tolerance level (only supports 0 or 1)
:param str mode: math or not math mode
:return: number of remaining optional arguments
:rtype: int
"""
Expand All @@ -358,12 +373,13 @@ def read_arg_optional(src, args, n_optional=-1, tolerance=0):
if spacer:
src.backward(1)
break
args.append(read_arg(src, next(src), tolerance=tolerance))
args.append(read_arg(src, next(src), tolerance=tolerance, mode=mode))
n_optional -= 1
return n_optional


def read_arg_required(src, args, n_required=-1, tolerance=0):
def read_arg_required(
src, args, n_required=-1, tolerance=0, mode=MODE_NON_MATH):
r"""Read next required argument from buffer.
If the command has remaining required arguments, look for:
Expand All @@ -379,6 +395,7 @@ def read_arg_required(src, args, n_required=-1, tolerance=0):
:param int n_required: Number of required arguments. If < 0, all valid
brace groups will be captured.
:param int tolerance: error tolerance level (only supports 0 or 1)
:param str mode: math or not math mode
:return: number of remaining required arguments
:rtype: int
Expand All @@ -397,7 +414,8 @@ def read_arg_required(src, args, n_required=-1, tolerance=0):
spacer = read_spacer(src)

if src.hasNext() and src.peek().category == TC.GroupBegin:
args.append(read_arg(src, next(src), tolerance=tolerance))
args.append(read_arg(
src, next(src), tolerance=tolerance, mode=mode))
n_required -= 1
continue
elif src.hasNext() and n_required > 0:
Expand All @@ -411,14 +429,15 @@ def read_arg_required(src, args, n_required=-1, tolerance=0):
return n_required


def read_arg(src, c, tolerance=0):
def read_arg(src, c, tolerance=0, mode=MODE_NON_MATH):
r"""Read the argument from buffer.
Advances buffer until right before the end of the argument.
:param Buffer src: a buffer of tokens
:param str c: argument token (starting token)
:param int tolerance: error tolerance level (only supports 0 or 1)
:param str mode: math or not math mode
:return: the parsed argument
:rtype: TexGroup
Expand All @@ -439,7 +458,7 @@ def read_arg(src, c, tolerance=0):
src.forward()
return arg(*content[1:], position=c.position)
else:
content.append(read_expr(src, tolerance=tolerance))
content.append(read_expr(src, tolerance=tolerance, mode=mode))

if tolerance == 0:
clo = CharToLineOffset(str(src))
Expand Down Expand Up @@ -478,7 +497,7 @@ def read_spacer(buf):


def read_command(buf, n_required_args=-1, n_optional_args=-1, skip=0,
tolerance=0):
tolerance=0, mode=MODE_NON_MATH):
r"""Parses command and all arguments. Assumes escape has just been parsed.
No whitespace is allowed between escape and command name. e.g.,
Expand All @@ -505,7 +524,7 @@ def read_command(buf, n_required_args=-1, n_optional_args=-1, skip=0,
('item', [])
>>> buf.peek()
' aaa '
# >>> buf = Buffer(tokenize(categorize('\\sect abcd')))
# >>> _ = next(buf)
# >>> read_command(buf)
Expand All @@ -519,5 +538,5 @@ def read_command(buf, n_required_args=-1, n_optional_args=-1, skip=0,
if n_required_args < 0 and n_optional_args < 0:
n_required_args, n_optional_args = SIGNATURES.get(name, (-1, -1))
args = read_args(buf, n_required_args, n_optional_args,
tolerance=tolerance)
tolerance=tolerance, mode=mode)
return name, args
20 changes: 11 additions & 9 deletions TexSoup/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,17 @@
import string

# Custom higher-level combinations of primitives
SKIP_ENVS = ('verbatim', 'equation', 'lstlisting', 'align', 'alignat',
'equation*', 'align*', 'math', 'displaymath', 'split', 'array',
'eqnarray', 'eqnarray*', 'multline', 'multline*', 'gather',
'gather*', 'flalign', 'flalign*',
'$', '$$', r'\[', r'\]', r'\(', r'\)')
BRACKETS_DELIMITERS = {'(', ')', '<', '>', '[', ']', '{', '}',
r'\{', r'\}', '.' '|', r'\langle', r'\rangle',
r'\lfloor', '\rfloor', r'\lceil', r'\rceil',
r'\ulcorner', r'\urcorner', r'\lbrack', r'\rbrack'}
# Names of environments the reader skips parsing by default (combined with
# any caller-supplied ``skip_envs``).
SKIP_ENV_NAMES = (
    'lstlisting',
    'verbatim',
)

# Names of ``\begin{...}`` environments whose contents are read in math mode.
MATH_ENV_NAMES = (
    'align',
    'align*',
    'alignat',
    'array',
    'displaymath',
    'eqnarray',
    'eqnarray*',
    'equation',
    'equation*',
    'flalign',
    'flalign*',
    'gather',
    'gather*',
    'math',
    'multline',
    'multline*',
    'split',
)
# Bracket/delimiter tokens; presumably paired with the SIZE_PREFIX commands
# (e.g. \left, \big) to build PUNCTUATION_COMMANDS -- confirm against the
# definition that follows.
#
# NOTE: '.' and '|' must be separate entries. Adjacent string literals with no
# comma between them are implicitly concatenated, which previously produced the
# single bogus entry '.|'. Likewise every backslash command must be a raw
# string; a plain '\rfloor' starts with a carriage return, not a backslash.
BRACKETS_DELIMITERS = {
    '(', ')', '<', '>', '[', ']', '{', '}', r'\{', r'\}', '.', '|',
    r'\langle', r'\rangle', r'\lfloor', r'\rfloor', r'\lceil', r'\rceil',
    r'\ulcorner', r'\urcorner', r'\lbrack', r'\rbrack',
}
# TODO: looks like left-right do have to match
SIZE_PREFIX = ('left', 'right', 'big', 'Big', 'bigg', 'Bigg')
PUNCTUATION_COMMANDS = {command + bracket
Expand Down
15 changes: 15 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,21 @@ def test_access_position(chikin):
assert clo(chikin.section.position) == (4, 0)


def test_math_env_change():
    """Commands inside math environments can be located and rewritten.

    Exercised for a named math environment (``align``) and for a ``$$...$$``
    environment: the arguments of every ``\\infer`` command are reversed and
    the serialized document must reflect the change.
    """
    # Named math environment.
    aligned = TexSoup(r'\begin{align}\infer{A}{B}\infer{C}{D}\end{align}')
    assert aligned.infer is not None, repr(aligned.expr)
    for command in aligned.find_all('infer'):
        command.args = command.args[::-1]
    assert str(aligned) == r'\begin{align}\infer{B}{A}\infer{D}{C}\end{align}'

    # Dollar-delimited math environment.
    display = TexSoup(r'$$\infer{A}{B}\infer{C}{D}$$')
    assert display.infer is not None, repr(display.expr)
    for command in display.find_all('infer'):
        command.args = command.args[::-1]
    assert str(display) == r'$$\infer{B}{A}\infer{D}{C}$$'


#########
# TEXT #
########
Expand Down
6 changes: 3 additions & 3 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,11 +410,11 @@ def test_non_letter_commands():
(whether valid or not).
"""
for punctuation in '!@#$%^&*_+-=~`<>,./?;:|':
tex = rf"""
tex = r"""
\begin{{document}}
\lstinline{{\{punctuation} Word [a-z]+}}
\lstinline{{\{} Word [a-z]+}}
\end{{document}}
"""
""".format(punctuation)
soup = TexSoup(tex)
assert str(soup) == tex

Expand Down

0 comments on commit 5133486

Please sign in to comment.