Parse math environments (#108)

* new MATH_ENVS const, remove from SKIP_ENVS * added support for parse modes (math v. not math) * read math envs + test, change constant names * remove f-string for py3.4 compat
alvinwan · Aug 1, 2020 · 5133486 · 5133486
1 parent a7976e7
commit 5133486
Show file tree

Hide file tree

Showing 4 changed files with 76 additions and 40 deletions.
diff --git a/TexSoup/reader.py b/TexSoup/reader.py
@@ -7,20 +7,23 @@
 from TexSoup.tokens import (
     TC,
     tokenize,
-    SKIP_ENVS,
+    SKIP_ENV_NAMES,
+    MATH_ENV_NAMES,
 )
 import functools
 import string
 import sys
 
 
-MATH_ENVS = (
+MODE_MATH = 'mode:math'
+MODE_NON_MATH = 'mode:non-math'
+MATH_SIMPLE_ENVS = (
     TexDisplayMathModeEnv,
     TexMathModeEnv,
     TexDisplayMathEnv,
     TexMathEnv
 )
-MATH_TOKEN_TO_ENV = {env.token_begin: env for env in MATH_ENVS}
+MATH_TOKEN_TO_ENV = {env.token_begin: env for env in MATH_SIMPLE_ENVS}
 ARG_BEGIN_TO_ENV = {arg.token_begin: arg for arg in arg_type}
 
 SIGNATURES = {
@@ -44,7 +47,7 @@ def read_tex(buf, skip_envs=(), tolerance=0):
     """
     while buf.hasNext():
         yield read_expr(buf,
-                        skip_envs=SKIP_ENVS + skip_envs,
+                        skip_envs=SKIP_ENV_NAMES + skip_envs,
                         tolerance=tolerance)
 
 
@@ -74,32 +77,36 @@ def wrapper(buf, *args, **kwargs):
     return wrapper
 
 
-def read_expr(src, skip_envs=(), tolerance=0):
+def read_expr(src, skip_envs=(), tolerance=0, mode=MODE_NON_MATH):
     r"""Read next expression from buffer
 
     :param Buffer src: a buffer of tokens
     :param Tuple[str] skip_envs: environments to skip parsing
     :param int tolerance: error tolerance level (only supports 0 or 1)
+    :param str mode: math or not math mode
     :return: parsed expression
     :rtype: [TexExpr, Token]
     """
     c = next(src)
     if c.category in MATH_TOKEN_TO_ENV.keys():
         expr = MATH_TOKEN_TO_ENV[c.category]([], position=c.position)
-        return read_math_env(src, expr)
+        return read_math_env(src, expr, tolerance=tolerance)
     elif c.category == TC.Escape:
-        name, args = read_command(src, tolerance=tolerance)
+        name, args = read_command(src, tolerance=tolerance, mode=mode)
         if name == 'item':
+            assert mode != MODE_MATH, 'Command \item invalid in math mode.'
             contents = read_item(src)
             expr = TexCmd(name, contents, args, position=c.position)
         elif name == 'begin':
             assert args, 'Begin command must be followed by an env name.'
             expr = TexNamedEnv(
                 args[0].string, args=args[1:], position=c.position)
+            if expr.name in MATH_ENV_NAMES:
+                mode = MODE_MATH
             if expr.name in skip_envs:
                 read_skip_env(src, expr)
             else:
-                read_env(src, expr, tolerance=tolerance)
+                read_env(src, expr, tolerance=tolerance, mode=mode)
         else:
             expr = TexCmd(name, args=args, position=c.position)
         return expr
@@ -170,6 +177,7 @@ def unclosed_env_handler(src, expr, end):
 
     :param Buffer src: a buffer of tokens
     :param TexExpr expr: expression for the environment
+    :param int tolerance: error tolerance level (only supports 0 or 1)
     :param end str: Actual end token (as opposed to expected)
     """
     clo = CharToLineOffset(str(src))
@@ -179,7 +187,7 @@ def unclosed_env_handler(src, expr, end):
         line, offset, expr.name, expr.end, explanation))
 
 
-def read_math_env(src, expr):
+def read_math_env(src, expr, tolerance=0):
     r"""Read the environment from buffer.
 
     Advances the buffer until right after the end of the environment. Adds
@@ -197,11 +205,13 @@ def read_math_env(src, expr):
         ...
     EOFError: [Line: 0, Offset: 7] "$" env expecting $. Reached end of file.
     """
-    content = src.forward_until(lambda c: c.category == expr.token_end)
+    contents = []
+    while src.hasNext() and src.peek().category != expr.token_end:
+        contents.append(read_expr(src, tolerance=tolerance, mode=MODE_MATH))
     if not src.hasNext() or src.peek().category != expr.token_end:
         unclosed_env_handler(src, expr, src.peek())
     next(src)
-    expr.append(content)
+    expr.append(*contents)
     return expr
 
 
@@ -235,7 +245,7 @@ def condition(s): return s.startswith('\\end{%s}' % expr.name)
     return expr
 
 
-def read_env(src, expr, tolerance=0):
+def read_env(src, expr, tolerance=0, mode=MODE_NON_MATH):
     r"""Read the environment from buffer.
 
     Advances the buffer until right after the end of the environment. Adds
@@ -244,6 +254,7 @@ def read_env(src, expr, tolerance=0):
     :param Buffer src: a buffer of tokens
     :param TexExpr expr: expression for the environment
     :param int tolerance: error tolerance level (only supports 0 or 1)
+    :param str mode: math or not math mode
     :rtype: TexExpr
 
     >>> from TexSoup.category import categorize
@@ -264,10 +275,10 @@ def read_env(src, expr, tolerance=0):
     while src.hasNext():
         if src.peek().category == TC.Escape:
             name, args = make_read_peek(read_command)(
-                src, 1, skip=1, tolerance=tolerance)
+                src, 1, skip=1, tolerance=tolerance, mode=mode)
             if name == 'end':
                 break
-        contents.append(read_expr(src, tolerance=tolerance))
+        contents.append(read_expr(src, tolerance=tolerance, mode=mode))
     error = not src.hasNext() or not args or args[0].string != expr.name
     if error and tolerance == 0:
         unclosed_env_handler(src, expr, src.peek((0, 6)))
@@ -284,7 +295,8 @@ def read_env(src, expr, tolerance=0):
 
 # TODO: handle macro-weirdness e.g., \def\blah[#1][[[[[[[[#2{"#1 . #2"}
 # TODO: add newcommand macro
-def read_args(src, n_required=-1, n_optional=-1, args=None, tolerance=0):
+def read_args(src, n_required=-1, n_optional=-1, args=None, tolerance=0,
+        mode=MODE_NON_MATH):
     r"""Read all arguments from buffer.
 
     This function assumes that the command name has already been parsed. By
@@ -300,6 +312,7 @@ def read_args(src, n_required=-1, n_optional=-1, args=None, tolerance=0):
     :param int n_optional: Number of optional arguments. If < 0, all valid
                            bracket groups will be captured.
     :param int tolerance: error tolerance level (only supports 0 or 1)
+    :param str mode: math or not math mode
     :return: parsed arguments
     :rtype: TexArgs
 
@@ -325,17 +338,18 @@ def read_args(src, n_required=-1, n_optional=-1, args=None, tolerance=0):
     if n_required == 0 and n_optional == 0:
         return args
 
-    n_optional = read_arg_optional(src, args, n_optional, tolerance)
-    n_required = read_arg_required(src, args, n_required, tolerance)
+    n_optional = read_arg_optional(src, args, n_optional, tolerance, mode)
+    n_required = read_arg_required(src, args, n_required, tolerance, mode)
 
     if src.hasNext() and src.peek().category == TC.BracketBegin:
-        n_optional = read_arg_optional(src, args, n_optional, tolerance)
+        n_optional = read_arg_optional(src, args, n_optional, tolerance, mode)
     if src.hasNext() and src.peek().category == TC.GroupBegin:
-        n_required = read_arg_required(src, args, n_required, tolerance)
+        n_required = read_arg_required(src, args, n_required, tolerance, mode)
     return args
 
 
-def read_arg_optional(src, args, n_optional=-1, tolerance=0):
+def read_arg_optional(
+        src, args, n_optional=-1, tolerance=0, mode=MODE_NON_MATH):
     """Read next optional argument from buffer.
 
     If the command has remaining optional arguments, look for:
@@ -349,6 +363,7 @@ def read_arg_optional(src, args, n_optional=-1, tolerance=0):
     :param int n_optional: Number of optional arguments. If < 0, all valid
                            bracket groups will be captured.
     :param int tolerance: error tolerance level (only supports 0 or 1)
+    :param str mode: math or not math mode
     :return: number of remaining optional arguments
     :rtype: int
     """
@@ -358,12 +373,13 @@ def read_arg_optional(src, args, n_optional=-1, tolerance=0):
             if spacer:
                 src.backward(1)
             break
-        args.append(read_arg(src, next(src), tolerance=tolerance))
+        args.append(read_arg(src, next(src), tolerance=tolerance, mode=mode))
         n_optional -= 1
     return n_optional
 
 
-def read_arg_required(src, args, n_required=-1, tolerance=0):
+def read_arg_required(
+        src, args, n_required=-1, tolerance=0, mode=MODE_NON_MATH):
     r"""Read next required argument from buffer.
 
     If the command has remaining required arguments, look for:
@@ -379,6 +395,7 @@ def read_arg_required(src, args, n_required=-1, tolerance=0):
     :param int n_required: Number of required arguments. If < 0, all valid
                            brace groups will be captured.
     :param int tolerance: error tolerance level (only supports 0 or 1)
+    :param str mode: math or not math mode
     :return: number of remaining optional arguments
     :rtype: int
 
@@ -397,7 +414,8 @@ def read_arg_required(src, args, n_required=-1, tolerance=0):
         spacer = read_spacer(src)
 
         if src.hasNext() and src.peek().category == TC.GroupBegin:
-            args.append(read_arg(src, next(src), tolerance=tolerance))
+            args.append(read_arg(
+                src, next(src), tolerance=tolerance, mode=mode))
             n_required -= 1
             continue
         elif src.hasNext() and n_required > 0:
@@ -411,14 +429,15 @@ def read_arg_required(src, args, n_required=-1, tolerance=0):
     return n_required
 
 
-def read_arg(src, c, tolerance=0):
+def read_arg(src, c, tolerance=0, mode=MODE_NON_MATH):
     r"""Read the argument from buffer.
 
     Advances buffer until right before the end of the argument.
 
     :param Buffer src: a buffer of tokens
     :param str c: argument token (starting token)
     :param int tolerance: error tolerance level (only supports 0 or 1)
+    :param str mode: math or not math mode
     :return: the parsed argument
     :rtype: TexGroup
 
@@ -439,7 +458,7 @@ def read_arg(src, c, tolerance=0):
             src.forward()
             return arg(*content[1:], position=c.position)
         else:
-            content.append(read_expr(src, tolerance=tolerance))
+            content.append(read_expr(src, tolerance=tolerance, mode=mode))
 
     if tolerance == 0:
         clo = CharToLineOffset(str(src))
@@ -478,7 +497,7 @@ def read_spacer(buf):
 
 
 def read_command(buf, n_required_args=-1, n_optional_args=-1, skip=0,
-                 tolerance=0):
+                 tolerance=0, mode=MODE_NON_MATH):
     r"""Parses command and all arguments. Assumes escape has just been parsed.
 
     No whitespace is allowed between escape and command name. e.g.,
@@ -505,7 +524,7 @@ def read_command(buf, n_required_args=-1, n_optional_args=-1, skip=0,
     ('item', [])
     >>> buf.peek()
     ' aaa '
-    
+
     # >>> buf = Buffer(tokenize(categorize('\\sect abcd')))
     # >>> _ = next(buf)
     # >>> read_command(buf)
@@ -519,5 +538,5 @@ def read_command(buf, n_required_args=-1, n_optional_args=-1, skip=0,
     if n_required_args < 0 and n_optional_args < 0:
         n_required_args, n_optional_args = SIGNATURES.get(name, (-1, -1))
     args = read_args(buf, n_required_args, n_optional_args,
-                     tolerance=tolerance)
+                     tolerance=tolerance, mode=mode)
     return name, args
diff --git a/TexSoup/tokens.py b/TexSoup/tokens.py
@@ -12,15 +12,17 @@
 import string
 
 # Custom higher-level combinations of primitives
-SKIP_ENVS = ('verbatim', 'equation', 'lstlisting', 'align', 'alignat',
-             'equation*', 'align*', 'math', 'displaymath', 'split', 'array',
-             'eqnarray', 'eqnarray*', 'multline', 'multline*', 'gather',
-             'gather*', 'flalign', 'flalign*',
-             '$', '$$', r'\[', r'\]', r'\(', r'\)')
-BRACKETS_DELIMITERS = {'(', ')', '<', '>', '[', ']', '{', '}',
-                       r'\{', r'\}', '.' '|', r'\langle', r'\rangle',
-                       r'\lfloor', '\rfloor', r'\lceil', r'\rceil',
-                       r'\ulcorner', r'\urcorner', r'\lbrack', r'\rbrack'}
+SKIP_ENV_NAMES = ('lstlisting', 'verbatim')
+MATH_ENV_NAMES = (
+    'align', 'align*', 'alignat', 'array', 'displaymath', 'eqnarray',
+    'eqnarray*', 'equation', 'equation*', 'flalign', 'flalign*', 'gather',
+    'gather*', 'math', 'multline', 'multline*', 'split'
+)
+BRACKETS_DELIMITERS = {
+    '(', ')', '<', '>', '[', ']', '{', '}', r'\{', r'\}', '.' '|', r'\langle',
+    r'\rangle', r'\lfloor', '\rfloor', r'\lceil', r'\rceil', r'\ulcorner',
+    r'\urcorner', r'\lbrack', r'\rbrack'
+}
 # TODO: looks like left-right do have to match
 SIZE_PREFIX = ('left', 'right', 'big', 'Big', 'bigg', 'Bigg')
 PUNCTUATION_COMMANDS = {command + bracket

diff --git a/tests/test_api.py b/tests/test_api.py
@@ -186,6 +186,21 @@ def test_access_position(chikin):
     assert clo(chikin.section.position) == (4, 0)
 
 
+def test_math_env_change():
+    """Tests that commands in math environments can be found / modified"""
+    soup = TexSoup(r'\begin{align}\infer{A}{B}\infer{C}{D}\end{align}')
+    assert soup.infer is not None, repr(soup.expr)
+    for infer in soup.find_all('infer'):
+        infer.args = infer.args[::-1]
+    assert str(soup) == r'\begin{align}\infer{B}{A}\infer{D}{C}\end{align}'
+
+    soup = TexSoup(r'$$\infer{A}{B}\infer{C}{D}$$')
+    assert soup.infer is not None, repr(soup.expr)
+    for infer in soup.find_all('infer'):
+        infer.args = infer.args[::-1]
+    assert str(soup) == r'$$\infer{B}{A}\infer{D}{C}$$'
+
+
 #########
 # TEXT #
 ########

diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -410,11 +410,11 @@ def test_non_letter_commands():
     (whether valid or not).
     """
     for punctuation in '!@#$%^&*_+-=~`<>,./?;:|':
-        tex = rf"""
+        tex = r"""
         \begin{{document}}
-        \lstinline{{\{punctuation} Word [a-z]+}}
+        \lstinline{{\{} Word [a-z]+}}
         \end{{document}}
-        """
+        """.format(punctuation)
         soup = TexSoup(tex)
         assert str(soup) == tex