From 084c3ddc49defc8fc27debe89e4b5a371a9a909e Mon Sep 17 00:00:00 2001 From: Michael Weiss Date: Mon, 18 Sep 2023 22:36:49 +0200 Subject: [PATCH] :sparkles: handle concatenated fields with inner quotes in splitter (#398) This allows the splitter to correctly handle #-based string concatenation. Note: This will still lead to downstream problems, as some of these concatenated fields will not have a recognized enclosing, and as string interpolation does not yet work with concatenated references. However, these cases did not work before either and this PR does not (knowingly) introduce any regressions. The hereby mentioned problems will be addressed in a subsequent PR. This is the first PR to address (but not yet close) #396 --- bibtexparser/splitter.py | 93 +++++++++++++-------- tests/splitter_tests/test_splitter_entry.py | 59 +++++++++++++ 2 files changed, 117 insertions(+), 35 deletions(-) diff --git a/bibtexparser/splitter.py b/bibtexparser/splitter.py index fd831a9..72ce58e 100644 --- a/bibtexparser/splitter.py +++ b/bibtexparser/splitter.py @@ -128,19 +128,64 @@ def _move_to_closed_bracket(self) -> int: end_index=m.start() - 1, ) - def _move_to_end_of_double_quoted_string(self) -> int: - """Index of the closing double quote.""" + def _move_to_comma_or_closing_curly_bracket( + self, currently_quote_escaped=False, num_open_curls=0 + ) -> int: + """Index of the end of the field, taking quote-escape into account.""" + + if num_open_curls > 0 and currently_quote_escaped: + raise ParserStateException( + message="Internal error in parser. " + "Found a field-value that is both quote-escaped and curly-escaped. " + "Please report this bug." 
+ ) + + def _is_escaped(): + return currently_quote_escaped or num_open_curls > 0 + + # iterate over marks until we find end of field while True: - m = self._next_mark(accept_eof=False) + next_mark = self._next_mark(accept_eof=False) + + # Handle "escape" characters + if next_mark.group(0) == '"' and not num_open_curls > 0: + currently_quote_escaped = not currently_quote_escaped + continue + elif next_mark.group(0) == "{" and not currently_quote_escaped: + num_open_curls += 1 + continue + elif ( + next_mark.group(0) == "}" + and not currently_quote_escaped + and num_open_curls > 0 + ): + num_open_curls -= 1 + continue + + # Check for end of field + elif next_mark.group(0) == "," and not _is_escaped(): + self._unaccepted_mark = next_mark + return next_mark.start() + # Check for end of entry: + elif next_mark.group(0) == "}" and not _is_escaped(): + self._unaccepted_mark = next_mark + return next_mark.start() + + # Sanity-check: If new block is starting, we abort + elif next_mark.group(0).startswith("@"): + self._unaccepted_mark = next_mark + + if currently_quote_escaped: + looking_for = '`"`' + elif num_open_curls > 0: + looking_for = "`}`" + else: + looking_for = "`,` or `}`" - if m.group(0) == '"': - return m.start() - elif m.group(0).startswith("@"): - self._unaccepted_mark = m raise BlockAbortedException( - abort_reason=f"Unexpected block start: `{m.group(0)}`. " - f'Was still looking for field-value closing `"`', - end_index=m.start() - 1, + abort_reason=f"Unexpected block start: `{next_mark.group(0)}`. 
" + f"Was still looking for field-value closing {looking_for} ", + end_index=next_mark.start() - 1, ) def _move_to_end_of_entry( @@ -171,31 +216,9 @@ def _move_to_end_of_entry( start_line = self._current_line key_end = equals_mark.start() value_start = equals_mark.end() - value_start_mark = self._next_mark(accept_eof=False) - - if value_start_mark.group(0) == "{": - value_end = self._move_to_closed_bracket() + 1 - elif value_start_mark.group(0) == '"': - value_end = self._move_to_end_of_double_quoted_string() + 1 - else: - # e.g. String reference or integer. Ended by the observed mark - # (as there is no start mark). - # Should be either a comma or a "}" - value_start = equals_mark.end() - value_end = value_start_mark.start() - # We expect a comma (after a closed field-value), or at the end of entry, a closing bracket - if not value_start_mark.group(0) in [ - ",", - "}", - ]: - self._unaccepted_mark = value_start_mark - raise BlockAbortedException( - abort_reason=f"Unexpected character `{value_start_mark.group(0)}` " - f"after field-value. Expected a comma or closing bracket.", - end_index=value_start_mark.start(), - ) - # Put comma back into stream, as still expected. 
- self._unaccepted_mark = value_start_mark + value_end = self._move_to_comma_or_closing_curly_bracket( + currently_quote_escaped=False, num_open_curls=0 + ) key = self.bibstr[key_start:key_end].strip() value = self.bibstr[value_start:value_end].strip() diff --git a/tests/splitter_tests/test_splitter_entry.py b/tests/splitter_tests/test_splitter_entry.py index 9f2806d..ca67439 100644 --- a/tests/splitter_tests/test_splitter_entry.py +++ b/tests/splitter_tests/test_splitter_entry.py @@ -198,6 +198,65 @@ def test_entry_without_fields(entry_without_fields: str): assert len(library.entries[1].fields) == 1 +@pytest.mark.parametrize( + "entry, expected", + [ + # See issue #396 + pytest.param( + r'@INBOOK{inbook-full, relevant_field = 10 # "~" # jan}', + r'10 # "~" # jan', + id="inner quotes", + ), + pytest.param( + r'@INBOOK{inbook-full, relevant_field = 10 # "~" # jan,}', + r'10 # "~" # jan', + id="inner quotes + comma", + ), + pytest.param( + r'@INBOOK{inbook-full, relevant_field = 10 # "~" # jan, author = "Paul"}', + r'10 # "~" # jan', + id="inner quotes + other field", + ), + pytest.param( + r'@INBOOK{inbook-full, relevant_field = "~" # jan}', + r'"~" # jan', + id=r"starting quotes", + ), + pytest.param( + r'@INBOOK{inbook-full, relevant_field = "~" # jan, }', + r'"~" # jan', + id=r"starting quotes + comma", + ), + pytest.param( + r'@INBOOK{inbook-full, relevant_field = "~" # jan, author = "Paul"}', + r'"~" # jan', + id="starting quotes + other field", + ), + pytest.param( + r'@INBOOK{inbook-full, relevant_field = jan # "~"}', + r'jan # "~"', + id=r"ending quotes", + ), + pytest.param( + r'@INBOOK{inbook-full, relevant_field = jan # "~",}', + r'jan # "~"', + id=r"ending quotes + comma", + ), + pytest.param( + r'@INBOOK{inbook-full, relevant_field = jan # "~", author = "Paul"}', + r'jan # "~"', + id="ending quotes + other field", + ), + ], +) +def test_entry_with_concatenated_field(entry, expected): + """For motivation why we need this, please see issue #384""" + 
library: Library = Splitter(entry).split() + assert len(library.entries) == 1 + assert len(library.failed_blocks) == 0 + assert library.entries[0]["relevant_field"] == expected + + @pytest.mark.parametrize( "entry", [