From f0e5c2eb6e35d2a5c25006616588918375c8891a Mon Sep 17 00:00:00 2001 From: Tyler Gregg Date: Fri, 27 Sep 2024 15:51:23 -0700 Subject: [PATCH] Adds support for reading Ion 1.1 text syntax for e-expressions and expression groups. --- .../amazon/ion/impl/IonReaderTextRawX.java | 160 ++++++++++++------ .../amazon/ion/impl/IonReaderTextUserX.java | 4 +- .../ion/impl/IonRawTextReaderTest_1_1.java | 142 ++++++++++++++++ 3 files changed, 248 insertions(+), 58 deletions(-) create mode 100644 src/test/java/com/amazon/ion/impl/IonRawTextReaderTest_1_1.java diff --git a/src/main/java/com/amazon/ion/impl/IonReaderTextRawX.java b/src/main/java/com/amazon/ion/impl/IonReaderTextRawX.java index 4449ef4122..f2a7a72fde 100644 --- a/src/main/java/com/amazon/ion/impl/IonReaderTextRawX.java +++ b/src/main/java/com/amazon/ion/impl/IonReaderTextRawX.java @@ -1,18 +1,5 @@ -/* - * Copyright 2007-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"). - * You may not use this file except in compliance with the License. - * A copy of the License is located at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * or in the "license" file accompanying this file. This file is distributed - * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either - * express or implied. See the License for the specific language governing - * permissions and limitations under the License. - */ - +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 package com.amazon.ion.impl; import static com.amazon.ion.SymbolTable.UNKNOWN_SYMBOL_ID; @@ -30,7 +17,6 @@ import com.amazon.ion.impl._Private_ScalarConversions.AS_TYPE; import com.amazon.ion.impl._Private_ScalarConversions.ValueVariant; import java.io.IOException; -import java.math.BigInteger; import java.util.Iterator; /** @@ -72,7 +58,6 @@ abstract class IonReaderTextRawX implements IonTextReader { - public abstract BigInteger bigIntegerValue(); // static final boolean _object_parser = false; static final boolean _debug = false; @@ -126,7 +111,10 @@ private final String get_state_name(int state) { static final int ACTION_FINISH_LOB = 13; static final int ACTION_FINISH_DATAGRAM = 14; static final int ACTION_EOF = 15; - static final int ACTION_count = 16; + + static final int ACTION_START_E_EXPRESSION = 16; + static final int ACTION_START_EXPRESSION_GROUP = 17; + @SuppressWarnings("unused") private final String get_action_name(int action) { switch(action) { @@ -144,6 +132,8 @@ private final String get_action_name(int action) { case ACTION_FINISH_CONTAINER: return "ACTION_FINISH_CONTAINER"; case ACTION_FINISH_LOB: return "ACTION_FINISH_LOB"; case ACTION_FINISH_DATAGRAM: return "ACTION_FINISH_DATAGRAM"; + case ACTION_START_E_EXPRESSION: return "ACTION_START_E_EXPRESSION"; + case ACTION_START_EXPRESSION_GROUP: return "ACTION_START_EXPRESSION_GROUP"; case ACTION_EOF: return "ACTION_EOF"; default: return ""; } @@ -190,6 +180,8 @@ static final int[][] makeTransitionActionArray() actions[STATE_BEFORE_ANNOTATION_SEXP][IonTokenConstsX.TOKEN_EOF] = 0; actions[STATE_BEFORE_ANNOTATION_SEXP][IonTokenConstsX.TOKEN_SYMBOL_OPERATOR] = ACTION_LOAD_SCALAR; actions[STATE_BEFORE_ANNOTATION_SEXP][IonTokenConstsX.TOKEN_DOT] = ACTION_LOAD_SCALAR; + actions[STATE_BEFORE_ANNOTATION_SEXP][IonTokenConstsX.TOKEN_COLON] = ACTION_START_E_EXPRESSION; + actions[STATE_BEFORE_ANNOTATION_SEXP][IonTokenConstsX.TOKEN_DOUBLE_COLON] = ACTION_START_EXPRESSION_GROUP; actions[STATE_BEFORE_ANNOTATION_SEXP][IonTokenConstsX.TOKEN_CLOSE_PAREN] = ACTION_FINISH_CONTAINER; actions[STATE_BEFORE_ANNOTATION_SEXP][IonTokenConstsX.TOKEN_CLOSE_BRACE] = ACTION_FINISH_CONTAINER; actions[STATE_BEFORE_ANNOTATION_SEXP][IonTokenConstsX.TOKEN_CLOSE_SQUARE] = ACTION_FINISH_CONTAINER; @@ -252,6 +244,45 @@ static int[] makeTransition2ActionArray() { return a; } + private class ContainerState { + IonType type; + boolean isEExpression; + boolean isExpressionGroup; + + private void setFlags() { + switch (type) { + case LIST: + _container_is_struct = false; + _container_prohibits_commas = false; + break; + case DATAGRAM: + case SEXP: + _container_is_struct = false; + _container_prohibits_commas = true; + break; + case STRUCT: + _container_is_struct = true; + _container_prohibits_commas = false; + break; + default: + throw new IllegalStateException("type must be a container, not a " + type); + } + } + + void init(IonType type) { + this.type = type; + this.isEExpression = _container_is_e_expression; + this.isExpressionGroup = _container_is_expression_group; + setFlags(); + } + + void restore() { + _container_is_e_expression = isEExpression; + _container_is_expression_group = isExpressionGroup; + setFlags(); + } + } + // // actual class members (preceding values are just parsing // control constants). @@ -262,10 +293,13 @@ static int[] makeTransition2ActionArray() { boolean _eof; int _state; - IonType[] _container_state_stack = new IonType[DEFAULT_STACK_DEPTH]; + ContainerState[] _container_state_stack = new ContainerState[DEFAULT_STACK_DEPTH]; int _container_state_top; boolean _container_is_struct; // helper bool's set on push and pop and used boolean _container_prohibits_commas; // frequently during state transitions actions + boolean _container_is_e_expression; + boolean _container_is_expression_group; + boolean _is_expression_syntax_allowed; boolean _has_next_called; IonType _value_type; @@ -295,13 +329,25 @@ enum LOB_STATE { EMPTY, READ, FINISHED } LOB_STATE _lob_loaded; byte[] _lob_bytes; int _lob_actual_len; + int minorVersion = 0; protected IonReaderTextRawX() { super(); _nesting_parent = null; + for (int i = 0; i < _container_state_stack.length; i++) { + _container_state_stack[i] = new ContainerState(); + } } + /** + * Sets the Ion minor version. + * @param minorVersion the version. + */ + void setMinorVersion(int minorVersion) { + this.minorVersion = minorVersion; + _is_expression_syntax_allowed = false; + } /** * @return This implementation always returns null. @@ -350,6 +396,9 @@ protected final void re_init(UnifiedInputStreamX iis _container_state_top = 0; _container_is_struct = false; _container_prohibits_commas = false; + _container_is_e_expression = false; + _container_is_expression_group = false; + _is_expression_syntax_allowed = false; _has_next_called = false; _value_type = null; _value_keyword = 0; @@ -463,6 +512,8 @@ protected final boolean has_next_raw_value() { finish_value(null); clear_value(); parse_to_next_value(); + // Any expression syntax for the current container must have already occurred. + _is_expression_syntax_allowed = false; } catch (IOException e) { throw new IonException(e); @@ -547,29 +598,6 @@ private final void clear_value() _value_start_offset = -1; } - private final void set_container_flags(IonType t) { - switch (t) { - case LIST: - _container_is_struct = false; - _container_prohibits_commas = false; - break; - case SEXP: - _container_is_struct = false; - _container_prohibits_commas = true; - break; - case STRUCT: - _container_is_struct = true; - _container_prohibits_commas = false; - break; - case DATAGRAM: - _container_is_struct = false; - _container_prohibits_commas = true; - break; - default: - throw new IllegalArgumentException("type must be a container, not a "+t.toString()); - } - } - private int get_state_after_value() { int state_after_scalar; @@ -599,7 +627,7 @@ private final int get_state_after_annotation() { int state_after_annotation; switch(get_state_int()) { case STATE_AFTER_VALUE_CONTENTS: - IonType container = top_state(); + IonType container = top_state().type; switch(container) { case STRUCT: case LIST: @@ -633,13 +661,13 @@ private final int get_state_after_annotation() { } private final int get_state_after_container() { - IonType container = top_state(); + IonType container = top_state().type; int new_state = get_state_after_container(container); return new_state; } private final int get_state_after_container(int token) { - IonType container = top_state(); + IonType container = top_state().type; switch(container) { case STRUCT: @@ -772,7 +800,6 @@ private final SymbolToken parseSymbolToken(String context, return new SymbolTokenImpl(text, sid); } - protected final void parse_to_next_value() throws IOException { int t; @@ -1061,6 +1088,25 @@ else if (t == IonTokenConstsX.TOKEN_DOT) { set_state(STATE_EOF); _eof = true; return; + case ACTION_START_E_EXPRESSION: + case ACTION_START_EXPRESSION_GROUP: + _container_is_e_expression = action == ACTION_START_E_EXPRESSION; + _container_is_expression_group = action == ACTION_START_EXPRESSION_GROUP; + if (!_is_expression_syntax_allowed) { + parse_error(String.format("unexpected token encountered: %s", IonTokenConstsX.getTokenName(t))); + } + ContainerState top = top_state(); + top.isEExpression = _container_is_e_expression; + top.isExpressionGroup = _container_is_expression_group; + set_state(STATE_BEFORE_ANNOTATION_SEXP); + _scanner.tokenIsFinished(); + // Reset the current value start used to define a span, since the :: + // isn't part of the span when it's hoisted. + _value_start_offset = _scanner.getStartingOffset(); + // Since expression syntax was already found, it is not allowed again. + _is_expression_syntax_allowed = false; + t = _scanner.nextToken(); + break; default: parse_error("unexpected token encountered: "+IonTokenConstsX.getTokenName(t)); } } @@ -1200,17 +1246,19 @@ private final void push_container_state(IonType newContainer) int oldlen = _container_state_stack.length; if (_container_state_top >= oldlen) { int newlen = oldlen * 2; - IonType[] temp = new IonType[newlen]; + ContainerState[] temp = new ContainerState[newlen]; System.arraycopy(_container_state_stack, 0, temp, 0, oldlen); + for (int i = oldlen; i < temp.length; i++) { + temp[i] = new ContainerState(); + } _container_state_stack = temp; } - set_container_flags(newContainer); - _container_state_stack[_container_state_top++] = newContainer; + _container_state_stack[_container_state_top++].init(newContainer); } private final void pop_container_state() { _container_state_top--; - set_container_flags(top_state()); + top_state().restore(); _eof = false; _has_next_called = false; @@ -1218,10 +1266,8 @@ private final void pop_container_state() { set_state(new_state); } - private final IonType top_state() { - int top = _container_state_top - 1; - IonType top_container = _container_state_stack[top]; - return top_container; + private final ContainerState top_state() { + return _container_state_stack[_container_state_top - 1]; } public IonType getType() @@ -1260,14 +1306,14 @@ private boolean is_in_struct_internal() public IonType getContainerType() { if (_container_state_top == 0) return IonType.DATAGRAM; - return _container_state_stack[_container_state_top - 1]; + return _container_state_stack[_container_state_top - 1].type; } public int getDepth() { int depth = _container_state_top; if (depth > 0) { int debugging_depth = depth; - IonType top_type = _container_state_stack[0]; + IonType top_type = _container_state_stack[0].type; if (_nesting_parent == null) { if (IonType.DATAGRAM.equals(top_type)) { depth--; @@ -1336,7 +1382,9 @@ public void stepIn() switch (_value_type) { case STRUCT: case LIST: + break; case SEXP: + _is_expression_syntax_allowed = minorVersion > 0; break; default: throw new IllegalStateException("Unexpected value type: " + _value_type); diff --git a/src/main/java/com/amazon/ion/impl/IonReaderTextUserX.java b/src/main/java/com/amazon/ion/impl/IonReaderTextUserX.java index 67d27958aa..fd91c4d7a5 100644 --- a/src/main/java/com/amazon/ion/impl/IonReaderTextUserX.java +++ b/src/main/java/com/amazon/ion/impl/IonReaderTextUserX.java @@ -134,13 +134,13 @@ private final boolean has_next_user_value() String version = symbolValue().getText(); if (isIonVersionMarker(version)) { - // TODO: Determine if Ion 1.0 and 1.1 need separate branches here. if (ION_1_0.equals(version) || "$ion_1_1".equals(version)) { + setMinorVersion(version.charAt(version.length() - 1) - '0'); if (_value_keyword != IonTokenConstsX.KEYWORD_sid) { symbol_table_reset(); - push_symbol_table(_system_symtab); + push_symbol_table(_system_symtab); // TODO install the correct system symbol table for the active Ion version. } _has_next_called = false; } diff --git a/src/test/java/com/amazon/ion/impl/IonRawTextReaderTest_1_1.java b/src/test/java/com/amazon/ion/impl/IonRawTextReaderTest_1_1.java new file mode 100644 index 0000000000..53cea88f40 --- /dev/null +++ b/src/test/java/com/amazon/ion/impl/IonRawTextReaderTest_1_1.java @@ -0,0 +1,142 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +package com.amazon.ion.impl; + +import com.amazon.ion.IonReader; +import com.amazon.ion.IonType; +import com.amazon.ion.system.SimpleCatalog; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class IonRawTextReaderTest_1_1 { + + enum ExpressionType { + E_EXPRESSION, + EXPRESSION_GROUP, + NONE; + + void verifyExpressionType(IonReader reader) { + IonReaderTextRawX rawReader = (IonReaderTextRawX) reader; + switch (this) { + case E_EXPRESSION: + assertTrue(rawReader._container_is_e_expression); + assertFalse(rawReader._container_is_expression_group); + break; + case EXPRESSION_GROUP: + assertFalse(rawReader._container_is_e_expression); + assertTrue(rawReader._container_is_expression_group); + break; + case NONE: + assertFalse(rawReader._container_is_e_expression); + assertFalse(rawReader._container_is_expression_group); + break; + } + } + } + + public static class Parameter { + final String input; + final String firstSymbol; + final String secondSymbol; + final ExpressionType expressionType; + + private Parameter(String input, String firstSymbol, String secondSymbol, ExpressionType expressionType) { + this.input = input; + this.firstSymbol = firstSymbol; + this.secondSymbol = secondSymbol; + this.expressionType = expressionType; + } + + IonReader newTextReader() { + return new IonReaderTextUserX( + new SimpleCatalog(), + LocalSymbolTable.DEFAULT_LST_FACTORY, + UnifiedInputStreamX.makeStream(input) + ); + } + + static Parameter of(String input, String firstSymbol, String secondSymbol, ExpressionType expressionType) { + return new Parameter(input, firstSymbol, secondSymbol, expressionType); + } + + @Override + public String toString() { + return input; + } + } + + static Parameter[] validParameters() { + return new Parameter[] { + Parameter.of("$ion_1_1 (:foo)", "foo", null, ExpressionType.E_EXPRESSION), + Parameter.of("$ion_1_1 (:foo bar)", "foo", "bar", ExpressionType.E_EXPRESSION), + Parameter.of("$ion_1_1 (::foo)", "foo", null, ExpressionType.EXPRESSION_GROUP), // TODO do we want to require whitespace after ::? + Parameter.of("$ion_1_1 (:: foo bar)", "foo", "bar", ExpressionType.EXPRESSION_GROUP), + Parameter.of("$ion_1_1 (:)", null, null, ExpressionType.E_EXPRESSION), + Parameter.of("$ion_1_1 (::)", null, null, ExpressionType.EXPRESSION_GROUP), + Parameter.of("$ion_1_1 (.foo)", ".", "foo", ExpressionType.NONE), + Parameter.of("$ion_1_1 (.. foo)", "..", "foo", ExpressionType.NONE), + Parameter.of("$ion_1_1 (.+ foo)", ".+", "foo", ExpressionType.NONE), + Parameter.of("$ion_1_1 (..+ foo)", "..+", "foo", ExpressionType.NONE), + Parameter.of("$ion_1_1 (.+ foo)", ".+", "foo", ExpressionType.NONE), + Parameter.of("$ion_1_1 (..+ foo)", "..+", "foo", ExpressionType.NONE), + }; + } + + @ParameterizedTest + @MethodSource("validParameters") + public void validExpressionSyntax(Parameter parameter) throws Exception { + try (IonReader reader = parameter.newTextReader()) { + reader.next(); + reader.stepIn(); + if (parameter.firstSymbol == null) { + assertNull(reader.next()); + } else { + assertEquals(IonType.SYMBOL, reader.next()); + assertEquals(parameter.firstSymbol, reader.stringValue()); + } + parameter.expressionType.verifyExpressionType(reader); + if (parameter.secondSymbol == null) { + assertNull(reader.next()); + } else { + assertEquals(IonType.SYMBOL, reader.next()); + assertEquals(parameter.secondSymbol, reader.stringValue()); + assertNull(reader.next()); + } + reader.stepOut(); + assertNull(reader.next()); + } + } + + static Parameter[] invalidParameters() { + return new Parameter[] { + // Colon is not a valid operator in Ion 1.0. + Parameter.of("$ion_1_0 (:foo)", null, null, ExpressionType.NONE), + Parameter.of("$ion_1_0 (::foo)", null, null, ExpressionType.NONE), + // Colon is not a valid operator in Ion 1.1 except at the beginning of an s-expression. + Parameter.of("$ion_1_1 (:foo :)", null, null, ExpressionType.NONE), + Parameter.of("$ion_1_1 (::foo ::)", null, null, ExpressionType.NONE), + Parameter.of("$ion_1_1 (foo :)", null, null, ExpressionType.NONE), + Parameter.of("$ion_1_1 (foo ::)", null, null, ExpressionType.NONE), + }; + } + + @ParameterizedTest + @MethodSource("invalidParameters") + public void invalidExpressionSyntax(Parameter parameter) throws Exception { + try (IonReader reader = parameter.newTextReader()) { + reader.next(); + reader.stepIn(); + assertThrows(IonReaderTextRawX.IonReaderTextParsingException.class, () -> { + reader.next(); + // Some of the invalid s-expressions begin with a valid symbol and fail on the second element. + reader.next(); + }); + } + } +}