From e5025ea696eb19e19dba31fe8b87af20a3ec8cd9 Mon Sep 17 00:00:00 2001 From: Tyler Gregg Date: Thu, 30 May 2024 17:20:54 -0700 Subject: [PATCH] Ensures annotations are not overwritten when filling a delimited container. --- .../com/amazon/ion/impl/IonCursorBinary.java | 94 +++++++++++++++++-- ...IonReaderContinuableApplicationBinary.java | 25 ++--- .../java/com/amazon/ion/impl/MarkerList.java | 2 +- .../com/amazon/ion/Ion_1_1_RoundTripTest.kt | 33 +++---- ...onReaderContinuableTopLevelBinaryTest.java | 50 +++++++++- 5 files changed, 164 insertions(+), 40 deletions(-) diff --git a/src/main/java/com/amazon/ion/impl/IonCursorBinary.java b/src/main/java/com/amazon/ion/impl/IonCursorBinary.java index 520b7e36b..be11ccee1 100644 --- a/src/main/java/com/amazon/ion/impl/IonCursorBinary.java +++ b/src/main/java/com/amazon/ion/impl/IonCursorBinary.java @@ -20,6 +20,8 @@ import java.nio.ByteOrder; import static com.amazon.ion.impl.IonTypeID.DELIMITED_END_ID; +import static com.amazon.ion.impl.IonTypeID.ONE_ANNOTATION_FLEX_SYM_LOWER_NIBBLE_1_1; +import static com.amazon.ion.impl.IonTypeID.ONE_ANNOTATION_SID_LOWER_NIBBLE_1_1; import static com.amazon.ion.impl.IonTypeID.TWO_ANNOTATION_FLEX_SYMS_LOWER_NIBBLE_1_1; import static com.amazon.ion.impl.IonTypeID.TWO_ANNOTATION_SIDS_LOWER_NIBBLE_1_1; import static com.amazon.ion.util.IonStreamUtils.throwAsIonException; @@ -155,6 +157,14 @@ private static class RefillableState { */ int lastUnbufferedByte = -1; + /** + * Whether to skip over annotation sequences rather than recording them for consumption by the user. This is + * used when probing forward in the stream for the end of a delimited container while remaining logically + * positioned on the current value. This is only needed in 'slow' mode because in quick mode the entire + * container is assumed to be buffered in its entirety and no probing occurs. + */ + private boolean skipAnnotations = false; + RefillableState(InputStream inputStream, int capacity, int maximumBufferSize, State initialState) { this.inputStream = inputStream; this.capacity = capacity; @@ -1195,6 +1205,38 @@ private boolean uncheckedReadAnnotationWrapperHeader_1_1(IonTypeID valueTid) { return false; } + /** + * Skips a non-length-prefixed annotation sequence (opcodes E4, E5, E7, or E8), ensuring enough space is available + * in the buffer. After this method returns, `peekIndex` points to the first byte after the end of the annotation + * sequence. + * @param valueTid the type ID of the annotation sequence to skip. + * @return true if there are not enough bytes in the stream to complete the annotation sequence; otherwise, false. + */ + private boolean slowSkipNonPrefixedAnnotations_1_1(IonTypeID valueTid) { + if (valueTid.isInlineable) { + // Opcodes 0xE7 (one annotation FlexSym) and 0xE8 (two annotation FlexSyms) + if (slowSkipFlexSym_1_1()) { + return true; + } + if (valueTid.lowerNibble == TWO_ANNOTATION_FLEX_SYMS_LOWER_NIBBLE_1_1) { + // Opcode 0xE8 (two annotation FlexSyms) + return slowSkipFlexSym_1_1(); + } + } else { + // Opcodes 0xE4 (one annotation SID) and 0xE5 (two annotation SIDs) + int annotationSid = (int) slowReadFlexUInt_1_1(); + if (annotationSid < 0) { + return true; + } + if (valueTid.lowerNibble == TWO_ANNOTATION_SIDS_LOWER_NIBBLE_1_1) { + // Opcode 0xE5 (two annotation SIDs) + annotationSid = (int) slowReadFlexUInt_1_1(); + return annotationSid < 0; + } + } + return false; + } + /** * Reads the header of an Ion 1.1 annotation wrapper, ensuring enough data is available in the buffer. Sets * `valueMarker` with the start and end indices of the wrapped value. Sets `annotationSequenceMarker` with the start @@ -1205,7 +1247,9 @@ private boolean uncheckedReadAnnotationWrapperHeader_1_1(IonTypeID valueTid) { * @return true if there are not enough bytes in the stream to complete the value; otherwise, false. */ private boolean slowReadAnnotationWrapperHeader_1_1(IonTypeID valueTid) { - annotationTokenMarkers.clear(); + if (!refillableState.skipAnnotations) { + annotationTokenMarkers.clear(); + } if (valueTid.variableLength) { // Opcodes 0xE6 (variable-length annotation SIDs) and 0xE9 (variable-length annotation FlexSyms) // At this point the value must be at least 3 more bytes: one for the smallest-possible annotations @@ -1221,13 +1265,22 @@ private boolean slowReadAnnotationWrapperHeader_1_1(IonTypeID valueTid) { if (!fillAt(peekIndex, annotationsLength)) { return true; } - annotationSequenceMarker.typeId = valueTid; - annotationSequenceMarker.startIndex = peekIndex; - annotationSequenceMarker.endIndex = annotationSequenceMarker.startIndex + annotationsLength; - peekIndex = annotationSequenceMarker.endIndex; + long annotationsEnd = peekIndex + annotationsLength; + if (!refillableState.skipAnnotations) { + annotationSequenceMarker.typeId = valueTid; + annotationSequenceMarker.startIndex = peekIndex; + annotationSequenceMarker.endIndex = annotationsEnd; + } + peekIndex = annotationsEnd; } else { // At this point the value must have at least one more byte for each annotation FlexSym (one for lower // nibble 7, two for lower nibble 8), plus one for the smallest-possible value representation. + if (!fillAt(peekIndex, (valueTid.lowerNibble == ONE_ANNOTATION_FLEX_SYM_LOWER_NIBBLE_1_1 || valueTid.lowerNibble == ONE_ANNOTATION_SID_LOWER_NIBBLE_1_1) ? 2 : 3)) { + return true; + } + if (refillableState.skipAnnotations) { + return slowSkipNonPrefixedAnnotations_1_1(valueTid); + } if (valueTid.isInlineable) { // Opcodes 0xE7 (one annotation FlexSym) and 0xE8 (two annotation FlexSyms) Marker provisionalMarker = annotationTokenMarkers.provisionalElement(); @@ -1473,6 +1526,29 @@ private long slowReadFlexSym_1_1(Marker markerToSet) { return result; } + /** + * Skips a FlexSym, ensuring enough space is available in the buffer. After this method returns, `peekIndex` points + * to the first byte after the end of the FlexSym. + * @return true if there are not enough bytes in the stream to complete the FlexSym; otherwise, false. + */ + private boolean slowSkipFlexSym_1_1() { + long result = slowReadFlexInt_1_1(); + if (result == 0) { + int nextByte = slowReadByte(); + if (nextByte < 0) { + return true; + } + if ((byte) nextByte != OpCodes.INLINE_SYMBOL_ZERO_LENGTH && (byte) nextByte != OpCodes.STRING_ZERO_LENGTH && (byte) nextByte != OpCodes.DELIMITED_END_MARKER) { + throw new IonException("FlexSyms may only wrap symbol zero, empty string, or delimited end."); + } + return false; + } else if (result < 0) { + peekIndex -= result; + return false; + } + return false; + } + /** * Reads the field name at `peekIndex`, ensuring enough bytes are available in the buffer. After this method returns * `peekIndex` points to the first byte of the value that follows the field name. If the field name contained a @@ -1661,7 +1737,11 @@ private boolean slowFindDelimitedEnd_1_1() { long savedCheckpoint = checkpoint; int savedContainerIndex = containerIndex; Marker savedParent = parent; - boolean savedHasAnnotations = hasAnnotations; // TODO have to set aside all annotation markers... or go into a mode where the markers are not recorded + boolean savedHasAnnotations = hasAnnotations; + // The cursor remains logically positioned at the current value despite probing forward for the end of the + // delimited value. Accordingly, do not overwrite the existing annotations with any annotations found during + // the probe. + refillableState.skipAnnotations = true; // ------------ // TODO performance: the following line causes the end indexes of any child delimited containers that are not @@ -1670,6 +1750,7 @@ private boolean slowFindDelimitedEnd_1_1() { // additional complexity. seekPastDelimitedContainer_1_1(); + refillableState.skipAnnotations = false; boolean isReady = event != Event.NEEDS_DATA; if (refillableState.isSkippingCurrentValue) { // This delimited container is oversized. The cursor must seek past it. @@ -1850,7 +1931,6 @@ private void reset() { annotationSequenceMarker.typeId = null; annotationSequenceMarker.startIndex = -1; annotationSequenceMarker.endIndex = -1; - // TOOD annotationTokenMarkers? } /** diff --git a/src/main/java/com/amazon/ion/impl/IonReaderContinuableApplicationBinary.java b/src/main/java/com/amazon/ion/impl/IonReaderContinuableApplicationBinary.java index 73b3a4b32..0b7d25bf5 100644 --- a/src/main/java/com/amazon/ion/impl/IonReaderContinuableApplicationBinary.java +++ b/src/main/java/com/amazon/ion/impl/IonReaderContinuableApplicationBinary.java @@ -44,21 +44,12 @@ class IonReaderContinuableApplicationBinary extends IonReaderContinuableCoreBinary implements IonReaderContinuableApplication { // The UTF-8 encoded bytes representing the text `$ion_symbol_table`. - private static final byte[] ION_SYMBOL_TABLE_UTF8; - private static final byte[] IMPORTS_UTF8; - private static final byte[] SYMBOLS_UTF8; - private static final byte[] NAME_UTF8; - private static final byte[] VERSION_UTF8; - private static final byte[] MAX_ID_UTF8; - - static { - ION_SYMBOL_TABLE_UTF8 = SystemSymbols.ION_SYMBOL_TABLE.getBytes(StandardCharsets.UTF_8); - IMPORTS_UTF8 = SystemSymbols.IMPORTS.getBytes(StandardCharsets.UTF_8); - SYMBOLS_UTF8 = SystemSymbols.SYMBOLS.getBytes(StandardCharsets.UTF_8); - NAME_UTF8 = SystemSymbols.NAME.getBytes(StandardCharsets.UTF_8); - VERSION_UTF8 = SystemSymbols.VERSION.getBytes(StandardCharsets.UTF_8); - MAX_ID_UTF8 = SystemSymbols.MAX_ID.getBytes(StandardCharsets.UTF_8); - } + private static final byte[] ION_SYMBOL_TABLE_UTF8 = SystemSymbols.ION_SYMBOL_TABLE.getBytes(StandardCharsets.UTF_8); + private static final byte[] IMPORTS_UTF8 = SystemSymbols.IMPORTS.getBytes(StandardCharsets.UTF_8); + private static final byte[] SYMBOLS_UTF8 = SystemSymbols.SYMBOLS.getBytes(StandardCharsets.UTF_8); + private static final byte[] NAME_UTF8 = SystemSymbols.NAME.getBytes(StandardCharsets.UTF_8); + private static final byte[] VERSION_UTF8 = SystemSymbols.VERSION.getBytes(StandardCharsets.UTF_8); + private static final byte[] MAX_ID_UTF8 = SystemSymbols.MAX_ID.getBytes(StandardCharsets.UTF_8); // An IonCatalog containing zero shared symbol tables. private static final IonCatalog EMPTY_CATALOG = new SimpleCatalog(); @@ -974,6 +965,10 @@ private enum State { * @return true if the bytes match; otherwise, false. */ private static boolean bytesMatch(byte[] target, byte[] buffer, int start, int end) { + // TODO if this ends up on a critical performance path, see if it's faster to copy the bytes into a + // pre-allocated buffer and then perform a comparison. It's possible that a combination of System.arraycopy() + // and Arrays.equals(byte[], byte[]) is faster because it can be more easily optimized with native code by the + // JVM—both are annotated with @HotSpotIntrinsicCandidate. int length = end - start; if (length != target.length) { return false; diff --git a/src/main/java/com/amazon/ion/impl/MarkerList.java b/src/main/java/com/amazon/ion/impl/MarkerList.java index 9567d51b2..276f2745d 100644 --- a/src/main/java/com/amazon/ion/impl/MarkerList.java +++ b/src/main/java/com/amazon/ion/impl/MarkerList.java @@ -37,7 +37,7 @@ public int size() { * @return The number of markers, including provisional markers, in the list. */ public int provisionalSize() { - return provisionalIndex; // TODO Math.max(provisionalIndex, numberOfValues)? provisionalIndex + numberOfValues? + return provisionalIndex; } /** diff --git a/src/test/java/com/amazon/ion/Ion_1_1_RoundTripTest.kt b/src/test/java/com/amazon/ion/Ion_1_1_RoundTripTest.kt index bfa76e73b..ffada400f 100644 --- a/src/test/java/com/amazon/ion/Ion_1_1_RoundTripTest.kt +++ b/src/test/java/com/amazon/ion/Ion_1_1_RoundTripTest.kt @@ -441,29 +441,29 @@ abstract class Ion_1_1_RoundTripBase { @JvmStatic protected val WRITER_INTERNED_PREFIXED: (OutputStream) -> IonWriter = ION_1_1.binaryWriterBuilder() - .withSymbolInliningStrategy(SymbolInliningStrategy.NEVER_INLINE) - .withDelimitedContainerStrategy(DelimitedContainerStrategy.ALWAYS_PREFIXED)::build + .withSymbolInliningStrategy(SymbolInliningStrategy.NEVER_INLINE) + .withDelimitedContainerStrategy(DelimitedContainerStrategy.ALWAYS_PREFIXED)::build @JvmStatic protected val WRITER_INLINE_PREFIXED: (OutputStream) -> IonWriter = ION_1_1.binaryWriterBuilder() - .withSymbolInliningStrategy(SymbolInliningStrategy.ALWAYS_INLINE) - .withDelimitedContainerStrategy(DelimitedContainerStrategy.ALWAYS_PREFIXED)::build + .withSymbolInliningStrategy(SymbolInliningStrategy.ALWAYS_INLINE) + .withDelimitedContainerStrategy(DelimitedContainerStrategy.ALWAYS_PREFIXED)::build @JvmStatic protected val WRITER_INTERNED_DELIMITED: (OutputStream) -> IonWriter = ION_1_1.binaryWriterBuilder() - .withSymbolInliningStrategy(SymbolInliningStrategy.NEVER_INLINE) - .withDelimitedContainerStrategy(DelimitedContainerStrategy.ALWAYS_DELIMITED)::build + .withSymbolInliningStrategy(SymbolInliningStrategy.NEVER_INLINE) + .withDelimitedContainerStrategy(DelimitedContainerStrategy.ALWAYS_DELIMITED)::build @JvmStatic protected val WRITER_INLINE_DELIMITED: (OutputStream) -> IonWriter = ION_1_1.binaryWriterBuilder() - .withSymbolInliningStrategy(SymbolInliningStrategy.ALWAYS_INLINE) - .withDelimitedContainerStrategy(DelimitedContainerStrategy.ALWAYS_DELIMITED)::build + .withSymbolInliningStrategy(SymbolInliningStrategy.ALWAYS_INLINE) + .withDelimitedContainerStrategy(DelimitedContainerStrategy.ALWAYS_DELIMITED)::build /** * Checks if this ByteArray contains Ion Binary. */ private fun ByteArray.isIonBinary(): Boolean { return get(0) == 0xE0.toByte() && - get(1) == 0x01.toByte() && - get(2) in setOf(0, 1) && - get(3) == 0xEA.toByte() + get(1) == 0x01.toByte() && + get(2) in setOf(0, 1) && + get(3) == 0xEA.toByte() } /** @@ -473,11 +473,11 @@ abstract class Ion_1_1_RoundTripBase { protected fun ByteArray.printDisplayString() { if (isIonBinary()) { map { it.toHexString(HexFormat.UpperCase) } - .windowed(4, 4, partialWindows = true) - .windowed(8, 8, partialWindows = true) - .forEach { - println(it.joinToString(" ") { it.joinToString(" ") }) - } + .windowed(4, 4, partialWindows = true) + .windowed(8, 8, partialWindows = true) + .forEach { + println(it.joinToString(" ") { it.joinToString(" ") }) + } } else { println(toString(Charsets.UTF_8)) } @@ -525,6 +525,7 @@ abstract class Ion_1_1_RoundTripBase { $10 $11 $12 $13 $14 """.trimIndent() ), + ionText("foo::(bar::baz::{abc: zar::qux::xyz::123, def: 456})") ) + files().flatMap { f -> val ion = ION.loader.load(f) // If there are embedded documents, flatten them into separate test cases. diff --git a/src/test/java/com/amazon/ion/impl/IonReaderContinuableTopLevelBinaryTest.java b/src/test/java/com/amazon/ion/impl/IonReaderContinuableTopLevelBinaryTest.java index be5b43ab7..94aa57e93 100644 --- a/src/test/java/com/amazon/ion/impl/IonReaderContinuableTopLevelBinaryTest.java +++ b/src/test/java/com/amazon/ion/impl/IonReaderContinuableTopLevelBinaryTest.java @@ -5018,7 +5018,6 @@ private void readAnnotationsThatForceBufferShiftInDelimitedStruct_1_1( byte[] inputBytes, int initialBufferSize ) throws Exception { - //readerBuilder = readerBuilder.withIncrementalReadingEnabled(false); reader = boundedReaderFor(constructFromBytes, inputBytes, initialBufferSize, Integer.MAX_VALUE, byteCountingHandler); assertSequence( container(IonType.STRUCT, @@ -5920,5 +5919,54 @@ public void readDataThatSwitchesVersionsMultipleTimes(boolean constructFromBytes closeAndCount(); } + @ParameterizedTest(name = "constructFromBytes={0}") + @ValueSource(booleans = {true, false}) + public void prefixedAnnotatedContainerInsideDelimitedAnnotatedContainerPreservesSidAnnotations(boolean constructFromBytes) throws Exception { + byte[] input = withIvm(1, hexStringToByteArray(cleanCommentedHexBytes( + "E4 09 | Annotation symbol 4 (name) \n" + + "F1 | Delimited list start \n" + + "E5 0B 0D | Annotation symbol 5 (version), annotation symbol 6 (imports) \n" + + "C7 | Prefixed s-exp length 7 \n" + + "E4 0F | Annotation symbol 7 (symbols) \n" + + "60 | Int 0 \n" + + "E5 11 13 | Annotation symbol 8 (max_id), annotation symbol 9 ($ion_shared_symbol_table) \n" + + "60 | Int 0 \n" + + "F0 | End of delimited list" + ))); + reader = readerFor(readerBuilder, constructFromBytes, input); + assertSequence( + next(IonType.LIST), annotations("name"), STEP_IN, + next(IonType.SEXP), annotations("version", "imports"), + STEP_OUT, + next(null) + ); + closeAndCount(); + } + + @ParameterizedTest(name = "constructFromBytes={0}") + @ValueSource(booleans = {true, false}) + public void prefixedAnnotatedContainerInsideDelimitedAnnotatedContainerPreservesFlexSymAnnotations(boolean constructFromBytes) throws Exception { + byte[] input = withIvm(1, hexStringToByteArray(cleanCommentedHexBytes( + "E7 FF 61 | Annotation FlexSym 'a' \n" + + "F1 | Delimited list start \n" + + "E8 FF 62 09 | Annotation FlexSym 'b', annotation FlexSym SID 4 (name) \n" + + "C8 | Prefixed s-exp length 8 \n" + + "E4 0F | Annotation symbol 7 (symbols) \n" + + "60 | Int 0 \n" + + "E8 0B FF 63 | Annotation FlexSym SID 5 (version), annotation FlexSym 'c' \n" + + "60 | Int 0 \n" + + "F0 | End of delimited list" + ))); + readerBuilder = readerBuilder.withIncrementalReadingEnabled(false); + reader = readerFor(readerBuilder, constructFromBytes, input); + assertSequence( + next(IonType.LIST), annotations("a"), STEP_IN, + next(IonType.SEXP), annotations("b", "name"), + STEP_OUT, + next(null) + ); + closeAndCount(); + } + // TODO Ion 1.1 symbol tables with all kinds of annotation encodings (opcodes E4 - E9, inline and SID) }