From b08cbc415fb3f0f57d2ecf6e193b655a0937eb68 Mon Sep 17 00:00:00 2001 From: Matthew Pope <81593196+popematt@users.noreply.github.com> Date: Tue, 14 Nov 2023 13:45:44 -0800 Subject: [PATCH] Adds support for text and lob types and some annotations (#637) --- .../amazon/ion/impl/bin/IonEncoder_1_1.java | 128 ++++++++++++ .../ion/impl/bin/Ion_1_1_Constants.java | 3 + src/com/amazon/ion/impl/bin/OpCodes.java | 20 ++ .../ion/impl/bin/IonEncoder_1_1Test.java | 196 +++++++++++++++++- 4 files changed, 345 insertions(+), 2 deletions(-) diff --git a/src/com/amazon/ion/impl/bin/IonEncoder_1_1.java b/src/com/amazon/ion/impl/bin/IonEncoder_1_1.java index 11218dc8cd..7b1ca9d4b2 100644 --- a/src/com/amazon/ion/impl/bin/IonEncoder_1_1.java +++ b/src/com/amazon/ion/impl/bin/IonEncoder_1_1.java @@ -1,11 +1,15 @@ package com.amazon.ion.impl.bin; import com.amazon.ion.Decimal; +import com.amazon.ion.IonText; import com.amazon.ion.IonType; import com.amazon.ion.Timestamp; +import com.amazon.ion.impl.bin.utf8.Utf8StringEncoder; +import com.amazon.ion.impl.bin.utf8.Utf8StringEncoderPool; import java.math.BigDecimal; import java.math.BigInteger; +import java.nio.charset.StandardCharsets; import static com.amazon.ion.impl.bin.Ion_1_1_Constants.*; import static java.lang.Double.doubleToRawLongBits; @@ -428,4 +432,128 @@ static int writeLongFormTimestampValue(WriteBuffer buffer, Timestamp value) { return 1 + WriteBuffer.flexUIntLength(dataLength) + dataLength; } + /** + * Writes a String to the given WriteBuffer using the Ion 1.1 encoding for Ion Strings. + * @return the number of bytes written + */ + public static int writeStringValue(WriteBuffer buffer, String value) { + return writeInlineText(buffer, value, IonType.STRING, OpCodes.STRING_ZERO_LENGTH, OpCodes.VARIABLE_LENGTH_STRING); + } + + /** + * Writes an inline Symbol to the given WriteBuffer using the Ion 1.1 encoding for Ion Symbols. 
+ * @return the number of bytes written + */ + public static int writeSymbolValue(WriteBuffer buffer, String value) { + return writeInlineText(buffer, value, IonType.SYMBOL, OpCodes.INLINE_SYMBOL_ZERO_LENGTH, OpCodes.VARIABLE_LENGTH_INLINE_SYMBOL); + } + + private static int writeInlineText(WriteBuffer buffer, String value, IonType type, byte zeroLengthOpCode, byte variableLengthOpCode) { + if (value == null) { + return writeNullValue(buffer, type); + } + + // TODO: When merging into the Ion 1.1 raw writer, keep a single instance of the Utf8StringEncoder + // instead of fetching one on every call. + Utf8StringEncoder.Result encoderResult = Utf8StringEncoderPool.getInstance().getOrCreate().encode(value); + + byte[] utf8Buffer = encoderResult.getBuffer(); + int numValueBytes = encoderResult.getEncodedLength(); + int numLengthBytes = 0; + + if (numValueBytes <= 0xF) { + buffer.writeByte((byte)(zeroLengthOpCode | numValueBytes)); + } else { + buffer.writeByte(variableLengthOpCode); + numLengthBytes = buffer.writeFlexUInt(numValueBytes); + } + buffer.writeBytes(utf8Buffer, 0, numValueBytes); + return 1 + numLengthBytes + numValueBytes; + } + + /** + * Writes an interned Symbol's address to the given WriteBuffer using the Ion 1.1 encoding for Ion Symbols. + * @return the number of bytes written + * + * TODO: Do we need to support Symbol Addresses greater than Long.MAX_VALUE? 
+ */ + public static int writeSymbolValue(WriteBuffer buffer, long value) { + if (value < 0) { + throw new IllegalArgumentException("Symbol Address cannot be negative; was: " + value); + } else if (value < FIRST_2_BYTE_SYMBOL_ADDRESS) { + buffer.writeByte(OpCodes.SYMBOL_ADDRESS_1_BYTE); + buffer.writeFixedUInt(value); + return 2; + } else if (value < FIRST_MANY_BYTE_SYMBOL_ADDRESS) { + buffer.writeByte(OpCodes.SYMBOL_ADDRESS_2_BYTES); + buffer.writeFixedIntOrUInt(value - FIRST_2_BYTE_SYMBOL_ADDRESS, 2); + return 3; + } else { + buffer.writeByte(OpCodes.SYMBOL_ADDRESS_MANY_BYTES); + int addressBytes = buffer.writeFlexUInt(value - FIRST_MANY_BYTE_SYMBOL_ADDRESS); + return 1 + addressBytes; + } + } + + /** + * Writes a Blob to the given WriteBuffer using the Ion 1.1 encoding for Ion Blobs. + * @return the number of bytes written + */ + public static int writeBlobValue(WriteBuffer buffer, byte[] value) { + if (value == null) { + return writeNullValue(buffer, IonType.BLOB); + } + + buffer.writeByte(OpCodes.VARIABLE_LENGTH_BLOB); + int numLengthBytes = buffer.writeFlexUInt(value.length); + buffer.writeBytes(value); + return 1 + numLengthBytes + value.length; + } + + /** + * Writes a Clob to the given WriteBuffer using the Ion 1.1 encoding for Ion Clobs. + * @return the number of bytes written + */ + public static int writeClobValue(WriteBuffer buffer, byte[] value) { + if (value == null) { + return writeNullValue(buffer, IonType.CLOB); + } + + buffer.writeByte(OpCodes.VARIABLE_LENGTH_CLOB); + int numLengthBytes = buffer.writeFlexUInt(value.length); + buffer.writeBytes(value); + return 1 + numLengthBytes + value.length; + } + + // TODO: Implement FlexSym Annotations + + /** + * Writes annotations using the given symbol addresses. 
+ */ + public static int writeAnnotations(WriteBuffer buffer, long[] annotations) { + if (annotations == null || annotations.length == 0) { + return 0; + } + if (annotations.length == 1) { + buffer.writeByte(OpCodes.ANNOTATIONS_1_SYMBOL_ADDRESS); + int numAddressBytes = buffer.writeFlexUInt(annotations[0]); + return 1 + numAddressBytes; + } else if (annotations.length == 2) { + buffer.writeByte(OpCodes.ANNOTATIONS_2_SYMBOL_ADDRESS); + int numAddressBytes = buffer.writeFlexUInt(annotations[0]); + numAddressBytes += buffer.writeFlexUInt(annotations[1]); + return 1 + numAddressBytes; + } else { + int numAddressBytes = 0; + for (long ann : annotations) { + numAddressBytes += WriteBuffer.flexUIntLength(ann); + } + buffer.writeByte(OpCodes.ANNOTATIONS_MANY_SYMBOL_ADDRESS); + int numLengthBytes = buffer.writeFlexUInt(numAddressBytes); + for (long ann : annotations) { + buffer.writeFlexUInt(ann); + } + return 1 + numLengthBytes + numAddressBytes; + } + } } diff --git a/src/com/amazon/ion/impl/bin/Ion_1_1_Constants.java b/src/com/amazon/ion/impl/bin/Ion_1_1_Constants.java index 91a7647042..8df501b349 100644 --- a/src/com/amazon/ion/impl/bin/Ion_1_1_Constants.java +++ b/src/com/amazon/ion/impl/bin/Ion_1_1_Constants.java @@ -6,6 +6,9 @@ public class Ion_1_1_Constants { private Ion_1_1_Constants() {} + static final int FIRST_2_BYTE_SYMBOL_ADDRESS = 256; + static final int FIRST_MANY_BYTE_SYMBOL_ADDRESS = 65792; + //////// Timestamp Field Constants //////// // S_TIMESTAMP_* is applicable to all short-form timestamps diff --git a/src/com/amazon/ion/impl/bin/OpCodes.java b/src/com/amazon/ion/impl/bin/OpCodes.java index ecfc14e8b0..a84c4bc853 100644 --- a/src/com/amazon/ion/impl/bin/OpCodes.java +++ b/src/com/amazon/ion/impl/bin/OpCodes.java @@ -35,10 +35,30 @@ private OpCodes() {} public static final byte TIMESTAMP_NANOS_PRECISION_WITH_OFFSET = 0x7C; // 0x7D-0x7F Reserved + public static final byte STRING_ZERO_LENGTH = (byte) 0x80; + + public static final byte 
INLINE_SYMBOL_ZERO_LENGTH = (byte) 0x90; + + public static final byte SYMBOL_ADDRESS_1_BYTE = (byte) 0xE1; + public static final byte SYMBOL_ADDRESS_2_BYTES = (byte) 0xE2; + public static final byte SYMBOL_ADDRESS_MANY_BYTES = (byte) 0xE3; + public static final byte ANNOTATIONS_1_SYMBOL_ADDRESS = (byte) 0xE4; + public static final byte ANNOTATIONS_2_SYMBOL_ADDRESS = (byte) 0xE5; + public static final byte ANNOTATIONS_MANY_SYMBOL_ADDRESS = (byte) 0xE6; + public static final byte ANNOTATIONS_1_FLEX_SYM = (byte) 0xE7; + public static final byte ANNOTATIONS_2_FLEX_SYM = (byte) 0xE8; + public static final byte ANNOTATIONS_MANY_FLEX_SYM = (byte) 0xE9; public static final byte NULL_UNTYPED = (byte) 0xEA; public static final byte NULL_TYPED = (byte) 0xEB; + // 0xEC, 0xED NOP + // 0xEE Reserved + // 0xEF System Macro Invocation public static final byte VARIABLE_LENGTH_INTEGER = (byte) 0xF5; public static final byte VARIABLE_LENGTH_DECIMAL = (byte) 0xF6; public static final byte VARIABLE_LENGTH_TIMESTAMP = (byte) 0xF7; + public static final byte VARIABLE_LENGTH_STRING = (byte) 0xF8; + public static final byte VARIABLE_LENGTH_INLINE_SYMBOL = (byte) 0xF9; + public static final byte VARIABLE_LENGTH_BLOB = (byte) 0xFE; + public static final byte VARIABLE_LENGTH_CLOB = (byte) 0xFF; } diff --git a/test/com/amazon/ion/impl/bin/IonEncoder_1_1Test.java b/test/com/amazon/ion/impl/bin/IonEncoder_1_1Test.java index 2f25adb22b..478433660c 100644 --- a/test/com/amazon/ion/impl/bin/IonEncoder_1_1Test.java +++ b/test/com/amazon/ion/impl/bin/IonEncoder_1_1Test.java @@ -1,5 +1,6 @@ package com.amazon.ion.impl.bin; +import com.amazon.ion.BitUtils; import com.amazon.ion.Decimal; import com.amazon.ion.IonType; import com.amazon.ion.Timestamp; @@ -15,6 +16,10 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.math.BigInteger; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; import 
java.util.function.BiFunction; public class IonEncoder_1_1Test { @@ -47,6 +52,16 @@ private void assertWritingValue(String expectedBytes, T value, BiFunction void assertWritingValue(byte[] expectedBytes, T value, BiFunction writeOperation) { + int numBytes = writeOperation.apply(buf, value); + Assertions.assertArrayEquals(expectedBytes, bytes()); + Assertions.assertEquals(expectedBytes.length, numBytes); + } + /** * Checks that the function writes the expected bytes and returns the expected count of written bytes for the * given input value. The expectedBytes should be a string of space-separated binary octets. @@ -455,6 +470,127 @@ public void testWriteTimestampValueForNullTimestamp() { Assertions.assertEquals(2, numBytes); } + @ParameterizedTest + @CsvSource({ + "'', 80", + "'a', 81 61", + "'ab', 82 61 62", + "'abc', 83 61 62 63", + "'fourteen bytes', 8E 66 6F 75 72 74 65 65 6E 20 62 79 74 65 73", + "'this has sixteen', F8 21 74 68 69 73 20 68 61 73 20 73 69 78 74 65 65 6E", + "'variable length encoding', F8 31 76 61 72 69 61 62 6C 65 20 6C 65 6E 67 74 68 20 65 6E 63 6F 64 69 6E 67", + }) + public void testWriteStringValue(String value, String expectedBytes) { + assertWritingValue(expectedBytes, value, IonEncoder_1_1::writeStringValue); + } + + @Test + public void testWriteStringValueForNull() { + int numBytes = IonEncoder_1_1.writeStringValue(buf, null); + Assertions.assertEquals("EB 05", byteArrayToHex(bytes())); + Assertions.assertEquals(2, numBytes); + } + + @ParameterizedTest + @CsvSource({ + "'', 90", + "'a', 91 61", + "'ab', 92 61 62", + "'abc', 93 61 62 63", + "'fourteen bytes', 9E 66 6F 75 72 74 65 65 6E 20 62 79 74 65 73", + "'this has sixteen', F9 21 74 68 69 73 20 68 61 73 20 73 69 78 74 65 65 6E", + "'variable length encoding', F9 31 76 61 72 69 61 62 6C 65 20 6C 65 6E 67 74 68 20 65 6E 63 6F 64 69 6E 67", + }) + public void testWriteSymbolValue(String value, String expectedBytes) { + assertWritingValue(expectedBytes, value, 
IonEncoder_1_1::writeSymbolValue); + } + + @ParameterizedTest + @CsvSource({ + "0, E1 00", + "1, E1 01", + "255, E1 FF", + "256, E2 00 00", + "257, E2 01 00", + "512, E2 00 01", + "513, E2 01 01", + "65535, E2 FF FE", + "65791, E2 FF FF", + "65792, E3 01", + "65793, E3 03", + "65919, E3 FF", + "65920, E3 02 02", + "9223372036854775807, E3 00 FF FD FD FF FF FF FF FF" + }) + public void testWriteSymbolValue(long value, String expectedBytes) { + assertWritingValue(expectedBytes, value, IonEncoder_1_1::writeSymbolValue); + } + + @Test + public void testWriteSymbolValueForNull() { + int numBytes = IonEncoder_1_1.writeSymbolValue(buf, null); + Assertions.assertEquals("EB 06", byteArrayToHex(bytes())); + Assertions.assertEquals(2, numBytes); + } + + @ParameterizedTest + @CsvSource({ + "'', FE 01", // + "20, FE 03 20", + "49 20 61 70 70 6C 61 75 64 20 79 6F 75 72 20 63 75 72 69 6F 73 69 74 79, " + + "FE 31 49 20 61 70 70 6C 61 75 64 20 79 6F 75 72 20 63 75 72 69 6F 73 69 74 79" + }) + public void testWriteBlobValue(@ConvertWith(HexStringToByteArray.class) byte[] value, String expectedBytes) { + assertWritingValue(expectedBytes, value, IonEncoder_1_1::writeBlobValue); + } + + @Test + public void testWriteBlobValueForNull() { + int numBytes = IonEncoder_1_1.writeBlobValue(buf, null); + Assertions.assertEquals("EB 07", byteArrayToHex(bytes())); + Assertions.assertEquals(2, numBytes); + } + + @ParameterizedTest + @CsvSource({ + "'', FF 01", + "20, FF 03 20", + "49 20 61 70 70 6C 61 75 64 20 79 6F 75 72 20 63 75 72 69 6F 73 69 74 79, " + + "FF 31 49 20 61 70 70 6C 61 75 64 20 79 6F 75 72 20 63 75 72 69 6F 73 69 74 79" + }) + public void testWriteClobValue(@ConvertWith(HexStringToByteArray.class) byte[] value, String expectedBytes) { + assertWritingValue(expectedBytes, value, IonEncoder_1_1::writeClobValue); + } + + @Test + public void testWriteClobValueForNull() { + int numBytes = IonEncoder_1_1.writeClobValue(buf, null); + Assertions.assertEquals("EB 08", 
byteArrayToHex(bytes())); + Assertions.assertEquals(2, numBytes); + } + + @ParameterizedTest + @CsvSource({ + " '', ''", // Empty array of annotations + " $0, E4 01", + " $10, E4 15", + " $256, E4 02 04", + " $10 $11, E5 15 17", + " $256 $257, E5 02 04 06 04", + " $10 $11 $12, E6 07 15 17 19", + "$256 $257 $258, E6 0D 02 04 06 04 0A 04", + }) + public void testWriteAnnotations(@ConvertWith(SymbolIdsToLongArray.class) long[] value, String expectedBytes) { + assertWritingValue(expectedBytes, value, IonEncoder_1_1::writeAnnotations); + } + + @Test + public void testWriteAnnotationsForNull() { + int numBytes = IonEncoder_1_1.writeAnnotations(buf, null); + Assertions.assertEquals("", byteArrayToHex(bytes())); + Assertions.assertEquals(0, numBytes); + } + /** * Utility method to make it easier to write test cases that assert specific sequences of bytes. */ @@ -470,7 +606,7 @@ private static String byteArrayToHex(byte[] bytes) { * Determines the number of bytes needed to represent a series of hexadecimal digits. */ private static int byteLengthFromHexString(String hexString) { - return (hexString.replaceAll("[^\\dA-F]", "").length() - 1) / 2 + 1; + return (hexString.replaceAll("[^\\dA-F]", "").length()) / 2; } /** @@ -496,7 +632,7 @@ private static String byteArrayToBitString(byte[] bytes) { * Determines the number of bytes needed to represent a series of hexadecimal digits. 
*/ private static int byteLengthFromBitString(String bitString) { - return (bitString.replaceAll("[^01]", "").length() - 1) / 8 + 1; + return (bitString.replaceAll("[^01]", "").length()) / 8; } /** @@ -528,4 +664,60 @@ protected Decimal convert(String source) throws ArgumentConversionException { return Decimal.valueOf(source); } } + + /** + * Converts space-separated hex octets (a one-character token is taken as a literal ASCII character) to a byte[] for a {@code @ParameterizedTest} + */ + static class HexStringToByteArray extends TypedArgumentConverter { + + private static final CharsetEncoder ASCII_ENCODER = StandardCharsets.US_ASCII.newEncoder(); + + protected HexStringToByteArray() { + super(String.class, byte[].class); + } + + @Override + protected byte[] convert(String source) throws ArgumentConversionException { + if (source == null) return null; + if (source.trim().isEmpty()) return new byte[0]; + String[] octets = source.split(" "); + byte[] result = new byte[octets.length]; + for (int i = 0; i < octets.length; i++) { + if (octets[i].length() == 1) { + char c = octets[i].charAt(0); + if (!ASCII_ENCODER.canEncode(c)) { + throw new IllegalArgumentException("Cannot convert non-ascii character: " + c); + } + result[i] = (byte) c; + } else { + result[i] = (byte) Integer.parseInt(octets[i], 16); + } + } + return result; + } + } + + /** + * Converts a string of {@code $}-prefixed symbol ids (e.g. {@code $10 $11}) to a long[] for a {@code @ParameterizedTest} + */ + static class SymbolIdsToLongArray extends TypedArgumentConverter { + protected SymbolIdsToLongArray() { + super(String.class, long[].class); + } + + @Override + protected long[] convert(String source) throws ArgumentConversionException { + if (source == null) return null; + int size = (int) source.chars().filter(i -> i == '$').count(); + String[] sids = source.split("\\$"); + long[] result = new long[size]; + int i = 0; + for (String sid : sids) { + if (sid.isEmpty()) continue; + result[i] = Long.parseLong(sid.trim()); + i++; + } + return result; + } + } }