Skip to content

Commit

Permalink
Adds read support for Ion 1.1 system symbols (#944)
Browse files Browse the repository at this point in the history
* Adds read support for system symbols

* Fixes and re-enables EncodingDirectiveCompilationTest.

* Allows the binary reader to match user symbol IDs that map to the same text as a system symbol.

* Caches a SymbolToken instance for each Ion 1.1 symbol token

* Adds suggested changes

---------

Co-authored-by: Tyler Gregg <[email protected]>
  • Loading branch information
popematt and tgregg authored Sep 27, 2024
1 parent 714dbcf commit c5562bf
Show file tree
Hide file tree
Showing 18 changed files with 516 additions and 319 deletions.
30 changes: 0 additions & 30 deletions src/main/java/com/amazon/ion/SystemSymbols.java
Original file line number Diff line number Diff line change
Expand Up @@ -134,34 +134,4 @@ private SystemSymbols() { }
* The name of the macro table s-expression within an Ion encoding directive.
*/
public static final String MACRO_TABLE = "macro_table";

/**
* The name of the "annotate" system macro.
*/
public static final String ANNOTATE = "annotate";

/**
* The name of the "literal" special form.
*/
public static final String LITERAL = "literal";

/**
* The name of the "macro" s-expression in the macro table.
*/
public static final String MACRO = "macro";

/**
* The name of the "export" s-expression in the macro table.
*/
public static final String EXPORT = "export";

/**
* The name of the "make_sexp" system macro.
*/
public static final String MAKE_SEXP = "make_sexp";

/**
* The sigil used to denote an expression group in TDL.
*/
public static final String TDL_EXPRESSION_GROUP = ";";
}
112 changes: 66 additions & 46 deletions src/main/java/com/amazon/ion/impl/IonCursorBinary.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,13 @@
import static com.amazon.ion.impl.IonTypeID.DELIMITED_END_ID;
import static com.amazon.ion.impl.IonTypeID.ONE_ANNOTATION_FLEX_SYM_LOWER_NIBBLE_1_1;
import static com.amazon.ion.impl.IonTypeID.ONE_ANNOTATION_SID_LOWER_NIBBLE_1_1;
import static com.amazon.ion.impl.IonTypeID.SYSTEM_MACRO_INVOCATION_ID;
import static com.amazon.ion.impl.IonTypeID.SYSTEM_SYMBOL_VALUE;
import static com.amazon.ion.impl.IonTypeID.TWO_ANNOTATION_FLEX_SYMS_LOWER_NIBBLE_1_1;
import static com.amazon.ion.impl.IonTypeID.TWO_ANNOTATION_SIDS_LOWER_NIBBLE_1_1;
import static com.amazon.ion.impl.IonTypeID.TYPE_IDS_1_1;
import static com.amazon.ion.impl.bin.Ion_1_1_Constants.FLEX_SYM_MAX_SYSTEM_SYMBOL;
import static com.amazon.ion.impl.bin.Ion_1_1_Constants.FLEX_SYM_SYSTEM_SYMBOL_OFFSET;
import static com.amazon.ion.util.IonStreamUtils.throwAsIonException;

/**
Expand Down Expand Up @@ -306,7 +309,7 @@ private static class ArgumentGroupMarker {
/**
* The major version of the Ion encoding currently being read.
*/
private int majorVersion = -1;
private int majorVersion = 1;

/**
* The minor version of the Ion encoding currently being read.
Expand Down Expand Up @@ -1490,24 +1493,17 @@ private long uncheckedReadFlexInt_1_1() {
* Marker's endIndex is set to the symbol ID value and its startIndex is set to -1. When this FlexSym wraps a
* delimited end marker, neither the Marker's startIndex nor its endIndex is set.
* @param markerToSet the marker to populate.
* @return the symbol ID value if one was present, otherwise -1.
* @return the user-space symbol ID value if one was present, otherwise -1.
*/
private long uncheckedReadFlexSym_1_1(Marker markerToSet) {
long result = uncheckedReadFlexInt_1_1();
if (result == 0) {
int nextByte = buffer[(int)(peekIndex++)];
if (nextByte == OpCodes.INLINE_SYMBOL_ZERO_LENGTH) {
// Symbol zero.
markerToSet.endIndex = 0;
return 0;
}
if (nextByte == OpCodes.STRING_ZERO_LENGTH) {
// Inline symbol with zero length.
markerToSet.startIndex = peekIndex;
markerToSet.endIndex = peekIndex;
if (isFlexSymSystemSymbolOrSid0(nextByte & SINGLE_BYTE_MASK)) {
setSystemSymbolMarker(markerToSet, (byte)(nextByte - FLEX_SYM_SYSTEM_SYMBOL_OFFSET));
return -1;
} else if (nextByte != OpCodes.DELIMITED_END_MARKER) {
throw new IonException("FlexSym 0 may only precede symbol zero, empty string, or delimited end.");
throw new IonException("FlexSym 0 may only precede symbol zero, system symbol, or delimited end.");
}
markerToSet.typeId = IonTypeID.DELIMITED_END_ID;
return -1;
Expand All @@ -1523,6 +1519,16 @@ private long uncheckedReadFlexSym_1_1(Marker markerToSet) {
return result;
}

/*
* Determines whether a byte (specifically, the byte following a FlexSym escape byte) represents a system symbol.
*
* @param byteAfterEscapeCode The unsigned value of the byte after the FlexSym escape byte
* @return true if the byte is in the reserved range for system symbols or $0.
*/
private static boolean isFlexSymSystemSymbolOrSid0(int byteAfterEscapeCode) {
return byteAfterEscapeCode >= FLEX_SYM_SYSTEM_SYMBOL_OFFSET && byteAfterEscapeCode <= FLEX_SYM_MAX_SYSTEM_SYMBOL;
}

/**
* Reads a FlexInt into a long, ensuring enough space is available in the buffer. After this method returns false,
* `peekIndex` points to the first byte after the end of the FlexInt and `markerToSet.endIndex` contains the
Expand Down Expand Up @@ -1589,15 +1595,8 @@ private boolean slowReadFlexSym_1_1(Marker markerToSet) {
if (nextByte < 0) {
return true;
}
if ((byte) nextByte == OpCodes.INLINE_SYMBOL_ZERO_LENGTH) {
// Symbol zero.
markerToSet.endIndex = 0;
return false;
}
if ((byte) nextByte == OpCodes.STRING_ZERO_LENGTH) {
// Inline symbol with zero length.
markerToSet.startIndex = peekIndex;
markerToSet.endIndex = peekIndex;
if (isFlexSymSystemSymbolOrSid0(nextByte)) {
setSystemSymbolMarker(markerToSet, nextByte - FLEX_SYM_SYSTEM_SYMBOL_OFFSET);
return false;
} else if ((byte) nextByte != OpCodes.DELIMITED_END_MARKER) {
throw new IonException("FlexSyms may only wrap symbol zero, empty string, or delimited end.");
Expand Down Expand Up @@ -1645,6 +1644,12 @@ IonTypeID typeIdFor(int length) {
return TYPE_IDS_1_1[OpCodes.SYMBOL_ADDRESS_MANY_BYTES & SINGLE_BYTE_MASK];
}
},
SYSTEM_SYMBOL_ID {
@Override
IonTypeID typeIdFor(int length) {
return SYSTEM_SYMBOL_VALUE;
}
},
STRUCT_END {
@Override
IonTypeID typeIdFor(int length) {
Expand All @@ -1661,11 +1666,8 @@ static FlexSymType classifySpecialFlexSym(int specialByte) {
if (specialByte < 0) {
return FlexSymType.INCOMPLETE;
}
if ((byte) specialByte == OpCodes.INLINE_SYMBOL_ZERO_LENGTH) {
return FlexSymType.SYMBOL_ID;
}
if ((byte) specialByte == OpCodes.STRING_ZERO_LENGTH) {
return FlexSymType.INLINE_TEXT;
if (isFlexSymSystemSymbolOrSid0(specialByte)) {
return FlexSymType.SYSTEM_SYMBOL_ID;
}
if ((byte) specialByte == OpCodes.DELIMITED_END_MARKER) {
return FlexSymType.STRUCT_END;
Expand All @@ -1692,7 +1694,12 @@ private FlexSymType uncheckedSkipFlexSym_1_1(Marker markerToSet) {
if (result == 0) {
markerToSet.startIndex = peekIndex + 1;
markerToSet.endIndex = markerToSet.startIndex;
return FlexSymType.classifySpecialFlexSym(buffer[(int) peekIndex++] & SINGLE_BYTE_MASK);
int specialByte = buffer[(int) peekIndex++] & SINGLE_BYTE_MASK;
FlexSymType type = FlexSymType.classifySpecialFlexSym(specialByte);
if (type == FlexSymType.SYSTEM_SYMBOL_ID) {
setSystemSymbolMarker(markerToSet, (byte)(specialByte - FLEX_SYM_SYSTEM_SYMBOL_OFFSET));
}
return type;
} else if (result < 0) {
markerToSet.startIndex = peekIndex;
markerToSet.endIndex = peekIndex - result;
Expand Down Expand Up @@ -1720,11 +1727,19 @@ private FlexSymType slowSkipFlexSym_1_1(Marker markerToSet) {
result |= ~(-1 >>> Long.numberOfLeadingZeros(result));
}
if (result == 0) {
FlexSymType flexSymType = FlexSymType.classifySpecialFlexSym(slowReadByte());
int specialByte = slowReadByte();
FlexSymType flexSymType = FlexSymType.classifySpecialFlexSym(specialByte);
if (markerToSet != null && flexSymType != FlexSymType.INCOMPLETE) {
markerToSet.startIndex = peekIndex;
markerToSet.endIndex = peekIndex;
}
if (markerToSet != null && flexSymType == FlexSymType.SYSTEM_SYMBOL_ID) {
// FIXME: See if we can set the SID in the endIndex here without causing the slow reader to get confused
// about where the end of the value is for tagless symbols.
// I.e. use setSystemSymbolMarker(markerToSet, (byte)(specialByte - FLEX_SYM_SYSTEM_SYMBOL_OFFSET));
markerToSet.typeId = SYSTEM_SYMBOL_VALUE;
markerToSet.startIndex = peekIndex - 1;
}
return flexSymType;
} else if (result < 0) {
if (markerToSet != null) {
Expand Down Expand Up @@ -2229,26 +2244,28 @@ private void validateAnnotationWrapperEndIndex(long endIndex) {
}
}

/*
* The given Marker's endIndex is set to the system symbol ID value and its startIndex is set to -1
* @param markerToSet the marker to set.
*/
private void setSystemSymbolMarker(Marker markerToSet, int systemSid) {
event = Event.START_SCALAR;
markerToSet.typeId = SYSTEM_SYMBOL_VALUE;
markerToSet.startIndex = -1;
markerToSet.endIndex = systemSid;
}

/**
* Sets the given marker to represent the current system token (system macro invocation or system symbol value).
* Before calling this method, `macroInvocationId` must be set from the one-byte FixedInt that represents the ID;
* positive values indicate a macro address, while negative values indicate a system symbol ID.
* @param valueTid the type ID of the system token.
* Sets the given marker to represent the current system macro invocation.
* Before calling this method, `macroInvocationId` must be set from the one-byte FixedUInt that represents the ID.
* @param markerToSet the marker to set.
*/
private void setSystemTokenMarker(IonTypeID valueTid, Marker markerToSet) {
private void setSystemMacroInvocationMarker(Marker markerToSet) {
isSystemInvocation = true;
event = Event.NEEDS_INSTRUCTION;
markerToSet.typeId = SYSTEM_MACRO_INVOCATION_ID;
markerToSet.startIndex = peekIndex;
if (macroInvocationId < 0) {
// This is a system symbol value.
event = Event.START_SCALAR;
markerToSet.typeId = SYSTEM_SYMBOL_VALUE;
markerToSet.endIndex = peekIndex;
} else {
event = Event.NEEDS_INSTRUCTION;
markerToSet.typeId = valueTid;
markerToSet.endIndex = -1;
}
markerToSet.endIndex = -1;
}

/**
Expand Down Expand Up @@ -2293,9 +2310,9 @@ private void uncheckedReadMacroInvocationHeader(IonTypeID valueTid, Marker marke
setUserMacroInvocationMarker(valueTid, markerToSet, uncheckedReadFlexUInt_1_1());
return;
} else {
// Opcode 0xEF: system macro invocation or system symbol value.
// Opcode 0xEF: system macro invocation
macroInvocationId = buffer[(int) peekIndex++];
setSystemTokenMarker(valueTid, markerToSet);
setSystemMacroInvocationMarker(markerToSet);
return;
}
} else if (valueTid.length > 0) {
Expand Down Expand Up @@ -2461,7 +2478,7 @@ private boolean slowReadMacroInvocationHeader(IonTypeID valueTid, Marker markerT
}
// The downcast to byte then upcast to long results in sign extension, treating the byte as a FixedInt.
macroInvocationId = (byte) truncatedId;
setSystemTokenMarker(valueTid, markerToSet);
setSystemMacroInvocationMarker(markerToSet);
return false;
}
} else if (valueTid.length > 0) {
Expand Down Expand Up @@ -3123,6 +3140,9 @@ private long calculateTaglessLengthAndType(TaglessEncoding taglessEncoding) {
default:
throw new IllegalStateException("Length is built into the primitive type's IonTypeID.");
}
if (valueTid == SYSTEM_SYMBOL_VALUE) {
return 1;
}
if (length >= 0) {
valueMarker.endIndex = peekIndex + length;
}
Expand Down
Loading

0 comments on commit c5562bf

Please sign in to comment.