Skip to content

Commit

Permalink
[CALCITE-5993] Add CODE_POINTS_TO_STRING, TO_CODE_POINTS function (en…
Browse files Browse the repository at this point in the history
…abled in BigQuery library)
  • Loading branch information
macroguo-ghy authored and tanclary committed Oct 11, 2023
1 parent 454899a commit 77b3689
Show file tree
Hide file tree
Showing 10 changed files with 327 additions and 14 deletions.
154 changes: 154 additions & 0 deletions babel/src/test/resources/sql/big-query.iq
Original file line number Diff line number Diff line change
Expand Up @@ -2184,6 +2184,160 @@ SELECT CODE_POINTS_TO_BYTES(array[2147483648, 1]);
Input arguments of CODE_POINTS_TO_BYTES out of range: 2147483648
!error

#####################################################################
# CODE_POINTS_TO_STRING(array<integer>)
#
# Takes an array of Unicode code points as ARRAY<INT64>
# and returns a STRING.
#
SELECT CODE_POINTS_TO_STRING(array[65, 66, 67, 68]) as result;
+--------+
| result |
+--------+
| ABCD |
+--------+
(1 row)

!ok

SELECT CODE_POINTS_TO_STRING(array[255, 254, 1024, 70000]) as result;
+--------+
| result |
+--------+
| ÿþЀ𑅰 |
+--------+
(1 row)

!ok

SELECT CODE_POINTS_TO_STRING(array[1+2, 3]) as result;
+--------+
| result |
+--------+
|  |
+--------+
(1 row)

!ok

SELECT CODE_POINTS_TO_STRING(null) as result;
+--------+
| result |
+--------+
| |
+--------+
(1 row)

!ok

SELECT CODE_POINTS_TO_STRING(array[65, null]) as result;
+--------+
| result |
+--------+
| |
+--------+
(1 row)

!ok

SELECT CODE_POINTS_TO_STRING('abc') as result;
Cannot apply 'CODE_POINTS_TO_STRING' to arguments of type 'CODE_POINTS_TO_STRING(<CHAR(3)>)'. Supported form(s): CODE_POINTS_TO_STRING(<INTEGER ARRAY>)
!error

SELECT CODE_POINTS_TO_STRING(array[-1]) as result;
Input arguments of CODE_POINTS_TO_STRING out of range: -1
!error

SELECT CODE_POINTS_TO_STRING(array[2147483648, 1]);
Input arguments of CODE_POINTS_TO_STRING out of range: 2147483648
!error

#####################################################################
# TO_CODE_POINTS(value)
#
# Takes a STRING or BYTES value and returns an array of INT64 values
# that represent code points or extended ASCII character values.
# 1. If value is a STRING, each element in the returned array
# represents a code point. Each code point falls within the range of
# [0, 0xD7FF] and [0xE000, 0x10FFFF].
# 2. If value is BYTES, each element in the array is an extended
# ASCII character value in the range of [0, 255].
#
SELECT TO_CODE_POINTS('ABCD') as result;
+------------------+
| result |
+------------------+
| [65, 66, 67, 68] |
+------------------+
(1 row)

!ok

SELECT TO_CODE_POINTS(x'11223344') as result;
+------------------+
| result |
+------------------+
| [17, 34, 51, 68] |
+------------------+
(1 row)

!ok

SELECT TO_CODE_POINTS(CODE_POINTS_TO_STRING(array[255, 254, 1024, 70000, 65])) as result;
+-----------------------------+
| result |
+-----------------------------+
| [255, 254, 1024, 70000, 65] |
+-----------------------------+
(1 row)

!ok

SELECT TO_CODE_POINTS(CODE_POINTS_TO_BYTES(array[64, 65, 66, 67])) as result;
+------------------+
| result |
+------------------+
| [64, 65, 66, 67] |
+------------------+
(1 row)

!ok

SELECT TO_CODE_POINTS(null) as result;
+--------+
| result |
+--------+
| |
+--------+
(1 row)

!ok

SELECT TO_CODE_POINTS('') as result;
+--------+
| result |
+--------+
| |
+--------+
(1 row)

!ok

SELECT TO_CODE_POINTS(x'') as result;
+--------+
| result |
+--------+
| |
+--------+
(1 row)

!ok

SELECT to_code_points(array[1, 2, 3]) as result;
Error while executing SQL "SELECT to_code_points(array[1, 2, 3]) as result": From line 1, column 8 to line 1, column 37: Cannot apply 'TO_CODE_POINTS' to arguments of type 'TO_CODE_POINTS(<INTEGER ARRAY>)'. Supported form(s): 'TO_CODE_POINTS(<STRING>)'
'TO_CODE_POINTS(<BINARY>)'
!error

#####################################################################
# DATE
#
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@
import static org.apache.calcite.sql.fun.SqlLibraryOperators.CHAR;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.CHR;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.CODE_POINTS_TO_BYTES;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.CODE_POINTS_TO_STRING;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.COMPRESS;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.CONCAT2;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.CONCAT_FUNCTION;
Expand Down Expand Up @@ -264,6 +265,7 @@
import static org.apache.calcite.sql.fun.SqlLibraryOperators.TO_BASE32;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.TO_BASE64;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.TO_CHAR;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.TO_CODE_POINTS;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.TO_HEX;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.TRANSLATE3;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.TRUNC;
Expand Down Expand Up @@ -568,6 +570,10 @@ Builder populate() {
NullPolicy.SEMI_STRICT);
defineMethod(CODE_POINTS_TO_BYTES, BuiltInMethod.CODE_POINTS_TO_BYTES.method,
NullPolicy.STRICT);
defineMethod(CODE_POINTS_TO_STRING, BuiltInMethod.CODE_POINTS_TO_STRING.method,
NullPolicy.STRICT);
defineMethod(TO_CODE_POINTS, BuiltInMethod.TO_CODE_POINTS.method,
NullPolicy.STRICT);
defineMethod(REPEAT, BuiltInMethod.REPEAT.method, NullPolicy.STRICT);
defineMethod(SPACE, BuiltInMethod.SPACE.method, NullPolicy.STRICT);
defineMethod(STRCMP, BuiltInMethod.STRCMP.method, NullPolicy.STRICT);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -298,8 +298,8 @@ ExInst<CalciteException> invalidCompare(String a0, String a1, String a2,
@BaseMessage("Date literal ''{0}'' out of range")
ExInst<SqlValidatorException> dateLiteralOutOfRange(String a0);

@BaseMessage("Input arguments of CODE_POINTS_TO_BYTES out of range: {0,number,#}")
ExInst<CalciteException> inputArgumentsOfCodePointsToBytesOutOfRange(long a0);
@BaseMessage("Input arguments of {0} out of range: {1,number,#}; should be in the range of {2}")
ExInst<CalciteException> inputArgumentsOfFunctionOutOfRange(String a0, Number a1, String a2);

@BaseMessage("String literal continued on same line")
ExInst<SqlValidatorException> stringFragsOnSameLine();
Expand Down
63 changes: 59 additions & 4 deletions core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
Original file line number Diff line number Diff line change
Expand Up @@ -1194,7 +1194,7 @@ public static String charFromUtf8(int n) {
}

/**
* SQL CODE_POINTS_TO_BYTES function.
* SQL CODE_POINTS_TO_BYTES(list) function.
*/
public static @Nullable ByteString codePointsToBytes(List codePoints) {
int length = codePoints.size();
Expand All @@ -1207,14 +1207,70 @@ public static String charFromUtf8(int n) {
assert codePoint instanceof Number;
long cp = ((Number) codePoint).longValue();
if (cp < 0 || cp > 255) {
throw RESOURCE.inputArgumentsOfCodePointsToBytesOutOfRange(cp).ex();
throw RESOURCE.inputArgumentsOfFunctionOutOfRange(
"CODE_POINTS_TO_BYTES", cp, "[0, 255]").ex();
}
bytes[i] = (byte) cp;
}

return new ByteString(bytes);
}

/**
 * SQL CODE_POINTS_TO_STRING(list) function.
 *
 * <p>Appends the character for every element of {@code codePoints}, in order,
 * and returns the concatenation. Returns null as soon as a null element is
 * encountered. Throws if an element lies outside the ranges
 * [0, 0xD7FF] and [0xE000, 0x10FFFF].
 */
public static @Nullable String codePointsToString(List codePoints) {
  final StringBuilder result = new StringBuilder();
  for (Object element : codePoints) {
    if (element == null) {
      return null;
    }
    assert element instanceof Number;
    final long value = ((Number) element).longValue();
    // The gap between the two accepted ranges is 0xD800-0xDFFF.
    final boolean inLowerRange = value >= 0 && value <= 0xD7FF;
    final boolean inUpperRange = value >= 0xE000 && value <= 0x10FFFF;
    if (!inLowerRange && !inUpperRange) {
      throw RESOURCE.inputArgumentsOfFunctionOutOfRange(
          "CODE_POINTS_TO_STRING", value, "[0, 0xD7FF] and [0xE000, 0x10FFFF]").ex();
    }
    result.append(charFromUtf8((int) value));
  }
  return result.toString();
}

/**
 * SQL TO_CODE_POINTS(string) function.
 *
 * <p>Returns the code points of {@code s} in order of appearance, or null
 * when {@code s} is empty.
 */
public static @Nullable List<Integer> toCodePoints(String s) {
  if (s.isEmpty()) {
    return null;
  }
  final ImmutableList.Builder<Integer> codePoints = ImmutableList.builder();
  for (int pos = 0; pos < s.length();) {
    final int codePoint = s.codePointAt(pos);
    codePoints.add(codePoint);
    // Advance by one char, or by two when the code point was read from a
    // surrogate pair.
    pos += Character.charCount(codePoint);
  }
  return codePoints.build();
}

/**
 * SQL TO_CODE_POINTS(string) function for binary string.
 *
 * <p>Returns one integer per byte of {@code s}, each in the range [0, 255],
 * or null when {@code s} is empty.
 */
public static @Nullable List<Integer> toCodePoints(ByteString s) {
  if (s.length() == 0) {
    return null;
  }
  final ImmutableList.Builder<Integer> builder = new ImmutableList.Builder<>();
  for (byte b : s.getBytes()) {
    // Java bytes are signed: a plain (int) cast sign-extends 0x80-0xFF to
    // -128..-1. Mask the low 8 bits so every value lands in [0, 255].
    builder.add(b & 0xFF);
  }
  return builder.build();
}

/** SQL OCTET_LENGTH(binary) function. */
public static int octetLength(ByteString s) {
return s.length();
Expand Down Expand Up @@ -2897,8 +2953,7 @@ public static double atanh(BigDecimal b) {
/** SQL <code>ATANH</code> operator applied to double values. */
public static double atanh(double b) {
if (Math.abs(b) >= 1) {
throw new IllegalArgumentException("Input parameter of atanh cannot be out of the "
+ "range (-1, 1)!");
throw RESOURCE.inputArgumentsOfFunctionOutOfRange("ATANH", b, "(-1, 1)").ex();
}
final double mult;
// check the sign bit of the raw representation to handle -0.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,7 @@ private static RelDataType deriveTypePad(SqlOperatorBinding binding, RelDataType
SqlBasicFunction.create("SPLIT",
ReturnTypes.ARG0
.andThen(SqlLibraryOperators::deriveTypeSplit)
.andThen(SqlTypeTransforms.TO_ARRAY)
.andThen(SqlTypeTransforms.TO_NULLABLE),
.andThen(SqlTypeTransforms.TO_ARRAY_NULLABLE),
OperandTypes.or(OperandTypes.CHARACTER_CHARACTER,
OperandTypes.CHARACTER,
OperandTypes.BINARY_BINARY,
Expand Down Expand Up @@ -1856,13 +1855,34 @@ private static RelDataType deriveTypeMapFromEntries(SqlOperatorBinding opBinding
OperandTypes.INTEGER,
SqlFunctionCategory.STRING);

/** The "CODE_POINTS_TO_BYTES(integers)" function (BigQuery); Converts an array of extended ASCII
* code points to bytes. */
@LibraryOperator(libraries = {BIG_QUERY})
public static final SqlFunction CODE_POINTS_TO_BYTES =
SqlBasicFunction.create("CODE_POINTS_TO_BYTES",
ReturnTypes.VARBINARY_NULLABLE,
OperandTypes.ARRAY_OF_INTEGER,
SqlFunctionCategory.STRING);

/** The "CODE_POINTS_TO_STRING(integers)" function (BigQuery); Converts an array of Unicode code
* points to string. */
@LibraryOperator(libraries = {BIG_QUERY})
public static final SqlFunction CODE_POINTS_TO_STRING =
SqlBasicFunction.create("CODE_POINTS_TO_STRING",
ReturnTypes.VARCHAR_NULLABLE,
OperandTypes.ARRAY_OF_INTEGER,
SqlFunctionCategory.STRING);

/** The "TO_CODE_POINTS(string or binary)" function (BigQuery); Converts a {@code string} or
* {@code binary} value to an array of integers that represent code points or extended ASCII
* character values. */
@LibraryOperator(libraries = {BIG_QUERY})
public static final SqlFunction TO_CODE_POINTS =
SqlBasicFunction.create("TO_CODE_POINTS",
ReturnTypes.INTEGER.andThen(SqlTypeTransforms.TO_ARRAY_NULLABLE),
OperandTypes.STRING.or(OperandTypes.BINARY),
SqlFunctionCategory.STRING);

@LibraryOperator(libraries = {ALL})
public static final SqlFunction TANH =
SqlBasicFunction.create("TANH",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,14 +224,22 @@ private SqlTypeName toVar(RelDataType type) {

/**
* Parameter type-inference transform strategy that wraps a given type
* in a array.
* in an array.
*
* @see org.apache.calcite.rel.type.RelDataTypeFactory#createArrayType(RelDataType, long)
*/
public static final SqlTypeTransform TO_ARRAY =
(opBinding, typeToTransform) ->
opBinding.getTypeFactory().createArrayType(typeToTransform, -1);

/**
* Parameter type-inference transform strategy that wraps a given type in an array
* (as {@link #TO_ARRAY} does) and then applies {@link #TO_NULLABLE} to the result,
* so that the array type is nullable if any of a call's operands is nullable.
*/
public static final SqlTypeTransform TO_ARRAY_NULLABLE =
(opBinding, typeToTransform) ->
TO_NULLABLE.transformType(opBinding, TO_ARRAY.transformType(opBinding, typeToTransform));

/** Parameter type-inference transform that transforms {@code T} to
* {@code MEASURE<T>} for some type T. */
public static final SqlTypeTransform TO_MEASURE =
Expand Down
2 changes: 2 additions & 0 deletions core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,8 @@ public enum BuiltInMethod {
CHAR_FROM_ASCII(SqlFunctions.class, "charFromAscii", int.class),
CHAR_FROM_UTF8(SqlFunctions.class, "charFromUtf8", int.class),
CODE_POINTS_TO_BYTES(SqlFunctions.class, "codePointsToBytes", List.class),
CODE_POINTS_TO_STRING(SqlFunctions.class, "codePointsToString", List.class),
TO_CODE_POINTS(SqlFunctions.class, "toCodePoints", String.class),
CONVERT(SqlFunctions.class, "convertWithCharset", String.class, String.class,
String.class),
EXP(SqlFunctions.class, "exp", double.class),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ OperandNotComparable=Operands {0} not comparable to each other
TypeNotComparableEachOther=Types {0} not comparable to each other
NumberLiteralOutOfRange=Numeric literal ''{0}'' out of range
DateLiteralOutOfRange=Date literal ''{0}'' out of range
InputArgumentsOfCodePointsToBytesOutOfRange=Input arguments of CODE_POINTS_TO_BYTES out of range: {0,number,#}
InputArgumentsOfFunctionOutOfRange=Input arguments of {0} out of range: {1,number,#}; should be in the range of {2}
StringFragsOnSameLine=String literal continued on same line
AliasMustBeSimpleIdentifier=Table or column alias must be a simple identifier
CharLiteralAliasNotValid=Expecting alias, found character literal
Expand Down
2 changes: 2 additions & 0 deletions site/_docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -2686,6 +2686,7 @@ BigQuery's type system uses confusingly different names for types and functions:
| m s | CHAR(integer) | Returns the character whose ASCII code is *integer* % 256, or null if *integer* &lt; 0
| b o p | CHR(integer) | Returns the character whose UTF-8 code is *integer*
| b | CODE_POINTS_TO_BYTES(integers) | Converts *integers*, an array of integers between 0 and 255 inclusive, into bytes; throws error if any element is out of range
| b | CODE_POINTS_TO_STRING(integers) | Converts *integers*, an array of integers between 0 and 0xD7FF or between 0xE000 and 0x10FFFF inclusive, into string; throws error if any element is out of range
| o | CONCAT(string, string) | Concatenates two strings, returns null only when both string arguments are null, otherwise treats null as empty string
| b m | CONCAT(string [, string ]*) | Concatenates one or more strings, returns null if any of the arguments is null
| p q | CONCAT(string [, string ]*) | Concatenates one or more strings, null is treated as empty string
Expand Down Expand Up @@ -2845,6 +2846,7 @@ BigQuery's type system uses confusingly different names for types and functions:
| b | TIME_SUB(time, interval) | Returns the TIME value that is *interval* before *time*
| b | TIME_TRUNC(time, timeUnit) | Truncates *time* to the granularity of *timeUnit*, rounding to the beginning of the unit
| m o p | TO_CHAR(timestamp, format) | Converts *timestamp* to a string using the format *format*
| b | TO_CODE_POINTS(string) | Converts *string*, a string or binary value, to an array of integers that represent code points (for a string) or extended ASCII character values (for binary)
| o p | TO_DATE(string, format) | Converts *string* to a date using the format *format*
| o p | TO_TIMESTAMP(string, format) | Converts *string* to a timestamp using the format *format*
| b o p | TRANSLATE(expr, fromString, toString) | Returns *expr* with all occurrences of each character in *fromString* replaced by its corresponding character in *toString*. Characters in *expr* that are not in *fromString* are not replaced
Expand Down
Loading

0 comments on commit 77b3689

Please sign in to comment.