Skip to content

Commit

Permalink
[CALCITE-5978] Add REGEXP_INSTR function (enabled in BigQuery library)
Browse files Browse the repository at this point in the history
  • Loading branch information
Anthrino authored and tanclary committed Sep 15, 2023
1 parent f3f5e7e commit 9e3ea96
Show file tree
Hide file tree
Showing 9 changed files with 388 additions and 22 deletions.
155 changes: 155 additions & 0 deletions babel/src/test/resources/sql/big-query.iq
Original file line number Diff line number Diff line change
Expand Up @@ -1244,6 +1244,161 @@ SELECT REGEXP_EXTRACT_ALL("abcadcabcaecghi", "(a.c).(.*)$");
Multiple capturing groups (count=2) not allowed in regex input for REGEXP_EXTRACT_ALL
!error

#####################################################################
# REGEXP_INSTR(value, regexp[, position[, occurrence[, occurrence_position]]])
#
# Returns the lowest 1-based position of regexp in value.
# Returns 0 if there is no match, if regexp is empty, or if position or occurrence is beyond range.
# Raises an error if regexp, position, occurrence or occurrence_position is invalid.

# Two-argument form (position and occurrence default to 1): reports the 1-based
# index of the first match in each value, or 0 when the value has no match ('abc-ef').
WITH example AS (
SELECT 'ab@cd-ef' AS source_value, '@[^-]*' AS regexp UNION ALL
SELECT 'ab@d-ef', '@[^-]*' UNION ALL
SELECT 'abc@cd-ef', '@[^-]*' UNION ALL
SELECT 'abc-ef', '@[^-]*')
SELECT source_value, regexp, REGEXP_INSTR(source_value, regexp) AS instr
FROM example;
+--------------+--------+-------+
| source_value | regexp | instr |
+--------------+--------+-------+
| ab@cd-ef | @[^-]* | 3 |
| ab@d-ef | @[^-]* | 3 |
| abc-ef | @[^-]* | 0 |
| abc@cd-ef | @[^-]* | 4 |
+--------------+--------+-------+
(4 rows)

!ok

# position is the 1-based offset at which the search starts: starting at 1 or 2
# still finds the first '@' (index 2); starting at 3 or 4 skips it, so the
# second '@' (index 10) is reported.
WITH example AS (
SELECT 'a@cd-ef b@cd-ef' AS source_value, '@[^-]*' AS regexp, 1 AS position UNION ALL
SELECT 'a@cd-ef b@cd-ef', '@[^-]*', 2 UNION ALL
SELECT 'a@cd-ef b@cd-ef', '@[^-]*', 3 UNION ALL
SELECT 'a@cd-ef b@cd-ef', '@[^-]*', 4)
SELECT
source_value, regexp, position,
REGEXP_INSTR(source_value, regexp, position) AS instr
FROM example;
+-----------------+--------+----------+-------+
| source_value | regexp | position | instr |
+-----------------+--------+----------+-------+
| a@cd-ef b@cd-ef | @[^-]* | 1 | 2 |
| a@cd-ef b@cd-ef | @[^-]* | 2 | 2 |
| a@cd-ef b@cd-ef | @[^-]* | 3 | 10 |
| a@cd-ef b@cd-ef | @[^-]* | 4 | 10 |
+-----------------+--------+----------+-------+
(4 rows)

!ok

# occurrence selects the Nth match counted from position: the three '@...'
# matches start at indexes 2, 10 and 18.
WITH example AS (
SELECT 'a@cd-ef b@cd-ef c@cd-ef' AS source_value,
'@[^-]*' AS regexp, 1 AS position, 1 AS occurrence UNION ALL
SELECT 'a@cd-ef b@cd-ef c@cd-ef', '@[^-]*', 1, 2 UNION ALL
SELECT 'a@cd-ef b@cd-ef c@cd-ef', '@[^-]*', 1, 3)
SELECT
source_value, regexp, position, occurrence,
REGEXP_INSTR(source_value, regexp, position, occurrence) AS instr
FROM example;
+-------------------------+--------+----------+------------+-------+
| source_value | regexp | position | occurrence | instr |
+-------------------------+--------+----------+------------+-------+
| a@cd-ef b@cd-ef c@cd-ef | @[^-]* | 1 | 1 | 2 |
| a@cd-ef b@cd-ef c@cd-ef | @[^-]* | 1 | 2 | 10 |
| a@cd-ef b@cd-ef c@cd-ef | @[^-]* | 1 | 3 | 18 |
+-------------------------+--------+----------+------------+-------+
(3 rows)

!ok

# occurrence_position chooses which edge of the match is reported: 0 gives the
# index where the match starts (2), 1 gives the index just after the match
# ('@cd' covers 2-4, so 5).
WITH example AS (
SELECT 'a@cd-ef' AS source_value, '@[^-]*' AS regexp,
1 AS position, 1 AS occurrence, 0 AS o_position UNION ALL
SELECT 'a@cd-ef', '@[^-]*', 1, 1, 1)
SELECT
source_value, regexp, position, occurrence, o_position,
REGEXP_INSTR(source_value, regexp, position, occurrence, o_position) AS instr
FROM example;
+--------------+--------+----------+------------+------------+-------+
| source_value | regexp | position | occurrence | o_position | instr |
+--------------+--------+----------+------------+------------+-------+
| a@cd-ef | @[^-]* | 1 | 1 | 0 | 2 |
| a@cd-ef | @[^-]* | 1 | 1 | 1 | 5 |
+--------------+--------+----------+------------+------------+-------+
(2 rows)

!ok

# The match begins at the first character of the value, so the result is 1.
SELECT REGEXP_INSTR("abcadcabcaecghi", "a.+c");
+--------+
| EXPR$0 |
+--------+
| 1 |
+--------+
(1 row)

!ok

# With a capturing group, the group's position is reported rather than the whole
# match's: searching from position 4, 'abc(a.c)' matches at 7, and the group
# ('aec') starts at 10.
SELECT REGEXP_INSTR("abcadcabcaecghi", "abc(a.c)", 4);
+--------+
| EXPR$0 |
+--------+
| 10 |
+--------+
(1 row)

!ok

# position (25) is beyond the end of the 15-character value: result is 0.
SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 25);
+--------+
| EXPR$0 |
+--------+
| 0 |
+--------+
(1 row)

!ok

# Only four matches of 'a.c' exist, so asking for the fifth occurrence yields 0.
SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 1, 5);
+--------+
| EXPR$0 |
+--------+
| 0 |
+--------+
(1 row)

!ok

# Second occurrence ('a5c' at 7-9) with occurrence_position = 1: the index just
# after the match, 10, is returned.
SELECT REGEXP_INSTR("a9cadca5c4aecghi", "a[0-9]c", 1, 2, 1);
+--------+
| EXPR$0 |
+--------+
| 10 |
+--------+
(1 row)

!ok

# A syntactically invalid regex (unbalanced ')') is rejected.
SELECT REGEXP_INSTR("abc def ghi", "adz)");
Invalid regular expression for REGEXP_INSTR: 'Unmatched closing ')' near index 2 adz) ^'
!error

# A regex with more than one capturing group is rejected.
SELECT REGEXP_INSTR("abc def ghi", "(^)a(.*)b($)");
Multiple capturing groups (count=3) not allowed in regex input for REGEXP_INSTR
!error

# position is 1-based; 0 is rejected.
SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 0);
Invalid integer input '0' for argument 'position' in REGEXP_INSTR
!error

# A negative occurrence is rejected.
SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 3, -2);
Invalid integer input '-2' for argument 'occurrence' in REGEXP_INSTR
!error

# A negative occurrence_position is rejected.
SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 3, 2, -2);
Invalid integer input '-2' for argument 'occurrence_position' in REGEXP_INSTR
!error

#####################################################################
# REGEXP_SUBSTR(value, regexp[, position[, occurrence]])
#
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_CONTAINS;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT_ALL;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_INSTR;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_REPLACE;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REPEAT;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REVERSE;
Expand Down Expand Up @@ -582,6 +583,9 @@ Builder populate() {
defineReflective(REGEXP_EXTRACT, BuiltInMethod.REGEXP_EXTRACT2.method,
BuiltInMethod.REGEXP_EXTRACT3.method, BuiltInMethod.REGEXP_EXTRACT4.method);
defineReflective(REGEXP_EXTRACT_ALL, BuiltInMethod.REGEXP_EXTRACT_ALL.method);
defineReflective(REGEXP_INSTR, BuiltInMethod.REGEXP_INSTR2.method,
BuiltInMethod.REGEXP_INSTR3.method, BuiltInMethod.REGEXP_INSTR4.method,
BuiltInMethod.REGEXP_INSTR5.method);

map.put(TRIM, new TrimImplementor());

Expand Down
120 changes: 98 additions & 22 deletions core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,8 @@ public static String sha512(ByteString string) {
return DigestUtils.sha512Hex(string.getBytes());
}

/** State for {@code REGEXP_CONTAINS}, {@code REGEXP_REPLACE}, {@code RLIKE}.
/** State for {@code REGEXP_CONTAINS}, {@code REGEXP_EXTRACT}, {@code REGEXP_EXTRACT_ALL},
* {@code REGEXP_INSTR}, {@code REGEXP_REPLACE}, {@code RLIKE}.
*
* <p>Marked deterministic so that the code generator instantiates one once
* per query, not once per row. */
Expand Down Expand Up @@ -404,56 +405,70 @@ private Pattern validateRegexPattern(String regex, String methodName) {
}
}

/** Helper for multiple capturing group regex check in REGEXP_EXTRACT fns. */
/** Rejects a regular expression that declares more than one capturing group.
 *
 * <p>Shared by the REGEXP_* functions. Does nothing when the matcher's pattern
 * has zero or one capturing group; otherwise throws the
 * {@code multipleCapturingGroupsForRegexpExtract} resource error, reporting the
 * group count and {@code methodName}. */
private void checkMultipleCapturingGroupsInRegex(Matcher matcher, String methodName) {
  final int groupCount = matcher.groupCount();
  if (groupCount <= 1) {
    return;
  }
  throw RESOURCE.multipleCapturingGroupsForRegexpExtract(
      Integer.toString(groupCount), methodName).ex();
}

/** Validates the position, occurrence and occurrence_position arguments of a
 * REGEXP_* function call.
 *
 * <p>The checks run in argument order and the first violation is the one
 * reported: {@code position} and {@code occurrence} must each be at least 1,
 * and {@code occurrencePosition} must be 0 or 1 (functions that have no such
 * parameter pass 0). A violation throws the
 * {@code invalidIntegerInputForRegexpFunctions} resource error, naming the
 * offending value, the argument and {@code methodName}.
 *
 * <p>Returns true if {@code position} lies beyond the end of {@code value},
 * i.e. the caller has nothing to search; returns false otherwise. */
private boolean checkPosOccurrenceParamValues(int position,
    int occurrence, int occurrencePosition, String value, String methodName) {
  if (position < 1) {
    throw RESOURCE.invalidIntegerInputForRegexpFunctions(
        Integer.toString(position), "position", methodName).ex();
  }
  if (occurrence < 1) {
    throw RESOURCE.invalidIntegerInputForRegexpFunctions(
        Integer.toString(occurrence), "occurrence", methodName).ex();
  }
  final boolean startOrEnd = occurrencePosition == 0 || occurrencePosition == 1;
  if (!startOrEnd) {
    throw RESOURCE.invalidIntegerInputForRegexpFunctions(
        Integer.toString(occurrencePosition), "occurrence_position", methodName).ex();
  }
  // All arguments are well-formed; the only remaining question is whether the
  // starting position falls past the end of the input.
  return position > value.length();
}

/** SQL {@code REGEXP_CONTAINS(value, regexp)} function.
* Throws a runtime exception for invalid regular expressions.*/
* Throws a runtime exception for invalid regular expressions. */
public boolean regexpContains(String value, String regex) {
// Obtain the Pattern through the shared validator, so that a bad regex is
// reported against REGEXP_CONTAINS.
final Pattern pattern = validateRegexPattern(regex, "REGEXP_CONTAINS");
// find() succeeds on a match anywhere in value; the whole string need not match.
return pattern.matcher(value).find();
}

/** SQL {@code REGEXP_EXTRACT(value, regexp)} function.
* Returns NULL if there is no match. Returns an exception if regex is invalid.
* Uses position=1 and occurrence=1 as default values when not specified. */
* Returns NULL if there is no match. Returns an exception if regex is invalid.
* Uses position=1 and occurrence=1 as default values when not specified. */
public @Nullable String regexpExtract(String value, String regex) {
// Delegate to the four-argument overload with position=1 and occurrence=1.
return regexpExtract(value, regex, 1, 1);
}

/** SQL {@code REGEXP_EXTRACT(value, regexp, position)} function.
* Returns NULL if there is no match, or if position is beyond range.
* Returns an exception if regex or position is invalid.
* Uses occurrence=1 as default value when not specified. */
* Returns NULL if there is no match, or if position is beyond range.
* Returns an exception if regex or position is invalid.
* Uses occurrence=1 as default value when not specified. */
public @Nullable String regexpExtract(String value, String regex, int position) {
// Delegate to the four-argument overload with occurrence=1.
return regexpExtract(value, regex, position, 1);
}

/** SQL {@code REGEXP_EXTRACT(value, regexp, position, occurrence)} function.
* Returns NULL if there is no match, or if position or occurrence are beyond range.
* Returns an exception if regex, position or occurrence are invalid. */
* Returns NULL if there is no match, or if position or occurrence are beyond range.
* Returns an exception if regex, position or occurrence are invalid. */
public @Nullable String regexpExtract(String value, String regex, int position,
int occurrence) {
// Uses java.util.regex as a standard for regex processing
// in Calcite instead of RE2 used by BigQuery/GoogleSQL
final String methodName = "REGEXP_EXTRACT";
final Pattern pattern = validateRegexPattern(regex, methodName);

if (position <= 0) {
throw RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(position),
"position", methodName).ex();
}
if (occurrence <= 0) {
throw RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(occurrence),
"occurrence", methodName).ex();
}

if (position > value.length()) {
if (checkPosOccurrenceParamValues(position, occurrence, 0, value, methodName)) {
return null;
}

Expand All @@ -475,7 +490,7 @@ public boolean regexpContains(String value, String regex) {
}

/** SQL {@code REGEXP_EXTRACT_ALL(value, regexp)} function.
* Returns an empty array if there is no match, returns an exception if regex is invalid.*/
* Returns an empty array if there is no match, returns an exception if regex is invalid. */
public List<String> regexpExtractAll(String value, String regex) {
// Uses java.util.regex as a standard for regex processing
// in Calcite instead of RE2 used by BigQuery/GoogleSQL
Expand All @@ -488,13 +503,74 @@ public List<String> regexpExtractAll(String value, String regex) {
ImmutableList.Builder<String> matches = ImmutableList.builder();
while (matcher.find()) {
String match = matcher.group(matcher.groupCount());
if (match != null && !match.isEmpty()) {
if (match != null) {
matches.add(match);
}
}
return matches.build();
}

/** SQL {@code REGEXP_INSTR(value, regexp)} function.
* Returns the 1-based position of the first match, or 0 if there is no match or
* the regex is empty. Throws an exception if the regex is invalid.
* Delegates to the five-argument overload with position=1, occurrence=1 and
* occurrencePosition=0. */
public int regexpInstr(String value, String regex) {
return regexpInstr(value, regex, 1, 1, 0);
}

/** SQL {@code REGEXP_INSTR(value, regexp, position)} function.
* Returns the 1-based position of the first match found at or after
* {@code position}, or 0 if there is no match, the regex is empty, or position
* is beyond the end of value. Throws an exception if the regex or position is
* invalid. Delegates to the five-argument overload with occurrence=1 and
* occurrencePosition=0. */
public int regexpInstr(String value, String regex, int position) {
return regexpInstr(value, regex, position, 1, 0);
}

/** SQL {@code REGEXP_INSTR(value, regexp, position, occurrence)} function.
* Returns the 1-based position of the {@code occurrence}-th match found at or
* after {@code position}, or 0 if there is no such match, the regex is empty,
* or position is beyond the end of value. Throws an exception if the regex,
* position or occurrence is invalid. Delegates to the five-argument overload
* with occurrencePosition=0. */
public int regexpInstr(String value, String regex, int position,
int occurrence) {
return regexpInstr(value, regex, position, occurrence, 0);
}

/** SQL {@code REGEXP_INSTR(value, regexp, position, occurrence, occurrencePosition)}
 * function.
 *
 * <p>Searches {@code value}, starting at the 1-based {@code position}, for the
 * {@code occurrence}-th match of {@code regex}. With occurrencePosition=0 the
 * result is the 1-based index at which the match starts; with
 * occurrencePosition=1 it is the 1-based index just after the match ends. If
 * the regex has a capturing group, the group's bounds are reported rather than
 * those of the whole match.
 *
 * <p>Returns 0 if there is no such match, the regex is empty, or position is
 * beyond the end of value. Throws an exception if the regex, position,
 * occurrence or occurrencePosition is invalid. */
public int regexpInstr(String value, String regex, int position,
    int occurrence, int occurrencePosition) {
  // Uses java.util.regex as a standard for regex processing
  // in Calcite instead of RE2 used by BigQuery/GoogleSQL
  final String methodName = "REGEXP_INSTR";
  final Pattern pattern = validateRegexPattern(regex, methodName);

  // Argument validation runs (and may throw) before the empty-regex shortcut.
  final boolean positionOutOfRange =
      checkPosOccurrenceParamValues(position, occurrence, occurrencePosition,
          value, methodName);
  if (positionOutOfRange || regex.isEmpty()) {
    return 0;
  }

  final Matcher matcher = pattern.matcher(value);
  checkMultipleCapturingGroupsInRegex(matcher, methodName);
  // Restrict the search to [position - 1, end); reported indexes stay
  // relative to the start of value.
  matcher.region(position - 1, value.length());

  // Step through matches until the requested occurrence is reached; running
  // out of matches first means "not found".
  for (int remaining = occurrence; remaining > 0; remaining--) {
    if (!matcher.find()) {
      return 0;
    }
  }

  // groupCount() is 0 (whole match) or 1 (the single capturing group).
  final int group = matcher.groupCount();
  final int zeroBased =
      occurrencePosition == 0 ? matcher.start(group) : matcher.end(group);
  return zeroBased + 1;
}

/** SQL {@code REGEXP_REPLACE} function with 3 arguments. */
public String regexpReplace(String s, String regex,
String replacement) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,14 @@ static RelDataType deriveTypeSplit(SqlOperatorBinding operatorBinding,
OperandTypes.STRING_STRING,
SqlFunctionCategory.STRING);

/** The "REGEXP_INSTR(value, regexp[, position[, occurrence[, occurrence_position]]])"
* function. Returns the lowest 1-based position of a regexp in value.
* Returns 0 if there is no match. */
@LibraryOperator(libraries = {BIG_QUERY})
public static final SqlBasicFunction REGEXP_INSTR =
SqlBasicFunction.create("REGEXP_INSTR", ReturnTypes.INTEGER_NULLABLE,
OperandTypes.STRING_STRING_OPTIONAL_INTEGER_OPTIONAL_INTEGER_OPTIONAL_INTEGER,
SqlFunctionCategory.STRING);

/** The "REGEXP_REPLACE" function, backed by {@code SqlRegexpReplaceFunction}
* and enabled in the MySQL and Oracle libraries. */
@LibraryOperator(libraries = {MYSQL, ORACLE})
public static final SqlFunction REGEXP_REPLACE = new SqlRegexpReplaceFunction();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,12 @@ public static SqlSingleOperandTypeChecker same(int operandCount,
ImmutableList.of(SqlTypeFamily.STRING, SqlTypeFamily.STRING, SqlTypeFamily.INTEGER,
SqlTypeFamily.INTEGER), i -> i == 2 || i == 3);

/** Operand type checker for calls shaped like
 * {@code (STRING, STRING[, INTEGER[, INTEGER[, INTEGER]]])}.
 *
 * <p>The predicate selects the operands at indexes 2, 3 and 4, which, going by
 * the constant's name, are the optional ones. */
public static final SqlSingleOperandTypeChecker
    STRING_STRING_OPTIONAL_INTEGER_OPTIONAL_INTEGER_OPTIONAL_INTEGER =
    family(
        ImmutableList.of(
            SqlTypeFamily.STRING,
            SqlTypeFamily.STRING,
            SqlTypeFamily.INTEGER,
            SqlTypeFamily.INTEGER,
            SqlTypeFamily.INTEGER),
        i -> i >= 2 && i <= 4);

/** Operand type checker built from the STRING and INTEGER type families, in that order. */
public static final SqlSingleOperandTypeChecker STRING_INTEGER =
family(SqlTypeFamily.STRING, SqlTypeFamily.INTEGER);

Expand Down
Loading

0 comments on commit 9e3ea96

Please sign in to comment.