Skip to content

Commit

Permalink
[CALCITE-6309] Add REGEXP_LIKE function (enabled in Postgres library)
Browse files Browse the repository at this point in the history
PostgreSQL supports a 3-argument REGEXP_LIKE variant that takes in
a string of regex flags.
  • Loading branch information
jduo committed Mar 12, 2024
1 parent b412fa4 commit 3de107d
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT_ALL;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_INSTR;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_LIKE;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_LIKE_PG;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_REPLACE;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REPEAT;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REVERSE;
Expand Down Expand Up @@ -601,6 +602,7 @@ Builder populate() {
BuiltInMethod.PARSE_URL3.method);
defineReflective(REGEXP, BuiltInMethod.RLIKE.method);
defineReflective(REGEXP_LIKE, BuiltInMethod.RLIKE.method);
// PostgreSQL REGEXP_LIKE: RLIKE handles (value, regexp), REGEXP_LIKE3 adds flags.
defineReflective(REGEXP_LIKE_PG, BuiltInMethod.RLIKE.method,
    BuiltInMethod.REGEXP_LIKE3.method);

Check failure on line 605 in core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java

View workflow job for this annotation

GitHub Actions / Linux (JDK 19)

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 102).

Check failure on line 605 in core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java

View workflow job for this annotation

GitHub Actions / Linux (JDK 8), oldest Guava, America/New_York Timezone

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 102).

Check failure on line 605 in core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java

View workflow job for this annotation

GitHub Actions / Linux (JDK 11), Pacific/Chatham Timezone

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 102).

Check failure on line 605 in core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java

View workflow job for this annotation

GitHub Actions / Linux (JDK 17)

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 102).

Check failure on line 605 in core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java

View workflow job for this annotation

GitHub Actions / Linux (JDK 11), Avatica main

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 102).

Check failure on line 605 in core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java

View workflow job for this annotation

GitHub Actions / Linux (JDK 8), latest Guava, America/New_York Timezone

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 102).

Check failure on line 605 in core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java

View workflow job for this annotation

GitHub Actions / macOS (JDK 19)

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 102).
defineReflective(REGEXP_CONTAINS, BuiltInMethod.REGEXP_CONTAINS.method);
defineReflective(REGEXP_EXTRACT, BuiltInMethod.REGEXP_EXTRACT2.method,
BuiltInMethod.REGEXP_EXTRACT3.method, BuiltInMethod.REGEXP_EXTRACT4.method);
Expand Down
19 changes: 13 additions & 6 deletions core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
Original file line number Diff line number Diff line change
Expand Up @@ -400,11 +400,11 @@ Pattern toPattern() {

/** Validate regex arguments in REGEXP_* fns, throws an exception
* for invalid regex patterns, else returns a Pattern object. */
private Pattern validateRegexPattern(String regex, String methodName) {
private Pattern validateRegexPattern(String regex, String methodName, int flags) {
try {
// Uses java.util.regex as a standard for regex processing
// in Calcite instead of RE2 used by BigQuery/GoogleSQL
return cache.getUnchecked(new Key(0, regex));
return cache.getUnchecked(new Key(flags, regex));
} catch (UncheckedExecutionException e) {
if (e.getCause() instanceof PatternSyntaxException) {
throw RESOURCE.invalidRegexInputForRegexpFunctions(
Expand Down Expand Up @@ -482,10 +482,17 @@ public static String replaceNonDollarIndexedString(String replacement) {
/** SQL {@code REGEXP_CONTAINS(value, regexp)} function.
* Throws a runtime exception for invalid regular expressions. */
public boolean regexpContains(String value, String regex) {
final Pattern pattern = validateRegexPattern(regex, "REGEXP_CONTAINS");
final Pattern pattern = validateRegexPattern(regex, "REGEXP_CONTAINS", 0);
return pattern.matcher(value).find();
}

/** SQL {@code REGEXP_LIKE(value, regexp, flags)} function.
 *
 * <p>Returns whether {@code regex}, compiled with the options encoded in
 * {@code stringFlags}, matches any part of {@code value}.
 * Throws a runtime exception for invalid regular expressions. */
public boolean regexpLike(String value, String regex, String stringFlags) {
  // Translate the flag letters first, then fetch the (cached) compiled pattern.
  final int flags = makeRegexpFlags(stringFlags);
  final Pattern compiled = validateRegexPattern(regex, "REGEXP_LIKE", flags);
  final Matcher matcher = compiled.matcher(value);
  return matcher.find();
}
/** SQL {@code REGEXP_EXTRACT(value, regexp)} function.
* Returns NULL if there is no match. Returns an exception if regex is invalid.
* Uses position=1 and occurrence=1 as default values when not specified. */
Expand All @@ -509,7 +516,7 @@ public boolean regexpContains(String value, String regex) {
// Uses java.util.regex as a standard for regex processing
// in Calcite instead of RE2 used by BigQuery/GoogleSQL
final String methodName = "REGEXP_EXTRACT";
final Pattern pattern = validateRegexPattern(regex, methodName);
final Pattern pattern = validateRegexPattern(regex, methodName, 0);

if (!validatePosOccurrenceParamValues(position, occurrence, 0, value, methodName)) {
return null;
Expand Down Expand Up @@ -538,7 +545,7 @@ public List<String> regexpExtractAll(String value, String regex) {
// Uses java.util.regex as a standard for regex processing
// in Calcite instead of RE2 used by BigQuery/GoogleSQL
final String methodName = "REGEXP_EXTRACT_ALL";
final Pattern regexp = validateRegexPattern(regex, methodName);
final Pattern regexp = validateRegexPattern(regex, methodName, 0);

Matcher matcher = regexp.matcher(value);
checkMultipleCapturingGroupsInRegex(matcher, methodName);
Expand Down Expand Up @@ -586,7 +593,7 @@ public int regexpInstr(String value, String regex, int position,
// Uses java.util.regex as a standard for regex processing
// in Calcite instead of RE2 used by BigQuery/GoogleSQL
final String methodName = "REGEXP_INSTR";
final Pattern pattern = validateRegexPattern(regex, methodName);
final Pattern pattern = validateRegexPattern(regex, methodName, 0);

if (regex.isEmpty()
|| !validatePosOccurrenceParamValues(position, occurrence,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,13 @@ static RelDataType deriveTypeSplit(SqlOperatorBinding operatorBinding,
OperandTypes.STRING_STRING,
SqlFunctionCategory.STRING);

/** The "REGEXP_LIKE(value, regexp [, flags])" function for PostgreSQL.
 *
 * <p>With two arguments it is equivalent to {@link #RLIKE}; the optional
 * third argument is a string of regex flags. */
@LibraryOperator(libraries = {POSTGRESQL})
public static final SqlFunction REGEXP_LIKE_PG =
SqlBasicFunction.create("REGEXP_LIKE", ReturnTypes.BOOLEAN_NULLABLE,
OperandTypes.STRING_STRING_OPTIONAL_STRING,
SqlFunctionCategory.STRING);

@LibraryOperator(libraries = {MYSQL})
public static final SqlFunction COMPRESS =
SqlBasicFunction.create("COMPRESS",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ public enum BuiltInMethod {
String.class, String.class, int.class, int.class),
REGEXP_INSTR5(SqlFunctions.RegexFunction.class, "regexpInstr",
String.class, String.class, int.class, int.class, int.class),
// Three-argument REGEXP_LIKE(value, regexp, flags).
REGEXP_LIKE3(SqlFunctions.RegexFunction.class, "regexpLike",
    String.class, String.class, String.class),

Check failure on line 566 in core/src/main/java/org/apache/calcite/util/BuiltInMethod.java

View workflow job for this annotation

GitHub Actions / Linux (JDK 19)

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 105).

Check failure on line 566 in core/src/main/java/org/apache/calcite/util/BuiltInMethod.java

View workflow job for this annotation

GitHub Actions / Linux (JDK 8), oldest Guava, America/New_York Timezone

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 105).

Check failure on line 566 in core/src/main/java/org/apache/calcite/util/BuiltInMethod.java

View workflow job for this annotation

GitHub Actions / Linux (JDK 11), Pacific/Chatham Timezone

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 105).

Check failure on line 566 in core/src/main/java/org/apache/calcite/util/BuiltInMethod.java

View workflow job for this annotation

GitHub Actions / Linux (JDK 17)

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 105).

Check failure on line 566 in core/src/main/java/org/apache/calcite/util/BuiltInMethod.java

View workflow job for this annotation

GitHub Actions / Linux (JDK 11), Avatica main

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 105).

Check failure on line 566 in core/src/main/java/org/apache/calcite/util/BuiltInMethod.java

View workflow job for this annotation

GitHub Actions / Linux (JDK 8), latest Guava, America/New_York Timezone

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 105).

Check failure on line 566 in core/src/main/java/org/apache/calcite/util/BuiltInMethod.java

View workflow job for this annotation

GitHub Actions / macOS (JDK 19)

[Task :core:checkstyleMain] [LineLength] Line is longer than 100 characters (found 105).
REGEXP_REPLACE3(SqlFunctions.RegexFunction.class, "regexpReplace",
String.class, String.class, String.class),
REGEXP_REPLACE4(SqlFunctions.RegexFunction.class, "regexpReplace",
Expand Down
2 changes: 1 addition & 1 deletion site/_docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -2820,7 +2820,7 @@ In the following:
| b | REGEXP_EXTRACT(string, regexp [, position [, occurrence]]) | Returns the substring in *string* that matches the *regexp*, starting search at *position* (default 1), and until locating the nth *occurrence* (default 1). Returns NULL if there is no match
| b | REGEXP_EXTRACT_ALL(string, regexp) | Returns an array of all substrings in *string* that matches the *regexp*. Returns an empty array if there is no match
| b | REGEXP_INSTR(string, regexp [, position [, occurrence [, occurrence_position]]]) | Returns the lowest 1-based position of the substring in *string* that matches the *regexp*, starting search at *position* (default 1), and until locating the nth *occurrence* (default 1). Setting occurrence_position (default 0) to 1 returns the end position of substring + 1. Returns 0 if there is no match
| s | REGEXP_LIKE(string, regexp) | Equivalent to `string1 RLIKE string2`
| s p | REGEXP_LIKE(string, regexp [, flags]) | Equivalent to `string RLIKE regexp`. The optional *flags* argument is only supported in the PostgreSQL library. Supported flags are: <ul><li>i: case-insensitive matching</li><li>c: case-sensitive matching</li><li>n: newline-sensitive matching</li></ul>
| b m o | REGEXP_REPLACE(string, regexp, rep [, pos [, occurrence [, matchType]]]) | Replaces all substrings of *string* that match *regexp* with *rep* at the starting *pos* in expr (if omitted, the default is 1), *occurrence* specifies which occurrence of a match to search for (if omitted, the default is 1), *matchType* specifies how to perform matching
| b | REGEXP_SUBSTR(string, regexp [, position [, occurrence]]) | Synonym for REGEXP_EXTRACT
| b m p s | REPEAT(string, integer) | Returns a string consisting of *string* repeated of *integer* times; returns an empty string if *integer* is less than 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3575,6 +3575,7 @@ void checkIsNull(SqlOperatorFixture f, SqlOperator operator) {
checkRlikeFunc(f, SqlLibrary.SPARK, SqlLibraryOperators.RLIKE);
checkRlikeFunc(f, SqlLibrary.SPARK, SqlLibraryOperators.REGEXP);
checkRlikeFunc(f, SqlLibrary.SPARK, SqlLibraryOperators.REGEXP_LIKE);
checkRlikeFunc(f, SqlLibrary.POSTGRESQL, SqlLibraryOperators.REGEXP_LIKE_PG);
checkNotRlikeFunc(f.withLibrary(SqlLibrary.HIVE));
checkNotRlikeFunc(f.withLibrary(SqlLibrary.SPARK));
checkRlikeFails(f.withLibrary(SqlLibrary.MYSQL));
Expand Down

0 comments on commit 3de107d

Please sign in to comment.