From 3de107d4c2bfe85194d42f6f30251ff7c71f9cb3 Mon Sep 17 00:00:00 2001 From: James Duong Date: Tue, 12 Mar 2024 13:33:30 -0700 Subject: [PATCH] [CALCITE-6309] Add REGEXP_LIKE function (enabled in Postgres library) PostgreSQL supports a 3-argument REGEXP_LIKE variant that takes in a string of regex flags. --- .../adapter/enumerable/RexImpTable.java | 2 ++ .../apache/calcite/runtime/SqlFunctions.java | 19 +++++++++++++------ .../calcite/sql/fun/SqlLibraryOperators.java | 7 +++++++ .../apache/calcite/util/BuiltInMethod.java | 1 + site/_docs/reference.md | 2 +- .../apache/calcite/test/SqlOperatorTest.java | 1 + 6 files changed, 25 insertions(+), 7 deletions(-) diff --git a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java index 1bc88f18c43f..9624c46e3d00 100644 --- a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java +++ b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java @@ -238,6 +238,7 @@ import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT_ALL; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_INSTR; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_LIKE; +import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_LIKE_PG; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_REPLACE; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REPEAT; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REVERSE; @@ -601,6 +602,7 @@ Builder populate() { BuiltInMethod.PARSE_URL3.method); defineReflective(REGEXP, BuiltInMethod.RLIKE.method); defineReflective(REGEXP_LIKE, BuiltInMethod.RLIKE.method); + defineReflective(REGEXP_LIKE_PG, BuiltInMethod.RLIKE.method, BuiltInMethod.REGEXP_LIKE3.method); defineReflective(REGEXP_CONTAINS, BuiltInMethod.REGEXP_CONTAINS.method); defineReflective(REGEXP_EXTRACT, 
BuiltInMethod.REGEXP_EXTRACT2.method, BuiltInMethod.REGEXP_EXTRACT3.method, BuiltInMethod.REGEXP_EXTRACT4.method); diff --git a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java index 7ba962eacaf9..2d1b903bfeba 100644 --- a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java +++ b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java @@ -400,11 +400,11 @@ Pattern toPattern() { /** Validate regex arguments in REGEXP_* fns, throws an exception * for invalid regex patterns, else returns a Pattern object. */ - private Pattern validateRegexPattern(String regex, String methodName) { + private Pattern validateRegexPattern(String regex, String methodName, int flags) { try { // Uses java.util.regex as a standard for regex processing // in Calcite instead of RE2 used by BigQuery/GoogleSQL - return cache.getUnchecked(new Key(0, regex)); + return cache.getUnchecked(new Key(flags, regex)); } catch (UncheckedExecutionException e) { if (e.getCause() instanceof PatternSyntaxException) { throw RESOURCE.invalidRegexInputForRegexpFunctions( @@ -482,10 +482,17 @@ public static String replaceNonDollarIndexedString(String replacement) { /** SQL {@code REGEXP_CONTAINS(value, regexp)} function. * Throws a runtime exception for invalid regular expressions. */ public boolean regexpContains(String value, String regex) { - final Pattern pattern = validateRegexPattern(regex, "REGEXP_CONTAINS"); + final Pattern pattern = validateRegexPattern(regex, "REGEXP_CONTAINS", 0); return pattern.matcher(value).find(); } + /** SQL {@code REGEXP_LIKE(value, regexp, flags)} function. + * Throws a runtime exception for invalid regular expressions. 
*/ + public boolean regexpLike(String value, String regex, String stringFlags) { + final Pattern pattern = + validateRegexPattern(regex, "REGEXP_LIKE", makeRegexpFlags(stringFlags)); + return pattern.matcher(value).find(); + } /** SQL {@code REGEXP_EXTRACT(value, regexp)} function. * Returns NULL if there is no match. Returns an exception if regex is invalid. * Uses position=1 and occurrence=1 as default values when not specified. */ @@ -509,7 +516,7 @@ public boolean regexpContains(String value, String regex) { // Uses java.util.regex as a standard for regex processing // in Calcite instead of RE2 used by BigQuery/GoogleSQL final String methodName = "REGEXP_EXTRACT"; - final Pattern pattern = validateRegexPattern(regex, methodName); + final Pattern pattern = validateRegexPattern(regex, methodName, 0); if (!validatePosOccurrenceParamValues(position, occurrence, 0, value, methodName)) { return null; @@ -538,7 +545,7 @@ public List regexpExtractAll(String value, String regex) { // Uses java.util.regex as a standard for regex processing // in Calcite instead of RE2 used by BigQuery/GoogleSQL final String methodName = "REGEXP_EXTRACT_ALL"; - final Pattern regexp = validateRegexPattern(regex, methodName); + final Pattern regexp = validateRegexPattern(regex, methodName, 0); Matcher matcher = regexp.matcher(value); checkMultipleCapturingGroupsInRegex(matcher, methodName); @@ -586,7 +593,7 @@ public int regexpInstr(String value, String regex, int position, // Uses java.util.regex as a standard for regex processing // in Calcite instead of RE2 used by BigQuery/GoogleSQL final String methodName = "REGEXP_INSTR"; - final Pattern pattern = validateRegexPattern(regex, methodName); + final Pattern pattern = validateRegexPattern(regex, methodName, 0); if (regex.isEmpty() || !validatePosOccurrenceParamValues(position, occurrence, diff --git a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java 
b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java index 38b5b48f77b6..23939e5cb715 100644 --- a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java +++ b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java @@ -563,6 +563,13 @@ static RelDataType deriveTypeSplit(SqlOperatorBinding operatorBinding, OperandTypes.STRING_STRING, SqlFunctionCategory.STRING); + /** The "REGEXP_LIKE(value, regexp [, flags])" function; {@link #RLIKE} with optional flags. */ + @LibraryOperator(libraries = {POSTGRESQL}) + public static final SqlFunction REGEXP_LIKE_PG = + SqlBasicFunction.create("REGEXP_LIKE", ReturnTypes.BOOLEAN_NULLABLE, + OperandTypes.STRING_STRING_OPTIONAL_STRING, + SqlFunctionCategory.STRING); + @LibraryOperator(libraries = {MYSQL}) public static final SqlFunction COMPRESS = SqlBasicFunction.create("COMPRESS", diff --git a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java index 6280e11fdf2a..1f96c899612f 100644 --- a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java +++ b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java @@ -563,6 +563,7 @@ public enum BuiltInMethod { String.class, String.class, int.class, int.class), REGEXP_INSTR5(SqlFunctions.RegexFunction.class, "regexpInstr", String.class, String.class, int.class, int.class, int.class), + REGEXP_LIKE3(SqlFunctions.RegexFunction.class, "regexpLike", String.class, String.class, String.class), REGEXP_REPLACE3(SqlFunctions.RegexFunction.class, "regexpReplace", String.class, String.class, String.class), REGEXP_REPLACE4(SqlFunctions.RegexFunction.class, "regexpReplace", diff --git a/site/_docs/reference.md b/site/_docs/reference.md index d8712e52ab65..5ad2aeb3d5fc 100644 --- a/site/_docs/reference.md +++ b/site/_docs/reference.md @@ -2820,7 +2820,7 @@ In the following: | b | REGEXP_EXTRACT(string, regexp [, position [, occurrence]]) | Returns the substring in *string* that matches
the *regexp*, starting search at *position* (default 1), and until locating the nth *occurrence* (default 1). Returns NULL if there is no match | b | REGEXP_EXTRACT_ALL(string, regexp) | Returns an array of all substrings in *string* that matches the *regexp*. Returns an empty array if there is no match | b | REGEXP_INSTR(string, regexp [, position [, occurrence [, occurrence_position]]]) | Returns the lowest 1-based position of the substring in *string* that matches the *regexp*, starting search at *position* (default 1), and until locating the nth *occurrence* (default 1). Setting occurrence_position (default 0) to 1 returns the end position of substring + 1. Returns 0 if there is no match -| s | REGEXP_LIKE(string, regexp) | Equivalent to `string1 RLIKE string2` +| p s | REGEXP_LIKE(string, regexp [, flags]) | Equivalent to `string1 RLIKE string2`; the optional *flags* argument, a string of regex flags, is supported only in the PostgreSQL library | b m o | REGEXP_REPLACE(string, regexp, rep [, pos [, occurrence [, matchType]]]) | Replaces all substrings of *string* that match *regexp* with *rep* at the starting *pos* in expr (if omitted, the default is 1), *occurrence* specifies which occurrence of a match to search for (if omitted, the default is 1), *matchType* specifies how to perform matching | b | REGEXP_SUBSTR(string, regexp [, position [, occurrence]]) | Synonym for REGEXP_EXTRACT | b m p s | REPEAT(string, integer) | Returns a string consisting of *string* repeated of *integer* times; returns an empty string if *integer* is less than 1 diff --git a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java index ebf23f290a37..fe8d5fc81494 100644 --- a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java +++ b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java @@ -3575,6 +3575,7 @@ void checkIsNull(SqlOperatorFixture f, SqlOperator operator) { checkRlikeFunc(f, SqlLibrary.SPARK,
SqlLibraryOperators.RLIKE); checkRlikeFunc(f, SqlLibrary.SPARK, SqlLibraryOperators.REGEXP); checkRlikeFunc(f, SqlLibrary.SPARK, SqlLibraryOperators.REGEXP_LIKE); + checkRlikeFunc(f, SqlLibrary.POSTGRESQL, SqlLibraryOperators.REGEXP_LIKE_PG); checkNotRlikeFunc(f.withLibrary(SqlLibrary.HIVE)); checkNotRlikeFunc(f.withLibrary(SqlLibrary.SPARK)); checkRlikeFails(f.withLibrary(SqlLibrary.MYSQL));