From 9e3ea965044e477416189fa07423433aa51b0ae1 Mon Sep 17 00:00:00 2001 From: anthrino Date: Tue, 5 Sep 2023 16:34:24 -0700 Subject: [PATCH] [CALCITE-5978] Add REGEXP_INSTR function (enabled in BigQuery library) --- babel/src/test/resources/sql/big-query.iq | 155 ++++++++++++++++++ .../adapter/enumerable/RexImpTable.java | 4 + .../apache/calcite/runtime/SqlFunctions.java | 120 +++++++++++--- .../calcite/sql/fun/SqlLibraryOperators.java | 8 + .../apache/calcite/sql/type/OperandTypes.java | 6 + .../apache/calcite/util/BuiltInMethod.java | 8 + .../apache/calcite/test/SqlFunctionsTest.java | 77 +++++++++ site/_docs/reference.md | 1 + .../apache/calcite/test/SqlOperatorTest.java | 31 ++++ 9 files changed, 388 insertions(+), 22 deletions(-) diff --git a/babel/src/test/resources/sql/big-query.iq b/babel/src/test/resources/sql/big-query.iq index 2479eb36632..5455de9f578 100755 --- a/babel/src/test/resources/sql/big-query.iq +++ b/babel/src/test/resources/sql/big-query.iq @@ -1244,6 +1244,161 @@ SELECT REGEXP_EXTRACT_ALL("abcadcabcaecghi", "(a.c).(.*)$"); Multiple capturing groups (count=2) not allowed in regex input for REGEXP_EXTRACT_ALL !error +##################################################################### +# REGEXP_INSTR(value, regexp[, position[, occurrence[, occurrence_position]]]) +# +# Returns the lowest 1-based position of regexp in value. +# Returns 0 if there is no match, regex is empty or if position or occurrence are beyond range. +# Returns an exception if regex, position, occurrence or occurrence_position are invalid. + +WITH example AS ( + SELECT 'ab@cd-ef' AS source_value, '@[^-]*' AS regexp UNION ALL + SELECT 'ab@d-ef', '@[^-]*' UNION ALL + SELECT 'abc@cd-ef', '@[^-]*' UNION ALL + SELECT 'abc-ef', '@[^-]*') +SELECT source_value, regexp, REGEXP_INSTR(source_value, regexp) AS instr +FROM example; ++--------------+--------+-------+ +| source_value | regexp | instr | ++--------------+--------+-------+ +| ab@cd-ef | @[^-]* | 3 | +| ab@d-ef | @[^-]* | 3 | +| abc-ef | @[^-]* | 0 | +| abc@cd-ef | @[^-]* | 4 | ++--------------+--------+-------+ +(4 rows) + +!ok + +WITH example AS ( + SELECT 'a@cd-ef b@cd-ef' AS source_value, '@[^-]*' AS regexp, 1 AS position UNION ALL + SELECT 'a@cd-ef b@cd-ef', '@[^-]*', 2 UNION ALL + SELECT 'a@cd-ef b@cd-ef', '@[^-]*', 3 UNION ALL + SELECT 'a@cd-ef b@cd-ef', '@[^-]*', 4) +SELECT + source_value, regexp, position, + REGEXP_INSTR(source_value, regexp, position) AS instr +FROM example; ++-----------------+--------+----------+-------+ +| source_value | regexp | position | instr | ++-----------------+--------+----------+-------+ +| a@cd-ef b@cd-ef | @[^-]* | 1 | 2 | +| a@cd-ef b@cd-ef | @[^-]* | 2 | 2 | +| a@cd-ef b@cd-ef | @[^-]* | 3 | 10 | +| a@cd-ef b@cd-ef | @[^-]* | 4 | 10 | ++-----------------+--------+----------+-------+ +(4 rows) + +!ok + +WITH example AS ( + SELECT 'a@cd-ef b@cd-ef c@cd-ef' AS source_value, + '@[^-]*' AS regexp, 1 AS position, 1 AS occurrence UNION ALL + SELECT 'a@cd-ef b@cd-ef c@cd-ef', '@[^-]*', 1, 2 UNION ALL + SELECT 'a@cd-ef b@cd-ef c@cd-ef', '@[^-]*', 1, 3) +SELECT + source_value, regexp, position, occurrence, + REGEXP_INSTR(source_value, regexp, position, occurrence) AS instr +FROM example; ++-------------------------+--------+----------+------------+-------+ +| source_value | regexp | position | occurrence | instr | ++-------------------------+--------+----------+------------+-------+ +| a@cd-ef b@cd-ef c@cd-ef | @[^-]* | 1 | 1 | 2 | +| a@cd-ef b@cd-ef c@cd-ef | @[^-]* | 1 | 2 | 10 | +| a@cd-ef b@cd-ef c@cd-ef | @[^-]* | 1 | 3 | 18 | ++-------------------------+--------+----------+------------+-------+ +(3 rows) + +!ok + +WITH example AS ( + SELECT 'a@cd-ef' AS source_value, '@[^-]*' AS regexp, + 1 AS position, 1 AS occurrence, 0 AS o_position UNION ALL + SELECT 'a@cd-ef', '@[^-]*', 1, 1, 1) +SELECT + source_value, regexp, position, occurrence, o_position, + REGEXP_INSTR(source_value, regexp, position, occurrence, o_position) AS instr +FROM example; ++--------------+--------+----------+------------+------------+-------+ +| source_value | regexp | position | occurrence | o_position | instr | ++--------------+--------+----------+------------+------------+-------+ +| a@cd-ef | @[^-]* | 1 | 1 | 0 | 2 | +| a@cd-ef | @[^-]* | 1 | 1 | 1 | 5 | ++--------------+--------+----------+------------+------------+-------+ +(2 rows) + +!ok + +SELECT REGEXP_INSTR("abcadcabcaecghi", "a.+c"); ++--------+ +| EXPR$0 | ++--------+ +| 1 | ++--------+ +(1 row) + +!ok + +SELECT REGEXP_INSTR("abcadcabcaecghi", "abc(a.c)", 4); ++--------+ +| EXPR$0 | ++--------+ +| 10 | ++--------+ +(1 row) + +!ok + +SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 25); ++--------+ +| EXPR$0 | ++--------+ +| 0 | ++--------+ +(1 row) + +!ok + +SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 1, 5); ++--------+ +| EXPR$0 | ++--------+ +| 0 | ++--------+ +(1 row) + +!ok + +SELECT REGEXP_INSTR("a9cadca5c4aecghi", "a[0-9]c", 1, 2, 1); ++--------+ +| EXPR$0 | ++--------+ +| 10 | ++--------+ +(1 row) + +!ok + +SELECT REGEXP_INSTR("abc def ghi", "adz)"); +Invalid regular expression for REGEXP_INSTR: 'Unmatched closing ')' near index 2 adz) ^' +!error + +SELECT REGEXP_INSTR("abc def ghi", "(^)a(.*)b($)"); +Multiple capturing groups (count=3) not allowed in regex input for REGEXP_INSTR +!error + +SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 0); +Invalid integer input '0' for argument 'position' in REGEXP_INSTR +!error + +SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 3, -2); +Invalid integer input '-2' for argument 'occurrence' in REGEXP_INSTR +!error + +SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 3, 2, -2); +Invalid integer input '-2' for argument 'occurrence_position' in REGEXP_INSTR +!error + ##################################################################### # REGEXP_SUBSTR(value, regexp[, position[, occurrence]]) # diff --git a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java index 03d7a07c0f6..de84ea7d713 100644 --- a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java +++ b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java @@ -224,6 +224,7 @@ import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_CONTAINS; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT_ALL; +import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_INSTR; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_REPLACE; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REPEAT; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REVERSE; @@ -582,6 +583,9 @@ Builder populate() { defineReflective(REGEXP_EXTRACT, BuiltInMethod.REGEXP_EXTRACT2.method, BuiltInMethod.REGEXP_EXTRACT3.method, BuiltInMethod.REGEXP_EXTRACT4.method); defineReflective(REGEXP_EXTRACT_ALL, BuiltInMethod.REGEXP_EXTRACT_ALL.method); + defineReflective(REGEXP_INSTR, BuiltInMethod.REGEXP_INSTR2.method, + BuiltInMethod.REGEXP_INSTR3.method, BuiltInMethod.REGEXP_INSTR4.method, + BuiltInMethod.REGEXP_INSTR5.method); map.put(TRIM, new TrimImplementor()); diff --git a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java index 835eb068b4a..1b659d6cb38 100644 --- a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java +++ b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java @@ -365,7 +365,8 @@ public static String sha512(ByteString string) { return DigestUtils.sha512Hex(string.getBytes()); } - /** State for {@code REGEXP_CONTAINS}, {@code REGEXP_REPLACE}, {@code RLIKE}. + /** State for {@code REGEXP_CONTAINS}, {@code REGEXP_EXTRACT}, {@code REGEXP_EXTRACT_ALL}, + * {@code REGEXP_INSTR}, {@code REGEXP_REPLACE}, {@code RLIKE}. * *

Marked deterministic so that the code generator instantiates one once * per query, not once per row. */ @@ -404,7 +405,7 @@ private Pattern validateRegexPattern(String regex, String methodName) { } } - /** Helper for multiple capturing group regex check in REGEXP_EXTRACT fns. */ + /** Helper for multiple capturing group regex check in REGEXP_* fns. */ private void checkMultipleCapturingGroupsInRegex(Matcher matcher, String methodName) { if (matcher.groupCount() > 1) { throw RESOURCE.multipleCapturingGroupsForRegexpExtract( @@ -412,31 +413,54 @@ private void checkMultipleCapturingGroupsInRegex(Matcher matcher, String methodN } } + /** Helper for checking values of position and occurrence arguments in REGEXP_* fns. + * Regex fns not using occurrencePosition param pass a default value of 0. + * Throws an exception or returns true in case of failed value checks. */ + private boolean checkPosOccurrenceParamValues(int position, + int occurrence, int occurrencePosition, String value, String methodName) { + if (position <= 0) { + throw RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(position), + "position", methodName).ex(); + } + if (occurrence <= 0) { + throw RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(occurrence), + "occurrence", methodName).ex(); + } + if (occurrencePosition != 0 && occurrencePosition != 1) { + throw RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(occurrencePosition), + "occurrence_position", methodName).ex(); + } + if (position <= value.length()) { + return false; + } + return true; + } + /** SQL {@code REGEXP_CONTAINS(value, regexp)} function. - * Throws a runtime exception for invalid regular expressions.*/ + * Throws a runtime exception for invalid regular expressions. */ public boolean regexpContains(String value, String regex) { final Pattern pattern = validateRegexPattern(regex, "REGEXP_CONTAINS"); return pattern.matcher(value).find(); } /** SQL {@code REGEXP_EXTRACT(value, regexp)} function. - * Returns NULL if there is no match. Returns an exception if regex is invalid. - * Uses position=1 and occurrence=1 as default values when not specified. */ + * Returns NULL if there is no match. Returns an exception if regex is invalid. + * Uses position=1 and occurrence=1 as default values when not specified. */ public @Nullable String regexpExtract(String value, String regex) { return regexpExtract(value, regex, 1, 1); } /** SQL {@code REGEXP_EXTRACT(value, regexp, position)} function. - * Returns NULL if there is no match, or if position is beyond range. - * Returns an exception if regex or position is invalid. - * Uses occurrence=1 as default value when not specified. */ + * Returns NULL if there is no match, or if position is beyond range. + * Returns an exception if regex or position is invalid. + * Uses occurrence=1 as default value when not specified. */ public @Nullable String regexpExtract(String value, String regex, int position) { return regexpExtract(value, regex, position, 1); } /** SQL {@code REGEXP_EXTRACT(value, regexp, position, occurrence)} function. - * Returns NULL if there is no match, or if position or occurrence are beyond range. - * Returns an exception if regex, position or occurrence are invalid. */ + * Returns NULL if there is no match, or if position or occurrence are beyond range. + * Returns an exception if regex, position or occurrence are invalid. */ public @Nullable String regexpExtract(String value, String regex, int position, int occurrence) { // Uses java.util.regex as a standard for regex processing @@ -444,16 +468,7 @@ public boolean regexpContains(String value, String regex) { final String methodName = "REGEXP_EXTRACT"; final Pattern pattern = validateRegexPattern(regex, methodName); - if (position <= 0) { - throw RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(position), - "position", methodName).ex(); - } - if (occurrence <= 0) { - throw RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(occurrence), - "occurrence", methodName).ex(); - } - - if (position > value.length()) { + if (checkPosOccurrenceParamValues(position, occurrence, 0, value, methodName)) { return null; } @@ -475,7 +490,7 @@ public boolean regexpContains(String value, String regex) { } /** SQL {@code REGEXP_EXTRACT_ALL(value, regexp)} function. - * Returns an empty array if there is no match, returns an exception if regex is invalid.*/ + * Returns an empty array if there is no match, returns an exception if regex is invalid. */ public List regexpExtractAll(String value, String regex) { // Uses java.util.regex as a standard for regex processing // in Calcite instead of RE2 used by BigQuery/GoogleSQL @@ -488,13 +503,74 @@ public List regexpExtractAll(String value, String regex) { ImmutableList.Builder matches = ImmutableList.builder(); while (matcher.find()) { String match = matcher.group(matcher.groupCount()); - if (match != null && !match.isEmpty()) { + if (match != null) { matches.add(match); } } return matches.build(); } + /** SQL {@code REGEXP_INSTR(value, regexp)} function. + * Returns 0 if there is no match or regex is empty. Returns an exception if regex is invalid. + * Uses position=1, occurrence=1, occurrencePosition=0 as default values if not specified. */ + public int regexpInstr(String value, String regex) { + return regexpInstr(value, regex, 1, 1, 0); + } + + /** SQL {@code REGEXP_INSTR(value, regexp, position)} function. + * Returns 0 if there is no match, regex is empty, or if position is beyond range. + * Returns an exception if regex or position is invalid. + * Uses occurrence=1, occurrencePosition=0 as default value when not specified. */ + public int regexpInstr(String value, String regex, int position) { + return regexpInstr(value, regex, position, 1, 0); + } + + /** SQL {@code REGEXP_INSTR(value, regexp, position, occurrence)} function. + * Returns 0 if there is no match, regex is empty, or if position or occurrence + * are beyond range. Returns an exception if regex, position or occurrence are invalid. + * Uses occurrencePosition=0 as default value when not specified. */ + public int regexpInstr(String value, String regex, int position, + int occurrence) { + return regexpInstr(value, regex, position, occurrence, 0); + } + + /** SQL {@code REGEXP_INSTR(value, regexp, position, occurrence, occurrencePosition)} + * function. Returns 0 if there is no match, regex is empty, or if position or occurrence + * are beyond range. Returns an exception if regex, position, occurrence + * or occurrencePosition are invalid. */ + public int regexpInstr(String value, String regex, int position, + int occurrence, int occurrencePosition) { + // Uses java.util.regex as a standard for regex processing + // in Calcite instead of RE2 used by BigQuery/GoogleSQL + final String methodName = "REGEXP_INSTR"; + final Pattern pattern = validateRegexPattern(regex, methodName); + + if (checkPosOccurrenceParamValues(position, occurrence, occurrencePosition, value, + methodName) || regex.isEmpty()) { + return 0; + } + + Matcher matcher = pattern.matcher(value); + checkMultipleCapturingGroupsInRegex(matcher, methodName); + matcher.region(position - 1, value.length()); + + int matchIndex = 0; + while (occurrence > 0) { + if (matcher.find()) { + if (occurrencePosition == 0) { + matchIndex = matcher.start(matcher.groupCount()) + 1; + } else { + matchIndex = matcher.end(matcher.groupCount()) + 1; + } + } else { + return 0; + } + occurrence--; + } + + return matchIndex; + } + /** SQL {@code REGEXP_REPLACE} function with 3 arguments. */ public String regexpReplace(String s, String regex, String replacement) { diff --git a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java index 43f5f6b9b4d..f850ab1cc06 100644 --- a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java +++ b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java @@ -500,6 +500,14 @@ static RelDataType deriveTypeSplit(SqlOperatorBinding operatorBinding, OperandTypes.STRING_STRING, SqlFunctionCategory.STRING); + /** The "REGEXP_INSTR(value, regexp [, position[, occurrence, [occurrence_position]]])" function. + * Returns the lowest 1-based position of a regexp in value. Returns NULL if there is no match. */ + @LibraryOperator(libraries = {BIG_QUERY}) + public static final SqlBasicFunction REGEXP_INSTR = + SqlBasicFunction.create("REGEXP_INSTR", ReturnTypes.INTEGER_NULLABLE, + OperandTypes.STRING_STRING_OPTIONAL_INTEGER_OPTIONAL_INTEGER_OPTIONAL_INTEGER, + SqlFunctionCategory.STRING); + @LibraryOperator(libraries = {MYSQL, ORACLE}) public static final SqlFunction REGEXP_REPLACE = new SqlRegexpReplaceFunction(); diff --git a/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java b/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java index 8f77ddedd96..8f1d6facdc6 100644 --- a/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java +++ b/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java @@ -867,6 +867,12 @@ public static SqlSingleOperandTypeChecker same(int operandCount, ImmutableList.of(SqlTypeFamily.STRING, SqlTypeFamily.STRING, SqlTypeFamily.INTEGER, SqlTypeFamily.INTEGER), i -> i == 2 || i == 3); + public static final SqlSingleOperandTypeChecker + STRING_STRING_OPTIONAL_INTEGER_OPTIONAL_INTEGER_OPTIONAL_INTEGER = + family( + ImmutableList.of(SqlTypeFamily.STRING, SqlTypeFamily.STRING, SqlTypeFamily.INTEGER, + SqlTypeFamily.INTEGER, SqlTypeFamily.INTEGER), i -> i == 2 || i == 3 || i == 4); + public static final SqlSingleOperandTypeChecker STRING_INTEGER = family(SqlTypeFamily.STRING, SqlTypeFamily.INTEGER); diff --git a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java index 19c5228e09b..4652ca6bab8 100644 --- a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java +++ b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java @@ -534,6 +534,14 @@ public enum BuiltInMethod { String.class, String.class, int.class, int.class), REGEXP_EXTRACT_ALL(SqlFunctions.RegexFunction.class, "regexpExtractAll", String.class, String.class), + REGEXP_INSTR2(SqlFunctions.RegexFunction.class, "regexpInstr", + String.class, String.class), + REGEXP_INSTR3(SqlFunctions.RegexFunction.class, "regexpInstr", + String.class, String.class, int.class), + REGEXP_INSTR4(SqlFunctions.RegexFunction.class, "regexpInstr", + String.class, String.class, int.class, int.class), + REGEXP_INSTR5(SqlFunctions.RegexFunction.class, "regexpInstr", + String.class, String.class, int.class, int.class, int.class), REGEXP_REPLACE3(SqlFunctions.RegexFunction.class, "regexpReplace", String.class, String.class, String.class), REGEXP_REPLACE4(SqlFunctions.RegexFunction.class, "regexpReplace", diff --git a/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java b/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java index 52a33b0e8fe..f31a1829799 100644 --- a/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java +++ b/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java @@ -240,6 +240,7 @@ static List list() { assertThat(f.regexpContains("abcdef", "abz*"), is(true)); assertThat(f.regexpContains("zabzz", "abz*"), is(true)); assertThat(f.regexpContains("zazbbzz", "abz*"), is(false)); + assertThat(f.regexpContains("abcadcabcaecghi", ""), is(true)); try { final boolean b = f.regexpContains("abc def ghi", "(abc"); @@ -274,6 +275,7 @@ static List list() { // basic extracts assertThat(f.regexpExtract("abcadcabcaecghi", "ac"), nullValue()); + assertThat(f.regexpExtract("abcadcabcaecghi", ""), is("")); assertThat(f.regexpExtract("a9cadca5c4aecghi", "a[0-9]c"), is("a9c")); assertThat(f.regexpExtract("abcadcabcaecghi", "a.*c"), is("abcadcabcaec")); @@ -342,6 +344,7 @@ static List list() { final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction(); assertThat(f.regexpExtractAll("abcadcabcaecghi", "ac"), is(list())); + assertThat(f.regexpExtractAll("abcadc", ""), is(list("", "", "", "", "", "", ""))); assertThat(f.regexpExtractAll("abcadcabcaecghi", "abc(a.c)"), is(list("adc", "aec"))); assertThat(f.regexpExtractAll("abcadcabcaecghi", "a.c"), is(list("abc", "adc", "abc", "aec"))); assertThat(f.regexpExtractAll("banana", "ana"), is(list("ana"))); @@ -367,6 +370,80 @@ static List list() { } } + @Test void testRegexpInstr() { + final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction(); + + // basic searches + assertThat(f.regexpInstr("abcdefghij", "adc"), is(0)); + assertThat(f.regexpInstr("abcdefghij", ""), is(0)); + assertThat(f.regexpInstr("a9ca5c4aechi", "a[0-9]c"), is(1)); + assertThat(f.regexpInstr("abcadcabcaecghi", ".dc"), is(4)); + + // capturing group searches + assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)"), is(4)); + assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)", 4), is(10)); + assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)", 1, 2), is(10)); + assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)", 1, 2, 1), is(13)); + + // position-based searches + assertThat(f.regexpInstr("abcadcabcaecghi", ".ec", 25), is(0)); + assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 4), is(7)); + assertThat(f.regexpInstr("abcadcabcaecghi", "a.*c", 7), is(7)); + + // occurrence-based searches + assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 1, 3), is(0)); + assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 2, 1), is(7)); + assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 1, 1), is(1)); + + // occurrence_position-based searches + assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 1, 1, 0), is(1)); + assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)", 7, 1, 1), is(13)); + assertThat(f.regexpInstr("abcadcabcaec", "abc(a.c)", 4, 1, 1), is(13)); + + // exceptional scenarios + try { + final int idx = f.regexpInstr("abc def ghi", "{4,1}"); + fail("expected error, got " + idx); + } catch (RuntimeException e) { + assertThat(e.getMessage(), + is("Invalid regular expression for REGEXP_INSTR: 'Illegal repetition range near index 4" + + " {4,1} ^'")); + } + + try { + final int idx = f.regexpInstr("abcadcabcaecghi", "(.)a(.c)"); + fail("expected error, got " + idx); + } catch (RuntimeException e) { + assertThat(e.getMessage(), + is("Multiple capturing groups (count=2) not allowed in regex input for " + + "REGEXP_INSTR")); + } + + try { + final int idx = f.regexpInstr("abcadcabcaecghi", "a.c", 0); + fail("expected error, got " + idx); + } catch (RuntimeException e) { + assertThat(e.getMessage(), + is("Invalid integer input '0' for argument 'position' in REGEXP_INSTR")); + } + + try { + final int idx = f.regexpInstr("abcadcabcaecghi", "a.c", 3, -1); + fail("expected error, got " + idx); + } catch (RuntimeException e) { + assertThat(e.getMessage(), + is("Invalid integer input '-1' for argument 'occurrence' in REGEXP_INSTR")); + } + + try { + final int idx = f.regexpInstr("abcadcabcaecghi", "a.c", 2, 4, -4); + fail("expected error, got " + idx); + } catch (RuntimeException e) { + assertThat(e.getMessage(), + is("Invalid integer input '-4' for argument 'occurrence_position' in REGEXP_INSTR")); + } + } + @Test void testRegexpReplace() { final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction(); assertThat(f.regexpReplace("a b c", "b", "X"), is("a X c")); diff --git a/site/_docs/reference.md b/site/_docs/reference.md index 6c0b9c2a636..857445d27f4 100644 --- a/site/_docs/reference.md +++ b/site/_docs/reference.md @@ -2790,6 +2790,7 @@ BigQuery's type system uses confusingly different names for types and functions: | b | REGEXP_CONTAINS(string, regexp) | Returns whether *string* is a partial match for the *regexp* | b | REGEXP_EXTRACT(string, regexp [, position [, occurrence]]) | Returns the substring in *string* that matches the *regexp*, starting search at *position* (default 1), and until locating the nth *occurrence* (default 1). Returns NULL if there is no match | b | REGEXP_EXTRACT_ALL(string, regexp) | Returns an array of all substrings in *string* that matches the *regexp*. Returns an empty array if there is no match +| b | REGEXP_INSTR(string, regexp [, position [, occurrence [, occurrence_position]]]) | Returns the lowest 1-based position of the substring in *string* that matches the *regexp*, starting search at *position* (default 1), and until locating the nth *occurrence* (default 1). Setting occurrence_position (default 0) to 1 returns the end position of substring + 1. Returns 0 if there is no match | m o | REGEXP_REPLACE(string, regexp, rep [, pos [, occurrence [, matchType]]]) | Replaces all substrings of *string* that match *regexp* with *rep* at the starting *pos* in expr (if omitted, the default is 1), *occurrence* means which occurrence of a match to search for (if omitted, the default is 1), *matchType* specifies how to perform matching | b | REGEXP_SUBSTR(string, regexp [, position [, occurrence]]) | Synonym for REGEXP_EXTRACT | b m p | REPEAT(string, integer) | Returns a string consisting of *string* repeated of *integer* times; returns an empty string if *integer* is less than 1 diff --git a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java index 4e0636be075..e4b774ac59c 100644 --- a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java +++ b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java @@ -4666,6 +4666,7 @@ private static void checkIf(SqlOperatorFixture f) { f.checkBoolean("regexp_contains('11555666442233', '^\\d{10}$')", false); f.checkBoolean("regexp_contains('55566644221133', '\\d{10}')", true); f.checkBoolean("regexp_contains('55as56664as422', '\\d{10}')", false); + f.checkBoolean("regexp_contains('55as56664as422', '')", true); f.checkQuery("select regexp_contains('abc def ghi', 'abc')"); f.checkQuery("select regexp_contains('foo@bar.com', '@[a-zA-Z0-9-]+\\\\.[a-zA-Z0-9-.]+')"); @@ -4729,6 +4730,36 @@ private static void checkIf(SqlOperatorFixture f) { f.checkQuery("select regexp_extract_all('55as56664as422', '\\d{10}')"); } + @Test void testRegexpInstrFunc() { + final SqlOperatorFixture f = + fixture().setFor(SqlLibraryOperators.REGEXP_INSTR).withLibrary(SqlLibrary.BIG_QUERY); + + f.checkScalar("regexp_instr('abc def ghi', 'def')", 5, "INTEGER NOT NULL"); + f.checkScalar("regexp_instr('abcadcaecghi', 'a.c', 2)", 4, "INTEGER NOT NULL"); + f.checkScalar("regexp_instr('abcadcaecghi', 'a.c', 1, 3)", 7, "INTEGER NOT NULL"); + f.checkScalar("regexp_instr('abcadcaecghi', 'a.c', 1, 3, 1)", 10, "INTEGER NOT NULL"); + f.checkScalar("regexp_instr('a9cadca513ca4cecghi', 'a([0-9]+)', 1, 2, 1)", 11, + "INTEGER NOT NULL"); + f.checkScalar("regexp_instr('a9cadca513ca4cecghi', 'a([0-9]*)', 8, 1, 0)", 13, + "INTEGER NOT NULL"); + f.checkScalar("regexp_instr('55as56664as422', '\\d{3}', 3, 2, 0)", 12, "INTEGER NOT NULL"); + f.checkScalar("regexp_instr('55as56664as422', '\\d{2}', 2, 2, 1)", 9, "INTEGER NOT NULL"); + f.checkScalar("regexp_instr('55as56664as422', '', 2, 2, 1)", 0, "INTEGER NOT NULL"); + + f.checkNull("regexp_instr('abc def ghi', cast(null as varchar))"); + f.checkNull("regexp_instr(cast(null as varchar), 'abc')"); + f.checkNull("regexp_instr(cast(null as varchar), cast(null as varchar))"); + f.checkNull("regexp_instr('abc def ghi', 'abc', cast(null as integer))"); + f.checkNull("regexp_instr('abc def ghi', 'abc', 1, cast(null as integer))"); + f.checkNull("regexp_instr('abc def ghi', 'abc', 1, 3, cast(null as integer))"); + + f.checkQuery("select regexp_instr('abc def ghi', 'abc')"); + f.checkQuery("select regexp_instr('foo@bar.com', '@[a-zA-Z0-9-]+\\\\.[a-zA-Z0-9-.]+')"); + f.checkQuery("select regexp_instr('55as56664as422', '\\d{10}')"); + f.checkQuery("select regexp_instr('abcadcabcaecghi', 'c(a.c)', 4)"); + f.checkQuery("select regexp_instr('a9cadca5c4aecghi', 'a[0-9]c', 1, 3)"); + } + @Test void testRegexpReplaceFunc() { final SqlOperatorFixture f0 = fixture(); final Consumer consumer = f -> {