diff --git a/src/query/functions/src/scalars/string.rs b/src/query/functions/src/scalars/string.rs index ab1c8703e5a1..d794cf780c75 100644 --- a/src/query/functions/src/scalars/string.rs +++ b/src/query/functions/src/scalars/string.rs @@ -21,6 +21,7 @@ use databend_common_expression::types::number::SimpleDomain; use databend_common_expression::types::number::UInt64Type; use databend_common_expression::types::string::StringColumn; use databend_common_expression::types::string::StringColumnBuilder; +use databend_common_expression::types::string::StringDomain; use databend_common_expression::types::ArrayType; use databend_common_expression::types::NumberType; use databend_common_expression::types::StringType; @@ -631,7 +632,25 @@ pub fn register(registry: &mut FunctionRegistry) { registry.register_passthrough_nullable_2_arg::, StringType, _, _>( "left", - |_, _, _| FunctionDomain::Full, + |_, lhs, rhs| { + let rm = rhs.min as usize; + let min = if rm < lhs.min.chars().count() { + lhs.min.slice(0..rm).to_string() + } else { + lhs.min.clone() + }; + + let rn = rhs.max as usize; + let max = lhs.max.as_ref().map(|ln| { + if rn < ln.chars().count() { + ln.slice(0..rn).to_string() + } else { + ln.clone() + } + }); + + FunctionDomain::Domain(StringDomain { min, max }) + }, vectorize_with_builder_2_arg::, StringType>( |s, n, output, _| { let n = n as usize; @@ -665,7 +684,13 @@ pub fn register(registry: &mut FunctionRegistry) { registry.register_passthrough_nullable_2_arg::, StringType, _, _>( "substr", - |_, _, _| FunctionDomain::Full, + |_, lhs, rhs| { + if rhs.min == rhs.max && rhs.min == 1 { + FunctionDomain::Domain(lhs.clone()) + } else { + FunctionDomain::Full + } + }, vectorize_with_builder_2_arg::, StringType>( |s, pos, output, _ctx| { substr(output, s, pos, s.len() as u64); @@ -675,7 +700,29 @@ pub fn register(registry: &mut FunctionRegistry) { registry.register_passthrough_nullable_3_arg::, NumberType, StringType, _, _>( "substr", - |_, _, _, _| FunctionDomain::Full, + |_, arg1, arg2, arg3| { + if arg2.min == arg2.max && arg2.min == 1 { + let rm = arg3.min as usize; + let min = if rm < arg1.min.chars().count() { + arg1.min.slice(0..rm).to_string() + } else { + arg1.min.clone() + }; + + let rn = arg3.max as usize; + let max = arg1.max.as_ref().map(|ln| { + if rn < ln.chars().count() { + ln.slice(0..rn).to_string() + } else { + ln.clone() + } + }); + + FunctionDomain::Domain(StringDomain { min, max }) + } else { + FunctionDomain::Full + } + }, vectorize_with_builder_3_arg::, NumberType, StringType>(|s, pos, len, output, _ctx| { substr(output, s, pos, len); }), diff --git a/src/query/functions/tests/it/scalars/testdata/string.txt b/src/query/functions/tests/it/scalars/testdata/string.txt index 870f131681a8..c931bee13da7 100644 --- a/src/query/functions/tests/it/scalars/testdata/string.txt +++ b/src/query/functions/tests/it/scalars/testdata/string.txt @@ -3312,23 +3312,23 @@ ast : left('123456789', a) raw expr : left('123456789', a::UInt8) checked expr : left("123456789", to_uint64(a)) evaluation: -+--------+----------+-------------+ -| | a | Output | -+--------+----------+-------------+ -| Type | UInt8 | String | -| Domain | {0..=10} | {""..} | -| Row 0 | 0 | '' | -| Row 1 | 1 | '1' | -| Row 2 | 2 | '12' | -| Row 3 | 3 | '123' | -| Row 4 | 4 | '1234' | -| Row 5 | 5 | '12345' | -| Row 6 | 6 | '123456' | -| Row 7 | 7 | '1234567' | -| Row 8 | 8 | '12345678' | -| Row 9 | 9 | '123456789' | -| Row 10 | 10 | '123456789' | -+--------+----------+-------------+ ++--------+----------+--------------------+ +| | a | Output | ++--------+----------+--------------------+ +| Type | UInt8 | String | +| Domain | {0..=10} | {""..="123456789"} | +| Row 0 | 0 | '' | +| Row 1 | 1 | '1' | +| Row 2 | 2 | '12' | +| Row 3 | 3 | '123' | +| Row 4 | 4 | '1234' | +| Row 5 | 5 | '12345' | +| Row 6 | 6 | '123456' | +| Row 7 | 7 | '1234567' | +| Row 8 | 8 | '12345678' | +| Row 9 | 9 | '123456789' | +| Row 10 | 10 | '123456789' | ++--------+----------+--------------------+ evaluation (internal): +--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Column | Data | diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0014_func_clustering_information_function.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0014_func_clustering_information_function.test index f2134b27bcf0..5d92d83a8ee0 100644 --- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0014_func_clustering_information_function.test +++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0014_func_clustering_information_function.test @@ -46,3 +46,34 @@ select * exclude(timestamp) from clustering_information('default','t09_0014', '( statement ok drop table t09_0014 +statement ok +create table t09_0014_1(c string) + +statement ok +insert into t09_0014_1 values ('abc'), ('abd') + +statement ok +insert into t09_0014_1 values ('xyy'), ('xyz') + +query TIIFFT +select * exclude(timestamp) from clustering_information('default','t09_0014_1', '(c)') +---- +(c) 2 0 0.0 1.0 {"00001":2} + +query TIIFFT +select * exclude(timestamp) from clustering_information('default','t09_0014_1', '(substr(c,1))') +---- +(SUBSTRING(c FROM 1)) 2 0 0.0 1.0 {"00001":2} + +query TIIFFT +select * exclude(timestamp) from clustering_information('default','t09_0014_1', '(substr(c,1,2))') +---- +(SUBSTRING(c FROM 1 FOR 2)) 2 2 0.0 1.0 {"00001":2} + +query TIIFFT +select * exclude(timestamp) from clustering_information('default','t09_0014_1', '(substr(c,2,2))') +---- +(SUBSTRING(c FROM 2 FOR 2)) 2 0 1.0 2.0 {"00002":2} + +statement ok +drop table t09_0014_1 diff --git a/tests/sqllogictests/suites/mode/standalone/explain/explain_substr.test b/tests/sqllogictests/suites/mode/standalone/explain/explain_substr.test new file mode 100644 index 000000000000..bde37c2700d3 --- /dev/null +++ b/tests/sqllogictests/suites/mode/standalone/explain/explain_substr.test @@ -0,0 +1,68 @@ +statement ok +create or replace table t1(c string) + +statement ok +insert into t1 values ('abc'), ('abd') + +statement ok +insert into t1 values ('xyy'), ('xyz') + +# expects that range pruning prunes 1 block: "range pruning: 2 to 1" +query T +explain select * from t1 where substr(c, 1, 2) = 'ab' +---- +Filter +├── output columns: [t1.c (#0)] +├── filters: [is_true(substr(t1.c (#0), 1, 2) = 'ab')] +├── estimated rows: 0.80 +└── TableScan + ├── table: default.default.t1 + ├── output columns: [c (#0)] + ├── read rows: 2 + ├── read size: < 1 KiB + ├── partitions total: 2 + ├── partitions scanned: 1 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(substr(t1.c (#0), 1, 2) = 'ab')], limit: NONE] + └── estimated rows: 4.00 + +query T +explain select * from t1 where substr(c, 2, 2) = 'ab' +---- +Filter +├── output columns: [t1.c (#0)] +├── filters: [is_true(substr(t1.c (#0), 2, 2) = 'ab')] +├── estimated rows: 0.80 +└── TableScan + ├── table: default.default.t1 + ├── output columns: [c (#0)] + ├── read rows: 4 + ├── read size: < 1 KiB + ├── partitions total: 2 + ├── partitions scanned: 2 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(substr(t1.c (#0), 2, 2) = 'ab')], limit: NONE] + └── estimated rows: 4.00 + +# expects that range pruning prunes 1 block: "range pruning: 2 to 1" +query T +explain select * from t1 where left(c, 2) = 'ab' +---- +Filter +├── output columns: [t1.c (#0)] +├── filters: [is_true(left(t1.c (#0), 2) = 'ab')] +├── estimated rows: 0.80 +└── TableScan + ├── table: default.default.t1 + ├── output columns: [c (#0)] + ├── read rows: 2 + ├── read size: < 1 KiB + ├── partitions total: 2 + ├── partitions scanned: 1 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(left(t1.c (#0), 2) = 'ab')], limit: NONE] + └── estimated rows: 4.00 + +statement ok +drop table t1 all +