Skip to content

Commit

Permalink
chore(query): substr support function domain (#15795)
Browse files Browse the repository at this point in the history
* improve substr domain

* fix test

* fix test

* fix test
  • Loading branch information
zhyass committed Jun 13, 2024
1 parent d6744ab commit fcd3b55
Show file tree
Hide file tree
Showing 4 changed files with 166 additions and 20 deletions.
53 changes: 50 additions & 3 deletions src/query/functions/src/scalars/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use databend_common_expression::types::number::SimpleDomain;
use databend_common_expression::types::number::UInt64Type;
use databend_common_expression::types::string::StringColumn;
use databend_common_expression::types::string::StringColumnBuilder;
use databend_common_expression::types::string::StringDomain;
use databend_common_expression::types::ArrayType;
use databend_common_expression::types::NumberType;
use databend_common_expression::types::StringType;
Expand Down Expand Up @@ -631,7 +632,25 @@ pub fn register(registry: &mut FunctionRegistry) {

registry.register_passthrough_nullable_2_arg::<StringType, NumberType<u64>, StringType, _, _>(
"left",
|_, _, _| FunctionDomain::Full,
// Domain calculator for `left`: derives the output string bounds from the
// string argument's domain (`lhs`) and the length argument's domain (`rhs`).
|_, lhs, rhs| {
// Lower bound: `lhs.min` cut down to the smallest length when it is longer.
// NOTE(review): `slice` is presumably character-indexed, matching the
// `chars().count()` comparison - confirm against the trait in scope.
let rm = rhs.min as usize;
let min = if rm < lhs.min.chars().count() {
lhs.min.slice(0..rm).to_string()
} else {
lhs.min.clone()
};

// Upper bound: same treatment with the largest length; `map` leaves an
// absent `lhs.max` absent.
let rn = rhs.max as usize;
let max = lhs.max.as_ref().map(|ln| {
if rn < ln.chars().count() {
ln.slice(0..rn).to_string()
} else {
ln.clone()
}
});

FunctionDomain::Domain(StringDomain { min, max })
},
vectorize_with_builder_2_arg::<StringType, NumberType<u64>, StringType>(
|s, n, output, _| {
let n = n as usize;
Expand Down Expand Up @@ -665,7 +684,13 @@ pub fn register(registry: &mut FunctionRegistry) {

registry.register_passthrough_nullable_2_arg::<StringType, NumberType<i64>, StringType, _, _>(
"substr",
|_, _, _| FunctionDomain::Full,
// Domain calculator for two-argument `substr`: `lhs` is the string
// argument's domain, `rhs` the position argument's domain.
|_, lhs, rhs| {
// Only when the position domain is exactly the single value 1 is the input
// string domain reused unchanged as the output domain.
if rhs.min == rhs.max && rhs.min == 1 {
FunctionDomain::Domain(lhs.clone())
} else {
// Any other position range: no bounds are derived.
FunctionDomain::Full
}
},
vectorize_with_builder_2_arg::<StringType, NumberType<i64>, StringType>(
|s, pos, output, _ctx| {
substr(output, s, pos, s.len() as u64);
Expand All @@ -675,7 +700,29 @@ pub fn register(registry: &mut FunctionRegistry) {

registry.register_passthrough_nullable_3_arg::<StringType, NumberType<i64>, NumberType<u64>, StringType, _, _>(
"substr",
|_, _, _, _| FunctionDomain::Full,
// Domain calculator for three-argument `substr`: `arg1` is the string
// argument's domain, `arg2` the position domain, `arg3` the length domain.
|_, arg1, arg2, arg3| {
// Bounds are derived only when the position domain is exactly the single
// value 1; the computation then mirrors the one used for `left`.
if arg2.min == arg2.max && arg2.min == 1 {
// Lower bound: `arg1.min` cut down to the smallest length when it is longer.
// NOTE(review): `slice` is presumably character-indexed, matching the
// `chars().count()` comparison - confirm against the trait in scope.
let rm = arg3.min as usize;
let min = if rm < arg1.min.chars().count() {
arg1.min.slice(0..rm).to_string()
} else {
arg1.min.clone()
};

// Upper bound: same treatment with the largest length; `map` leaves an
// absent `arg1.max` absent.
let rn = arg3.max as usize;
let max = arg1.max.as_ref().map(|ln| {
if rn < ln.chars().count() {
ln.slice(0..rn).to_string()
} else {
ln.clone()
}
});

FunctionDomain::Domain(StringDomain { min, max })
} else {
// Any other position range: no bounds are derived.
FunctionDomain::Full
}
},
vectorize_with_builder_3_arg::<StringType, NumberType<i64>, NumberType<u64>, StringType>(|s, pos, len, output, _ctx| {
substr(output, s, pos, len);
}),
Expand Down
34 changes: 17 additions & 17 deletions src/query/functions/tests/it/scalars/testdata/string.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3312,23 +3312,23 @@ ast : left('123456789', a)
raw expr : left('123456789', a::UInt8)
checked expr : left<String, UInt64>("123456789", to_uint64<UInt8>(a))
evaluation:
+--------+----------+-------------+
| | a | Output |
+--------+----------+-------------+
| Type | UInt8 | String |
| Domain | {0..=10} | {""..} |
| Row 0 | 0 | '' |
| Row 1 | 1 | '1' |
| Row 2 | 2 | '12' |
| Row 3 | 3 | '123' |
| Row 4 | 4 | '1234' |
| Row 5 | 5 | '12345' |
| Row 6 | 6 | '123456' |
| Row 7 | 7 | '1234567' |
| Row 8 | 8 | '12345678' |
| Row 9 | 9 | '123456789' |
| Row 10 | 10 | '123456789' |
+--------+----------+-------------+
+--------+----------+--------------------+
| | a | Output |
+--------+----------+--------------------+
| Type | UInt8 | String |
| Domain | {0..=10} | {""..="123456789"} |
| Row 0 | 0 | '' |
| Row 1 | 1 | '1' |
| Row 2 | 2 | '12' |
| Row 3 | 3 | '123' |
| Row 4 | 4 | '1234' |
| Row 5 | 5 | '12345' |
| Row 6 | 6 | '123456' |
| Row 7 | 7 | '1234567' |
| Row 8 | 8 | '12345678' |
| Row 9 | 9 | '123456789' |
| Row 10 | 10 | '123456789' |
+--------+----------+--------------------+
evaluation (internal):
+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Column | Data |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,34 @@ select * exclude(timestamp) from clustering_information('default','t09_0014', '(
statement ok
drop table t09_0014

statement ok
create table t09_0014_1(c string)

statement ok
insert into t09_0014_1 values ('abc'), ('abd')

statement ok
insert into t09_0014_1 values ('xyy'), ('xyz')

query TIIFFT
select * exclude(timestamp) from clustering_information('default','t09_0014_1', '(c)')
----
(c) 2 0 0.0 1.0 {"00001":2}

query TIIFFT
select * exclude(timestamp) from clustering_information('default','t09_0014_1', '(substr(c,1))')
----
(SUBSTRING(c FROM 1)) 2 0 0.0 1.0 {"00001":2}

query TIIFFT
select * exclude(timestamp) from clustering_information('default','t09_0014_1', '(substr(c,1,2))')
----
(SUBSTRING(c FROM 1 FOR 2)) 2 2 0.0 1.0 {"00001":2}

query TIIFFT
select * exclude(timestamp) from clustering_information('default','t09_0014_1', '(substr(c,2,2))')
----
(SUBSTRING(c FROM 2 FOR 2)) 2 0 1.0 2.0 {"00002":2}

statement ok
drop table t09_0014_1
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
statement ok
create or replace table t1(c string)

statement ok
insert into t1 values ('abc'), ('abd')

statement ok
insert into t1 values ('xyy'), ('xyz')

# expects that range pruning prunes 1 of the 2 segments: "segments: <range pruning: 2 to 1>"
query T
explain select * from t1 where substr(c, 1, 2) = 'ab'
----
Filter
├── output columns: [t1.c (#0)]
├── filters: [is_true(substr(t1.c (#0), 1, 2) = 'ab')]
├── estimated rows: 0.80
└── TableScan
├── table: default.default.t1
├── output columns: [c (#0)]
├── read rows: 2
├── read size: < 1 KiB
├── partitions total: 2
├── partitions scanned: 1
├── pruning stats: [segments: <range pruning: 2 to 1>, blocks: <range pruning: 1 to 1>]
├── push downs: [filters: [is_true(substr(t1.c (#0), 1, 2) = 'ab')], limit: NONE]
└── estimated rows: 4.00

query T
explain select * from t1 where substr(c, 2, 2) = 'ab'
----
Filter
├── output columns: [t1.c (#0)]
├── filters: [is_true(substr(t1.c (#0), 2, 2) = 'ab')]
├── estimated rows: 0.80
└── TableScan
├── table: default.default.t1
├── output columns: [c (#0)]
├── read rows: 4
├── read size: < 1 KiB
├── partitions total: 2
├── partitions scanned: 2
├── pruning stats: [segments: <range pruning: 2 to 2>, blocks: <range pruning: 2 to 2>]
├── push downs: [filters: [is_true(substr(t1.c (#0), 2, 2) = 'ab')], limit: NONE]
└── estimated rows: 4.00

# expects that range pruning prunes 1 of the 2 segments: "segments: <range pruning: 2 to 1>"
query T
explain select * from t1 where left(c, 2) = 'ab'
----
Filter
├── output columns: [t1.c (#0)]
├── filters: [is_true(left(t1.c (#0), 2) = 'ab')]
├── estimated rows: 0.80
└── TableScan
├── table: default.default.t1
├── output columns: [c (#0)]
├── read rows: 2
├── read size: < 1 KiB
├── partitions total: 2
├── partitions scanned: 1
├── pruning stats: [segments: <range pruning: 2 to 1>, blocks: <range pruning: 1 to 1>]
├── push downs: [filters: [is_true(left(t1.c (#0), 2) = 'ab')], limit: NONE]
└── estimated rows: 4.00

statement ok
drop table t1 all

0 comments on commit fcd3b55

Please sign in to comment.