Skip to content

Commit

Permalink
Added support for match bool prefix (#3806)
Browse files Browse the repository at this point in the history
Closes #3780
  • Loading branch information
fulmicoton authored Sep 7, 2023
1 parent 04a6e4a commit 2409d2b
Show file tree
Hide file tree
Showing 9 changed files with 327 additions and 34 deletions.
29 changes: 28 additions & 1 deletion docs/reference/es_compatible_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -368,9 +368,36 @@ The following query types are supported.
| `max_expansions` | `Integer` | Number of terms to be matched by the prefix matching. | 50 |
| `slop` | `Integer` | Allows extra tokens between the query tokens. | 0 |
| `analyzer` | String | Analyzer meant to cut the query into terms. It is recommended to NOT use this parameter. | The actual field tokenizer. |
| `zero_terms_query` | `all` or `none` | Defines if all (`all`) or no documents (`none`) should be returned if the query does not contain any terms after tokenization. | `none` |




### `match_bool_prefix`

[Elasticsearch reference documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-bool-prefix-query.html)

#### Example

```json
{
"match_bool_prefix": {
"payload.commits.message": {
"query" : "automated comm" // This will match "automated commit" for instance.
}
}
}
```

Contrary to ES/OpenSearch, Quickwit's `match_bool_prefix` considers at most 50 terms when expanding the last term of the query as a prefix.

#### Supported Parameters

| Variable | Type | Description | Default |
|-------------------|------------|------------------------------------------------------------------|---------|
| `query` | String | Full-text search query. The last token will be prefix-matched. | - |
| `operator` | `"AND"` or `"OR"` | Defines whether all terms should be present (`AND`) or if at least one term is sufficient to match (`OR`). | OR |
| `zero_terms_query`| `all` or `none` | Defines if all (`all`) or no documents (`none`) should be returned if the query does not contain any terms after tokenization. | `none` |

### `term`

[Elasticsearch reference documentation](https://www.elastic.co/guide/en/elasticsearch/reference/8.8/query-dsl-term-query.html)
Expand Down
70 changes: 49 additions & 21 deletions quickwit/quickwit-doc-mapper/src/query_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ use std::convert::Infallible;
use std::ops::Bound;

use quickwit_query::query_ast::{
PhrasePrefixQuery, QueryAst, QueryAstVisitor, RangeQuery, TermSetQuery,
FullTextMode, FullTextQuery, PhrasePrefixQuery, QueryAst, QueryAstVisitor, RangeQuery,
TermSetQuery,
};
use quickwit_query::InvalidQuery;
use tantivy::query::Query;
Expand Down Expand Up @@ -69,7 +70,7 @@ pub(crate) fn build_query(

let term_set_query_fields = extract_term_set_query_fields(query_ast);
let term_ranges_grouped_by_field =
extract_phrase_prefix_term_ranges(query_ast, &schema, tokenizer_manager)?;
extract_prefix_term_ranges(query_ast, &schema, tokenizer_manager)?;

let mut terms_grouped_by_field: HashMap<Field, HashMap<_, bool>> = Default::default();
query.query_terms(&mut |term, need_position| {
Expand Down Expand Up @@ -134,52 +135,79 @@ fn prefix_term_to_range(prefix: Term) -> (Bound<Term>, Bound<Term>) {
(Bound::Included(prefix), Bound::Unbounded)
}

struct ExtractPhrasePrefixTermRanges<'a> {
type PositionNeeded = bool;

struct ExtractPrefixTermRanges<'a> {
schema: &'a Schema,
tokenizer_manager: &'a TokenizerManager,
term_ranges_to_warm_up: HashMap<Field, HashMap<TermRange, bool>>,
term_ranges_to_warm_up: HashMap<Field, HashMap<TermRange, PositionNeeded>>,
}

impl<'a> ExtractPhrasePrefixTermRanges<'a> {
impl<'a> ExtractPrefixTermRanges<'a> {
fn with_schema(schema: &'a Schema, tokenizer_manager: &'a TokenizerManager) -> Self {
ExtractPhrasePrefixTermRanges {
ExtractPrefixTermRanges {
schema,
tokenizer_manager,
term_ranges_to_warm_up: HashMap::new(),
}
}

/// Registers the term range derived from the prefix `term` in `term_ranges_to_warm_up`.
///
/// The range bounds come from `prefix_term_to_range(term)`, the range's `limit` is set to
/// `max_expansions`, and the entry is stored under the term's field.
///
/// The same range can be registered more than once with different `position_needed`
/// values (for instance by a phrase-prefix query, which asks for positions, and a
/// bool-prefix query, which does not). The flags are OR-ed together so that a later
/// registration that does not need positions never clears an earlier request for them.
fn add_prefix_term(
    &mut self,
    term: Term,
    max_expansions: u32,
    position_needed: PositionNeeded,
) {
    let field = term.field();
    let (start, end) = prefix_term_to_range(term);
    let term_range = TermRange {
        start,
        end,
        // Lossless widening; `u64::from` makes that explicit, unlike an `as` cast.
        limit: Some(u64::from(max_expansions)),
    };
    // Merge with any existing flag instead of overwriting it (a missing entry starts at
    // `false`, the `bool` default).
    *self
        .term_ranges_to_warm_up
        .entry(field)
        .or_default()
        .entry(term_range)
        .or_default() |= position_needed;
}
}

impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPhrasePrefixTermRanges<'b> {
impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> {
type Err = InvalidQuery;

/// Registers the prefix term range of a full-text query that is in bool-prefix mode.
///
/// Queries in any other mode are left untouched. For a bool-prefix query, the last term
/// returned by `get_last_term` (if any) is registered with the mode's `max_expansions`
/// and without requesting positions.
fn visit_full_text(&mut self, full_text_query: &'a FullTextQuery) -> Result<(), Self::Err> {
    let max_expansions = match &full_text_query.params.mode {
        FullTextMode::BoolPrefix { max_expansions, .. } => *max_expansions,
        // Nothing to register for the other full-text modes.
        _ => return Ok(()),
    };
    let last_term_opt = full_text_query.get_last_term(self.schema, self.tokenizer_manager);
    if let Some(last_term) = last_term_opt {
        self.add_prefix_term(last_term, max_expansions, false);
    }
    Ok(())
}

fn visit_phrase_prefix(
&mut self,
phrase_prefix: &'a PhrasePrefixQuery,
) -> Result<(), Self::Err> {
let (field, terms) = phrase_prefix.get_terms(self.schema, self.tokenizer_manager)?;
let (_, terms) = phrase_prefix.get_terms(self.schema, self.tokenizer_manager)?;
if let Some((_, term)) = terms.last() {
let (start, end) = prefix_term_to_range(term.clone());
let term_range = TermRange {
start,
end,
limit: Some(phrase_prefix.max_expansions as u64),
};
self.term_ranges_to_warm_up
.entry(field)
.or_default()
.insert(term_range, true);
self.add_prefix_term(term.clone(), phrase_prefix.max_expansions, true);
}
Ok(())
}
}

fn extract_phrase_prefix_term_ranges(
fn extract_prefix_term_ranges(
query_ast: &QueryAst,
schema: &Schema,
tokenizer_manager: &TokenizerManager,
) -> anyhow::Result<HashMap<Field, HashMap<TermRange, bool>>> {
let mut visitor = ExtractPhrasePrefixTermRanges::with_schema(schema, tokenizer_manager);
) -> anyhow::Result<HashMap<Field, HashMap<TermRange, PositionNeeded>>> {
let mut visitor = ExtractPrefixTermRanges::with_schema(schema, tokenizer_manager);
visitor.visit(query_ast)?;
Ok(visitor.term_ranges_to_warm_up)
}
Expand Down
65 changes: 65 additions & 0 deletions quickwit/quickwit-query/src/elastic_query_dsl/match_bool_prefix.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Copyright (C) 2023 Quickwit, Inc.
//
// Quickwit is offered under the AGPL v3.0 and as commercial software.
// For commercial licensing, contact us at [email protected].
//
// AGPL:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

use serde::Deserialize;

use crate::elastic_query_dsl::match_query::{MatchQueryParams, MatchQueryParamsForDeserialization};
use crate::elastic_query_dsl::{default_max_expansions, ConvertableToQueryAst};
use crate::query_ast::{FullTextParams, FullTextQuery, QueryAst};
use crate::OneFieldMap;

/// Elasticsearch-compatible `match_bool_prefix` query, as defined in
/// <https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-bool-prefix-query.html>
///
/// Deserialized through `OneFieldMap<MatchQueryParamsForDeserialization>` (see the `from`
/// attribute below), so it reuses the parameter type of the `match` query.
// NOTE(review): only `Deserialize` is derived here, so the `into` attribute (a
// serialization-side conversion in serde) looks unused — confirm and consider dropping it.
#[derive(Deserialize, Clone, Eq, PartialEq, Debug)]
#[serde(
from = "OneFieldMap<MatchQueryParamsForDeserialization>",
into = "OneFieldMap<MatchQueryParams>"
)]
pub(crate) struct MatchBoolPrefixQuery {
/// Field name, taken from the `field` of the deserialized `OneFieldMap`.
pub(crate) field: String,
/// Query parameters, using the `MatchQueryParams` type from the `match_query` module.
pub(crate) params: MatchQueryParams,
}

impl ConvertableToQueryAst for MatchBoolPrefixQuery {
fn convert_to_query_ast(self) -> anyhow::Result<QueryAst> {
let full_text_params = FullTextParams {
tokenizer: None,
mode: crate::query_ast::FullTextMode::BoolPrefix {
operator: self.params.operator,
max_expansions: default_max_expansions(),
},
zero_terms_query: self.params.zero_terms_query,
};
Ok(QueryAst::FullText(FullTextQuery {
field: self.field,
text: self.params.query,
params: full_text_params,
}))
}
}

impl From<OneFieldMap<MatchQueryParamsForDeserialization>> for MatchBoolPrefixQuery {
fn from(match_query_params: OneFieldMap<MatchQueryParamsForDeserialization>) -> Self {
let OneFieldMap { field, value } = match_query_params;
MatchBoolPrefixQuery {
field,
params: value.inner,
}
}
}
4 changes: 2 additions & 2 deletions quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ impl From<MatchQuery> for ElasticQueryDslInner {

#[derive(Deserialize)]
#[serde(transparent)]
struct MatchQueryParamsForDeserialization {
pub(crate) struct MatchQueryParamsForDeserialization {
#[serde(deserialize_with = "string_or_struct")]
inner: MatchQueryParams,
pub(crate) inner: MatchQueryParams,
}

impl From<MatchQuery> for OneFieldMap<MatchQueryParams> {
Expand Down
10 changes: 10 additions & 0 deletions quickwit/quickwit-query/src/elastic_query_dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use serde::{Deserialize, Serialize};

mod bool_query;
mod exists_query;
mod match_bool_prefix;
mod match_phrase_query;
mod match_query;
mod multi_match;
Expand All @@ -39,13 +40,18 @@ use range_query::RangeQuery;
use term_query::TermQuery;

use crate::elastic_query_dsl::exists_query::ExistsQuery;
use crate::elastic_query_dsl::match_bool_prefix::MatchBoolPrefixQuery;
use crate::elastic_query_dsl::match_phrase_query::MatchPhraseQuery;
use crate::elastic_query_dsl::match_query::MatchQuery;
use crate::elastic_query_dsl::multi_match::MultiMatchQuery;
use crate::elastic_query_dsl::terms_query::TermsQuery;
use crate::not_nan_f32::NotNaNf32;
use crate::query_ast::QueryAst;

/// Returns the default `max_expansions` value (50) used by the prefix queries of this module.
fn default_max_expansions() -> u32 {
    const DEFAULT_MAX_EXPANSIONS: u32 = 50;
    DEFAULT_MAX_EXPANSIONS
}

#[derive(Serialize, Deserialize, Debug, Eq, PartialEq, Clone, Copy, Default)]
pub(crate) struct MatchAllQuery {
pub boost: Option<NotNaNf32>,
Expand All @@ -64,6 +70,7 @@ pub(crate) enum ElasticQueryDslInner {
MatchAll(MatchAllQuery),
MatchNone(MatchNoneQuery),
Match(MatchQuery),
MatchBoolPrefix(MatchBoolPrefixQuery),
MatchPhrase(MatchPhraseQuery),
MatchPhrasePrefix(MatchPhrasePrefixQuery),
MultiMatch(MultiMatchQuery),
Expand Down Expand Up @@ -105,6 +112,9 @@ impl ConvertableToQueryAst for ElasticQueryDslInner {
}
}
Self::MatchNone(_) => Ok(QueryAst::MatchNone),
Self::MatchBoolPrefix(match_bool_prefix_query) => {
match_bool_prefix_query.convert_to_query_ast()
}
Self::MatchPhrase(match_phrase_query) => match_phrase_query.convert_to_query_ast(),
Self::MatchPhrasePrefix(match_phrase_prefix) => {
match_phrase_prefix.convert_to_query_ast()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,14 @@
use serde::Deserialize;

use crate::elastic_query_dsl::one_field_map::OneFieldMap;
use crate::elastic_query_dsl::{ConvertableToQueryAst, ElasticQueryDslInner};
use crate::elastic_query_dsl::{
default_max_expansions, ConvertableToQueryAst, ElasticQueryDslInner,
};
use crate::query_ast::{self, FullTextMode, FullTextParams, QueryAst};
use crate::MatchAllOrNone;

pub(crate) type MatchPhrasePrefixQuery = OneFieldMap<MatchPhrasePrefixQueryParams>;

fn default_max_expansions() -> u32 {
50
}

#[derive(PartialEq, Eq, Debug, Deserialize, Clone)]
#[serde(deny_unknown_fields)]
pub(crate) struct MatchPhrasePrefixQueryParams {
Expand Down
Loading

0 comments on commit 2409d2b

Please sign in to comment.