From 50a6e71d0f2b4989eb7c8eaf7781fbb7c63ac1ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Massot?=
Date: Wed, 5 Jul 2023 00:32:33 +0200
Subject: [PATCH] Add support for custom tokenizers ngram and regex. (#3575)

* Add support for custom tokenizers ngram and regex.
* Update backward compatibility tests.
* Update tantivy version and tokenizers.
* Improve tests and add endpoint to analyze text.
* Better error on analyze endpoint.
* Fix from reviews.
---
 quickwit/Cargo.lock | 29 +--
 quickwit/Cargo.toml | 2 +-
 .../tests/index_config/hdfs-logs.json | 9 +-
 .../tests/index_config/hdfs-logs.toml | 5 +-
 .../tests/index_config/hdfs-logs.yaml | 6 +-
 .../quickwit-config/src/index_config/mod.rs | 16 +-
 .../src/default_doc_mapper/default_mapper.rs | 246 +++++++++++++++++-
 .../default_mapper_builder.rs | 4 +
 .../default_doc_mapper/field_mapping_entry.rs | 54 ++--
 .../src/default_doc_mapper/mod.rs | 5 +
 .../src/default_doc_mapper/tokenizer_entry.rs | 238 +++++++++++++++++
 .../quickwit-doc-mapper/src/doc_mapper.rs | 4 +
 quickwit/quickwit-doc-mapper/src/lib.rs | 20 +-
 .../quickwit-doc-mapper/src/query_builder.rs | 49 +++-
 .../quickwit-indexing/src/actors/indexer.rs | 8 +-
 .../src/actors/merge_executor.rs | 28 +-
 .../quickwit-indexing/src/actors/packager.rs | 2 +-
 .../src/backward_compatibility_tests/mod.rs | 15 +-
 .../file-backed-index/v0.4.expected.json | 3 +-
 .../file-backed-index/v0.5.expected.json | 3 +-
 .../file-backed-index/v0.6.expected.json | 10 +-
 .../test-data/file-backed-index/v0.6.json | 10 +-
 .../index-metadata/v0.4.expected.json | 3 +-
 .../index-metadata/v0.5.expected.json | 3 +-
 .../index-metadata/v0.6.expected.json | 10 +-
 .../test-data/index-metadata/v0.6.json | 10 +-
 quickwit/quickwit-query/src/lib.rs | 5 +-
 .../src/query_ast/bool_query.rs | 33 ++-
 .../src/query_ast/full_text_query.rs | 58 ++++-
 quickwit/quickwit-query/src/query_ast/mod.rs | 100 ++++---
 .../src/query_ast/phrase_prefix_query.rs | 7 +-
 .../src/query_ast/range_query.rs | 55 +++-
 .../src/query_ast/term_query.rs | 32 ++-
 .../src/query_ast/term_set_query.rs | 13 +-
 .../src/query_ast/user_input_query.rs | 18 +-
 .../quickwit-query/src/query_ast/utils.rs | 32 ++-
 quickwit/quickwit-query/src/tokenizers.rs | 37 ++-
 quickwit/quickwit-search/Cargo.toml | 1 +
 quickwit/quickwit-search/src/fetch_docs.rs | 12 +-
 quickwit/quickwit-search/src/leaf.rs | 19 +-
 .../quickwit-search/src/search_stream/leaf.rs | 9 +-
 quickwit/quickwit-search/src/tests.rs | 48 +++-
 .../src/index_api/rest_handler.rs | 85 ++++++
 43 files changed, 1147 insertions(+), 209 deletions(-)
 create mode 100644 quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs

diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock
index 19aa44fbd38..09be8f99d91 100644
--- a/quickwit/Cargo.lock
+++ b/quickwit/Cargo.lock
@@ -2997,9 +2997,9 @@ dependencies = [
 
 [[package]]
 name = "lz4_flex"
-version = "0.10.0"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b8c72594ac26bfd34f2d99dfced2edfaddfe8a476e3ff2ca0eb293d925c4f83"
+checksum = "3ea9b256699eda7b0387ffbc776dd625e28bde3918446381781245b7a50349d8"
 
 [[package]]
 name = "match_cfg"
@@ -3061,9 +3061,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
 
 [[package]]
 name = "memmap2"
-version = "0.6.2"
+version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d28bba84adfe6646737845bc5ebbfa2c08424eb1c37e94a1fd2a82adb56a872"
+checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6"
dependencies = [ "libc", ] @@ -3683,7 +3683,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.5.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=924fc70#924fc70cb58f56dcd1a0547f2528c9ea86452763" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=3c30066#3c300666ad448386136d2595b613b3236b123ff9" dependencies = [ "stable_deref_trait", ] @@ -6473,8 +6473,8 @@ dependencies = [ [[package]] name = "tantivy" -version = "0.20.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=924fc70#924fc70cb58f56dcd1a0547f2528c9ea86452763" +version = "0.20.2" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=3c30066#3c300666ad448386136d2595b613b3236b123ff9" dependencies = [ "aho-corasick", "arc-swap", @@ -6486,7 +6486,6 @@ dependencies = [ "crc32fast", "crossbeam-channel", "downcast-rs", - "fail", "fastdivide", "fs4", "futures-util", @@ -6529,7 +6528,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.4.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=924fc70#924fc70cb58f56dcd1a0547f2528c9ea86452763" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=3c30066#3c300666ad448386136d2595b613b3236b123ff9" dependencies = [ "bitpacking", ] @@ -6537,7 +6536,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.1.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=924fc70#924fc70cb58f56dcd1a0547f2528c9ea86452763" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=3c30066#3c300666ad448386136d2595b613b3236b123ff9" dependencies = [ "fastdivide", "fnv", @@ -6552,7 +6551,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.5.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=924fc70#924fc70cb58f56dcd1a0547f2528c9ea86452763" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=3c30066#3c300666ad448386136d2595b613b3236b123ff9" dependencies = [ "async-trait", "byteorder", @@ -6575,7 +6574,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.20.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=924fc70#924fc70cb58f56dcd1a0547f2528c9ea86452763" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=3c30066#3c300666ad448386136d2595b613b3236b123ff9" dependencies = [ "combine", "once_cell", @@ -6585,7 +6584,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.1.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=924fc70#924fc70cb58f56dcd1a0547f2528c9ea86452763" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=3c30066#3c300666ad448386136d2595b613b3236b123ff9" dependencies = [ "tantivy-common", "tantivy-fst", @@ -6595,7 +6594,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.1.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=924fc70#924fc70cb58f56dcd1a0547f2528c9ea86452763" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=3c30066#3c300666ad448386136d2595b613b3236b123ff9" dependencies = [ "murmurhash32", "tantivy-common", @@ -6604,7 +6603,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.1.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=924fc70#924fc70cb58f56dcd1a0547f2528c9ea86452763" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=3c30066#3c300666ad448386136d2595b613b3236b123ff9" dependencies = [ "serde", ] diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index dd86593ced1..181649791e9 100644 --- 
a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -224,7 +224,7 @@ quickwit-serve = { version = "0.6.1", path = "./quickwit-serve" } quickwit-storage = { version = "0.6.1", path = "./quickwit-storage" } quickwit-telemetry = { version = "0.6.1", path = "./quickwit-telemetry" } -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "924fc70", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "3c30066", default-features = false, features = [ "mmap", "lz4-compression", "zstd-compression", diff --git a/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.json b/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.json index 738162998cb..c4a61cf79a5 100644 --- a/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.json +++ b/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.json @@ -4,6 +4,13 @@ "index_id": "hdfs-logs", "index_uri": "s3://quickwit-indexes/hdfs-logs", "doc_mapping": { + "tokenizers": [ + { + "name": "service_regex", + "type": "regex", + "pattern": "\\w*" + } + ], "field_mappings": [ { "name": "tenant_id", @@ -33,7 +40,7 @@ { "name": "service", "type": "text", - "tokenizer": "raw" + "tokenizer": "service_regex" } ] } diff --git a/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.toml b/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.toml index dc5ddcefb3d..4c177c3c513 100644 --- a/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.toml +++ b/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.toml @@ -3,12 +3,15 @@ index_id = "hdfs-logs" index_uri = "s3://quickwit-indexes/hdfs-logs" [doc_mapping] +tokenizers = [ + { name = "service_regex", type = "regex", pattern = "\\w*" }, +] field_mappings = [ { name = "tenant_id", type = "u64", fast = true }, { name = "timestamp", type = "datetime", fast = true }, { name = "severity_text", type = "text", tokenizer = "raw" }, { name = "body", type = "text", tokenizer = "default", record = "position" }, - { name = "resource", type = "object", field_mappings = [ { name = "service", type = "text", tokenizer = "raw" } ] }, + { name = "resource", type = "object", field_mappings = [ { name = "service", type = "text", tokenizer = "service_regex" } ] }, ] tag_fields = [ "tenant_id" ] store_source = true diff --git a/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.yaml b/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.yaml index c8658f9107a..caa390111d3 100644 --- a/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.yaml +++ b/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.yaml @@ -3,6 +3,10 @@ index_id: hdfs-logs index_uri: s3://quickwit-indexes/hdfs-logs doc_mapping: + tokenizers: + - name: service_regex + type: regex + pattern: "\\w*" field_mappings: - name: tenant_id type: u64 @@ -22,7 +26,7 @@ doc_mapping: field_mappings: - name: service type: text - tokenizer: raw + tokenizer: service_regex tag_fields: [tenant_id] timestamp_field: timestamp store_source: true diff --git a/quickwit/quickwit-config/src/index_config/mod.rs b/quickwit/quickwit-config/src/index_config/mod.rs index 53fdec46299..ac290688431 100644 --- a/quickwit/quickwit-config/src/index_config/mod.rs +++ b/quickwit/quickwit-config/src/index_config/mod.rs @@ -33,7 +33,7 @@ use humantime::parse_duration; use quickwit_common::uri::Uri; use quickwit_doc_mapper::{ DefaultDocMapper, DefaultDocMapperBuilder, DocMapper, FieldMappingEntry, ModeType, - 
QuickwitJsonOptions, + QuickwitJsonOptions, TokenizerEntry, }; use serde::{Deserialize, Serialize}; pub use serialize::load_index_config_from_user_config; @@ -76,6 +76,8 @@ pub struct DocMapping { #[schema(value_type = u32)] #[serde(default = "DefaultDocMapper::default_max_num_partitions")] pub max_num_partitions: NonZeroU32, + #[serde(default)] + pub tokenizers: Vec, } #[derive(Clone, Debug, Serialize, Deserialize, utoipa::ToSchema)] @@ -414,6 +416,14 @@ impl TestableForRegression for IndexConfig { }"#, ) .unwrap(); + let tokenizer = serde_json::from_str( + r#"{ + "name": "custom_tokenizer", + "type": "regex", + "pattern": "[^\\p{L}\\p{N}]+" + }"#, + ) + .unwrap(); let doc_mapping = DocMapping { field_mappings: vec![ tenant_id_mapping, @@ -431,6 +441,7 @@ impl TestableForRegression for IndexConfig { partition_key: Some("tenant_id".to_string()), max_num_partitions: NonZeroU32::new(100).unwrap(), timestamp_field: Some("timestamp".to_string()), + tokenizers: vec![tokenizer], }; let retention_policy = Some(RetentionPolicy::new( "90 days".to_string(), @@ -507,6 +518,7 @@ pub fn build_doc_mapper( dynamic_mapping: doc_mapping.dynamic_mapping.clone(), partition_key: doc_mapping.partition_key.clone(), max_num_partitions: doc_mapping.max_num_partitions, + tokenizers: doc_mapping.tokenizers.clone(), }; Ok(Arc::new(builder.try_build()?)) } @@ -539,6 +551,8 @@ mod tests { &Uri::from_well_formed("s3://defaultbucket/"), ) .unwrap(); + assert_eq!(index_config.doc_mapping.tokenizers.len(), 1); + assert_eq!(index_config.doc_mapping.tokenizers[0].name, "service_regex"); assert_eq!(index_config.doc_mapping.field_mappings.len(), 5); assert_eq!(index_config.doc_mapping.field_mappings[0].name, "tenant_id"); assert_eq!(index_config.doc_mapping.field_mappings[1].name, "timestamp"); diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs index 4c75c2a269b..5419cb157e6 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs @@ -17,18 +17,20 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use std::collections::{BTreeMap, BTreeSet}; +use std::collections::{BTreeMap, BTreeSet, HashSet}; use std::num::NonZeroU32; use anyhow::{bail, Context}; +use quickwit_query::create_default_quickwit_tokenizer_manager; use quickwit_query::query_ast::QueryAst; use serde::{Deserialize, Serialize}; use serde_json::{self, Value as JsonValue}; use tantivy::query::Query; use tantivy::schema::{Field, FieldType, Schema, Value as TantivyValue, STORED}; +use tantivy::tokenizer::TokenizerManager; use tantivy::Document; -use super::field_mapping_entry::QuickwitTextTokenizer; +use super::field_mapping_entry::RAW_TOKENIZER_NAME; use super::DefaultDocMapperBuilder; use crate::default_doc_mapper::mapping_tree::{build_mapping_tree, MappingNode}; use crate::default_doc_mapper::FieldMappingType; @@ -37,8 +39,8 @@ use crate::doc_mapper::{JsonObject, Partition}; use crate::query_builder::build_query; use crate::routing_expression::RoutingExpr; use crate::{ - Cardinality, DocMapper, DocParsingError, ModeType, QueryParserError, WarmupInfo, - DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME, + Cardinality, DocMapper, DocParsingError, ModeType, QueryParserError, TokenizerEntry, + WarmupInfo, DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME, }; /// Defines how an unmapped field should be handled. 
@@ -96,6 +98,10 @@ pub struct DefaultDocMapper { required_fields: Vec, /// Defines how unmapped fields should be handle. mode: Mode, + /// User-defined tokenizers. + tokenizer_entries: Vec, + /// Tokenizer manager. + tokenizer_manager: TokenizerManager, } impl DefaultDocMapper { @@ -164,6 +170,40 @@ impl TryFrom for DefaultDocMapper { let schema = schema_builder.build(); + let tokenizer_manager = create_default_quickwit_tokenizer_manager(); + let mut custom_tokenizer_names = HashSet::new(); + for tokenizer_config_entry in builder.tokenizers.iter() { + if custom_tokenizer_names.contains(&tokenizer_config_entry.name) { + bail!( + "Duplicated custom tokenizer: `{}`", + tokenizer_config_entry.name + ); + } + if tokenizer_manager + .get(&tokenizer_config_entry.name) + .is_some() + { + bail!( + "Custom tokenizer name `{}` should be different from built-in tokenizer's \ + names.", + tokenizer_config_entry.name + ); + } + let tokenizer = tokenizer_config_entry + .config + .text_analyzer() + .map_err(|error| { + anyhow::anyhow!( + "Failed to build tokenizer `{}`: {:?}", + tokenizer_config_entry.name, + error + ) + })?; + tokenizer_manager.register(&tokenizer_config_entry.name, tokenizer); + custom_tokenizer_names.insert(&tokenizer_config_entry.name); + } + validate_fields_tokenizers(&schema, &tokenizer_manager)?; + // Resolve default search fields let mut default_search_field_names = Vec::new(); for default_search_field_name in &builder.default_search_fields { @@ -216,6 +256,8 @@ impl TryFrom for DefaultDocMapper { partition_key, max_num_partitions: builder.max_num_partitions, mode, + tokenizer_entries: builder.tokenizers, + tokenizer_manager, }) } } @@ -235,8 +277,8 @@ fn validate_tag(tag_field_name: &str, schema: &Schema) -> Result<(), anyhow::Err FieldType::Str(options) => { let tokenizer_opt = options .get_indexing_options() - .map(|text_options| text_options.tokenizer()); - if tokenizer_opt != Some(QuickwitTextTokenizer::Raw.get_name()) { + .map(|text_options: &tantivy::schema::TextFieldIndexing| text_options.tokenizer()); + if tokenizer_opt != Some(RAW_TOKENIZER_NAME) { bail!("Tags collection is only allowed on text fields with the `raw` tokenizer."); } } @@ -266,6 +308,34 @@ fn validate_tag(tag_field_name: &str, schema: &Schema) -> Result<(), anyhow::Err Ok(()) } +/// Checks that a given text/json field name has a registered tokenizer. 
+fn validate_fields_tokenizers( + schema: &Schema, + tokenizer_manager: &TokenizerManager, +) -> Result<(), anyhow::Error> { + for (_, field_entry) in schema.fields() { + let tokenizer_name_opt = match field_entry.field_type() { + FieldType::Str(options) => options + .get_indexing_options() + .map(|text_options: &tantivy::schema::TextFieldIndexing| text_options.tokenizer()), + FieldType::JsonObject(options) => options + .get_text_indexing_options() + .map(|text_options: &tantivy::schema::TextFieldIndexing| text_options.tokenizer()), + _ => None, + }; + if let Some(tokenizer_name) = tokenizer_name_opt { + if tokenizer_manager.get(tokenizer_name).is_none() { + bail!( + "Unknown tokenizer `{}` for field `{}`.", + tokenizer_name, + field_entry.name() + ); + } + } + } + Ok(()) +} + impl From for DefaultDocMapperBuilder { fn from(default_doc_mapper: DefaultDocMapper) -> Self { let mode = default_doc_mapper.mode.mode_type(); @@ -291,6 +361,7 @@ impl From for DefaultDocMapperBuilder { dynamic_mapping, partition_key: partition_key_opt, max_num_partitions: default_doc_mapper.max_num_partitions, + tokenizers: default_doc_mapper.tokenizer_entries, } } } @@ -397,6 +468,7 @@ impl DocMapper for DefaultDocMapper { build_query( query_ast, split_schema, + self.tokenizer_manager(), &self.default_search_field_names[..], with_validation, ) @@ -421,6 +493,10 @@ impl DocMapper for DefaultDocMapper { fn max_num_partitions(&self) -> NonZeroU32 { self.max_num_partitions } + + fn tokenizer_manager(&self) -> &TokenizerManager { + &self.tokenizer_manager + } } #[cfg(test)] @@ -432,6 +508,7 @@ mod tests { use tantivy::schema::{FieldType, IndexRecordOption, Type, Value as TantivyValue}; use super::DefaultDocMapper; + use crate::default_doc_mapper::field_mapping_entry::DEFAULT_TOKENIZER_NAME; use crate::{ DefaultDocMapperBuilder, DocMapper, DocParsingError, DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME, }; @@ -1358,10 +1435,7 @@ mod tests { panic!() }; let text_indexing_options = json_options.get_text_indexing_options().unwrap(); - assert_eq!( - text_indexing_options.tokenizer(), - super::QuickwitTextTokenizer::Raw.get_name() - ); + assert_eq!(text_indexing_options.tokenizer(), super::RAW_TOKENIZER_NAME); assert_eq!( text_indexing_options.index_option(), IndexRecordOption::Basic @@ -1376,7 +1450,7 @@ mod tests { }; assert_eq!( text_options.get_indexing_options().unwrap().tokenizer(), - super::QuickwitTextTokenizer::Default.get_name() + DEFAULT_TOKENIZER_NAME ); } } @@ -1441,4 +1515,154 @@ mod tests { .find_field_mapping_type("my\\.timestamp") .unwrap(); } + + #[test] + fn test_build_doc_mapper_with_custom_ngram_tokenizer() { + let mapper = serde_json::from_str::( + r#"{ + "tokenizers": [ + { + "name": "my_tokenizer", + "filters": ["lower_caser", "ascii_folding", "remove_long"], + "type": "ngram", + "min_gram": 3, + "max_gram": 5 + } + ], + "field_mappings": [ + { + "name": "my_text", + "type": "text", + "tokenizer": "my_tokenizer" + } + ] + }"#, + ) + .unwrap(); + let field_mapping_type = mapper + .field_mappings + .find_field_mapping_type("my_text") + .unwrap(); + match &field_mapping_type { + super::FieldMappingType::Text(options, _) => { + assert!(options.tokenizer.is_some()); + let tokenizer = options.tokenizer.as_ref().unwrap(); + assert_eq!(tokenizer.name(), "my_tokenizer"); + } + _ => panic!("Expected a text field"), + } + assert!(mapper.tokenizer_manager().get("my_tokenizer").is_some()); + } + + #[test] + fn test_build_doc_mapper_should_fail_with_unknown_tokenizer() { + let mapper_builder = serde_json::from_str::( + r#"{ 
+ "field_mappings": [ + { + "name": "my_text", + "type": "text", + "tokenizer": "my_tokenizer" + } + ] + }"#, + ) + .unwrap(); + let mapper = mapper_builder.try_build(); + let error_msg = mapper.unwrap_err().to_string(); + assert!(error_msg.contains("Unknown tokenizer")); + } + + #[test] + fn test_build_doc_mapper_tokenizer_manager_with_custom_tokenizer() { + let mapper = serde_json::from_str::( + r#"{ + "tokenizers": [ + { + "name": "my_tokenizer", + "filters": ["lower_caser"], + "type": "ngram", + "min_gram": 3, + "max_gram": 5 + } + ], + "field_mappings": [ + { + "name": "my_text", + "type": "text", + "tokenizer": "my_tokenizer" + } + ] + }"#, + ) + .unwrap(); + let mut tokenizer = mapper.tokenizer_manager().get("my_tokenizer").unwrap(); + let mut token_stream = tokenizer.token_stream("HELLO WORLD"); + assert_eq!(token_stream.next().unwrap().text, "hel"); + assert_eq!(token_stream.next().unwrap().text, "hell"); + assert_eq!(token_stream.next().unwrap().text, "hello"); + } + + #[test] + fn test_build_doc_mapper_with_custom_invalid_regex_tokenizer() { + let mapper_builder = serde_json::from_str::( + r#"{ + "tokenizers": [ + { + "name": "my_tokenizer", + "type": "regex", + "pattern": "(my_pattern" + } + ], + "field_mappings": [ + { + "name": "my_text", + "type": "text", + "tokenizer": "my_tokenizer" + } + ] + }"#, + ) + .unwrap(); + let mapper = mapper_builder.try_build(); + assert!(mapper.is_err()); + let error_mesg = mapper.unwrap_err().to_string(); + assert!(error_mesg.contains("Invalid regex tokenizer")); + } + + #[test] + fn test_doc_mapper_with_custom_tokenizer_equivalent_to_default() { + let mapper = serde_json::from_str::( + r#"{ + "tokenizers": [ + { + "name": "my_tokenizer", + "filters": ["remove_long", "lower_caser"], + "type": "simple", + "min_gram": 3, + "max_gram": 5 + } + ], + "field_mappings": [ + { + "name": "my_text", + "type": "text", + "tokenizer": "my_tokenizer" + } + ] + }"#, + ) + .unwrap(); + let mut default_tokenizer = mapper.tokenizer_manager().get("default").unwrap(); + let mut tokenizer = mapper.tokenizer_manager().get("my_tokenizer").unwrap(); + let text = "I've seen things... seen things you little people wouldn't believe."; + let mut default_token_stream = default_tokenizer.token_stream(text); + let mut token_stream = tokenizer.token_stream(text); + for _ in 0..10 { + assert_eq!( + default_token_stream.next().unwrap().text, + token_stream.next().unwrap().text + ); + } + } } diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs index 609a93f7f2f..0cde2fe48a2 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs @@ -22,6 +22,7 @@ use std::num::NonZeroU32; use anyhow::bail; use serde::{Deserialize, Serialize}; +use super::tokenizer_entry::TokenizerEntry; use super::FieldMappingEntry; use crate::default_doc_mapper::default_mapper::Mode; use crate::default_doc_mapper::QuickwitJsonOptions; @@ -66,6 +67,9 @@ pub struct DefaultDocMapperBuilder { /// how the unmapped fields should be handled. #[serde(default)] pub dynamic_mapping: Option, + /// User-defined tokenizers. + #[serde(default)] + pub tokenizers: Vec, } /// `Mode` describing how the unmapped field should be handled. 
diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs index 21d921c18b9..61d63178600 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs @@ -17,6 +17,7 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . +use std::borrow::Cow; use std::convert::TryFrom; use anyhow::bail; @@ -211,26 +212,28 @@ impl Default for QuickwitIpAddrOptions { } } -#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize, utoipa::ToSchema)] -pub enum QuickwitTextTokenizer { - #[serde(rename = "raw")] - Raw, - #[serde(rename = "default")] - Default, - #[serde(rename = "en_stem")] - StemEn, - #[serde(rename = "chinese_compatible")] - Chinese, +#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize, utoipa::ToSchema)] +pub struct QuickwitTextTokenizer(Cow<'static, str>); + +pub(crate) const DEFAULT_TOKENIZER_NAME: &str = "default"; + +pub(crate) const RAW_TOKENIZER_NAME: &str = "raw"; + +impl Default for QuickwitTextTokenizer { + fn default() -> Self { + Self::from_static(DEFAULT_TOKENIZER_NAME) + } } impl QuickwitTextTokenizer { - pub fn get_name(&self) -> &str { - match self { - QuickwitTextTokenizer::Raw => "raw", - QuickwitTextTokenizer::Default => "default", - QuickwitTextTokenizer::StemEn => "en_stem", - QuickwitTextTokenizer::Chinese => "chinese_compatible", - } + pub const fn from_static(name: &'static str) -> Self { + Self(Cow::Borrowed(name)) + } + pub(crate) fn name(&self) -> &str { + &self.0 + } + pub fn raw() -> Self { + Self::from_static(RAW_TOKENIZER_NAME) } } @@ -322,11 +325,11 @@ impl From for TextOptions { .unwrap_or(IndexRecordOption::Basic); let tokenizer = quickwit_text_options .tokenizer - .unwrap_or(QuickwitTextTokenizer::Default); + .unwrap_or(QuickwitTextTokenizer::default()); let text_field_indexing = TextFieldIndexing::default() .set_index_option(index_record_option) .set_fieldnorms(quickwit_text_options.fieldnorms) - .set_tokenizer(tokenizer.get_name()); + .set_tokenizer(tokenizer.name()); text_options = text_options.set_indexing_options(text_field_indexing); } @@ -414,9 +417,9 @@ impl From for JsonObjectOptions { .unwrap_or(IndexRecordOption::Basic); let tokenizer = quickwit_json_options .tokenizer - .unwrap_or(QuickwitTextTokenizer::Raw); + .unwrap_or(QuickwitTextTokenizer::raw()); let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer(tokenizer.get_name()) + .set_tokenizer(tokenizer.name()) .set_index_option(index_record_option); json_options = json_options.set_indexing_options(text_field_indexing); } @@ -728,8 +731,7 @@ mod tests { "name": "my_field_name", "type": "text", "stored": true, - "record": "basic", - "tokenizer": "notexist" + "record": "notexist" } "#, ); @@ -737,7 +739,7 @@ mod tests { assert_eq!( mapping_entry.unwrap_err().to_string(), "Error while parsing field `my_field_name`: unknown variant `notexist`, expected one \ - of `raw`, `default`, `en_stem`, `chinese_compatible`" + of `basic`, `freq`, `position`" .to_string() ); Ok(()) @@ -780,7 +782,7 @@ mod tests { FieldMappingType::Text(options, _) => { assert_eq!(options.stored, true); assert_eq!(options.indexed, true); - assert_eq!(options.tokenizer.unwrap().get_name(), "en_stem"); + assert_eq!(options.tokenizer.unwrap().name(), "en_stem"); assert_eq!(options.record.unwrap(), IndexRecordOption::Basic); 
} _ => panic!("wrong property type"), @@ -1446,7 +1448,7 @@ mod tests { let expected_json_options = QuickwitJsonOptions { description: None, indexed: true, - tokenizer: Some(QuickwitTextTokenizer::Raw), + tokenizer: Some(QuickwitTextTokenizer::raw()), record: None, stored: false, expand_dots: true, diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs index 52a01b21f61..77aa2a508ba 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs @@ -23,6 +23,7 @@ mod default_mapper_builder; mod field_mapping_entry; mod field_mapping_type; mod mapping_tree; +mod tokenizer_entry; use anyhow::bail; use once_cell::sync::Lazy; @@ -38,6 +39,10 @@ pub(crate) use self::field_mapping_entry::{ FieldMappingEntryForSerialization, IndexRecordOptionSchema, QuickwitTextTokenizer, }; pub(crate) use self::field_mapping_type::FieldMappingType; +pub use self::tokenizer_entry::{analyze_text, TokenizerConfig, TokenizerEntry}; +pub(crate) use self::tokenizer_entry::{ + NgramTokenizerOption, RegexTokenizerOption, TokenFilterType, TokenizerType, +}; use crate::QW_RESERVED_FIELD_NAMES; /// Regular expression validating a field mapping name. diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs new file mode 100644 index 00000000000..14b5aa7eea4 --- /dev/null +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs @@ -0,0 +1,238 @@ +// Copyright (C) 2023 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +use anyhow::Context; +use quickwit_query::DEFAULT_REMOVE_TOKEN_LENGTH; +use serde::{Deserialize, Serialize}; +use tantivy::tokenizer::{ + AsciiFoldingFilter, LowerCaser, NgramTokenizer, RegexTokenizer, RemoveLongFilter, + SimpleTokenizer, TextAnalyzer, Token, +}; + +/// A `TokenizerEntry` defines a custom tokenizer with its name and configuration. +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)] +pub struct TokenizerEntry { + /// Tokenizer name. + pub name: String, + /// Tokenizer configuration. + #[serde(flatten)] + pub(crate) config: TokenizerConfig, +} + +/// Tokenizer configuration. +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)] +pub struct TokenizerConfig { + #[serde(flatten)] + tokenizer_type: TokenizerType, + #[serde(default)] + filters: Vec, +} + +impl TokenizerConfig { + /// Build a `TextAnalyzer` from a `TokenizerConfig`. 
+ pub fn text_analyzer(&self) -> anyhow::Result { + let mut text_analyzer_builder = match &self.tokenizer_type { + TokenizerType::Simple => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(), + TokenizerType::Ngram(options) => { + let tokenizer = + NgramTokenizer::new(options.min_gram, options.max_gram, options.prefix_only) + .with_context(|| "Invalid ngram tokenizer".to_string())?; + TextAnalyzer::builder(tokenizer).dynamic() + } + TokenizerType::Regex(options) => { + let tokenizer = RegexTokenizer::new(&options.pattern) + .with_context(|| "Invalid regex tokenizer".to_string())?; + TextAnalyzer::builder(tokenizer).dynamic() + } + }; + for filter in &self.filters { + match filter.tantivy_token_filter_enum() { + TantivyTokenFilterEnum::RemoveLong(token_filter) => { + text_analyzer_builder = text_analyzer_builder.filter_dynamic(token_filter); + } + TantivyTokenFilterEnum::LowerCaser(token_filter) => { + text_analyzer_builder = text_analyzer_builder.filter_dynamic(token_filter); + } + TantivyTokenFilterEnum::AsciiFolding(token_filter) => { + text_analyzer_builder = text_analyzer_builder.filter_dynamic(token_filter); + } + } + } + Ok(text_analyzer_builder.build()) + } +} + +/// Helper function to analyze a text with a given `TokenizerConfig`. +pub fn analyze_text(text: &str, tokenizer: &TokenizerConfig) -> anyhow::Result> { + let mut text_analyzer = tokenizer.text_analyzer()?; + let mut token_stream = text_analyzer.token_stream(text); + let mut tokens = Vec::new(); + token_stream.process(&mut |token| { + tokens.push(token.clone()); + }); + Ok(tokens) +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, utoipa::ToSchema)] +#[serde(rename_all = "snake_case")] +pub enum TokenFilterType { + RemoveLong, + LowerCaser, + AsciiFolding, +} + +/// Tantivy token filter enum to build +/// a `TextAnalyzer` with dynamic token filters. 
+enum TantivyTokenFilterEnum { + RemoveLong(RemoveLongFilter), + LowerCaser(LowerCaser), + AsciiFolding(AsciiFoldingFilter), +} + +impl TokenFilterType { + fn tantivy_token_filter_enum(&self) -> TantivyTokenFilterEnum { + match &self { + Self::RemoveLong => TantivyTokenFilterEnum::RemoveLong(RemoveLongFilter::limit( + DEFAULT_REMOVE_TOKEN_LENGTH, + )), + Self::LowerCaser => TantivyTokenFilterEnum::LowerCaser(LowerCaser), + Self::AsciiFolding => TantivyTokenFilterEnum::AsciiFolding(AsciiFoldingFilter), + } + } +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, utoipa::ToSchema)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum TokenizerType { + Simple, + Ngram(NgramTokenizerOption), + Regex(RegexTokenizerOption), +} + +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)] +#[serde(deny_unknown_fields)] +pub struct NgramTokenizerOption { + pub min_gram: usize, + pub max_gram: usize, + #[serde(default)] + pub prefix_only: bool, +} + +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)] +#[serde(deny_unknown_fields)] +pub struct RegexTokenizerOption { + pub pattern: String, +} + +#[cfg(test)] +mod tests { + use super::{NgramTokenizerOption, TokenizerType}; + use crate::default_doc_mapper::RegexTokenizerOption; + use crate::TokenizerEntry; + + #[test] + fn test_deserialize_tokenizer_entry() { + let result: Result = + serde_json::from_str::( + r#" + { + "name": "my_tokenizer", + "type": "ngram", + "min_gram": 1, + "max_gram": 3, + "filters": [ + "remove_long", + "lower_caser", + "ascii_folding" + ] + } + "#, + ); + assert!(result.is_ok()); + let tokenizer_config_entry = result.unwrap(); + assert_eq!(tokenizer_config_entry.config.filters.len(), 3); + match tokenizer_config_entry.config.tokenizer_type { + TokenizerType::Ngram(options) => { + assert_eq!( + options, + NgramTokenizerOption { + min_gram: 1, + max_gram: 3, + prefix_only: false, + } + ) + } + _ => panic!("Unexpected tokenizer type"), + } + } + + #[test] + fn test_deserialize_tokenizer_entry_failed_with_wrong_key() { + let result: Result = + serde_json::from_str::( + r#" + { + "name": "my_tokenizer", + "type": "ngram", + "min_gram": 1, + "max_gram": 3, + "filters": [ + "remove_long", + "lower_caser", + "ascii_folding" + ], + "abc": 123 + } + "#, + ); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("unknown field `abc`")); + } + + #[test] + fn test_tokenizer_entry_regex() { + let result: Result = + serde_json::from_str::( + r#" + { + "name": "my_tokenizer", + "type": "regex", + "pattern": "(my_pattern)" + } + "#, + ); + assert!(result.is_ok()); + let tokenizer_config_entry = result.unwrap(); + assert_eq!(tokenizer_config_entry.config.filters.len(), 0); + match tokenizer_config_entry.config.tokenizer_type { + TokenizerType::Regex(options) => { + assert_eq!( + options, + RegexTokenizerOption { + pattern: "(my_pattern)".to_string(), + } + ) + } + _ => panic!("Unexpected tokenizer type"), + } + } +} diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs index 3743e4c7ee0..5d195555520 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs @@ -28,6 +28,7 @@ use quickwit_query::query_ast::QueryAst; use serde_json::Value as JsonValue; use tantivy::query::Query; use tantivy::schema::{Field, FieldType, Schema, Value}; +use tantivy::tokenizer::TokenizerManager; use tantivy::{Document, Term}; pub type Partition = u64; @@ -143,6 
+144,9 @@ pub trait DocMapper: Send + Sync + Debug + DynClone + 'static { /// Returns the maximum number of partitions. fn max_num_partitions(&self) -> NonZeroU32; + + /// Returns the tokenizer manager. + fn tokenizer_manager(&self) -> &TokenizerManager; } /// A struct to wrap a tantivy field with its name. diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs index a4a8bacbfaa..de7410c3435 100644 --- a/quickwit/quickwit-doc-mapper/src/lib.rs +++ b/quickwit/quickwit-doc-mapper/src/lib.rs @@ -35,11 +35,13 @@ mod routing_expression; pub mod tag_pruning; pub use default_doc_mapper::{ - DefaultDocMapper, DefaultDocMapperBuilder, FieldMappingEntry, ModeType, QuickwitJsonOptions, + analyze_text, DefaultDocMapper, DefaultDocMapperBuilder, FieldMappingEntry, ModeType, + QuickwitJsonOptions, TokenizerConfig, TokenizerEntry, }; use default_doc_mapper::{ FastFieldOptions, FieldMappingEntryForSerialization, IndexRecordOptionSchema, - QuickwitTextNormalizer, QuickwitTextTokenizer, + NgramTokenizerOption, QuickwitTextNormalizer, QuickwitTextTokenizer, RegexTokenizerOption, + TokenFilterType, TokenizerType, }; pub use doc_mapper::{DocMapper, JsonObject, NamedField, TermRange, WarmupInfo}; pub use error::{DocParsingError, QueryParserError}; @@ -61,13 +63,19 @@ pub(crate) enum Cardinality { #[derive(utoipa::OpenApi)] #[openapi(components(schemas( - QuickwitJsonOptions, FastFieldOptions, - QuickwitTextNormalizer, + FieldMappingEntryForSerialization, + IndexRecordOptionSchema, ModeType, + NgramTokenizerOption, + QuickwitJsonOptions, + QuickwitTextNormalizer, QuickwitTextTokenizer, - IndexRecordOptionSchema, - FieldMappingEntryForSerialization, + RegexTokenizerOption, + TokenFilterType, + TokenizerConfig, + TokenizerEntry, + TokenizerType, )))] /// Schema used for the OpenAPI generation which are apart of this crate. 
pub struct DocMapperApiSchemas; diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 0250f0ca5f8..e9f263b3694 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -27,6 +27,7 @@ use quickwit_query::query_ast::{ use quickwit_query::InvalidQuery; use tantivy::query::Query; use tantivy::schema::{Field, Schema}; +use tantivy::tokenizer::TokenizerManager; use tantivy::Term; use crate::{QueryParserError, TermRange, WarmupInfo}; @@ -50,6 +51,7 @@ impl<'a> QueryAstVisitor<'a> for RangeQueryFields { pub(crate) fn build_query( query_ast: &QueryAst, schema: Schema, + tokenizer_manager: &TokenizerManager, search_fields: &[String], with_validation: bool, ) -> Result<(Box, WarmupInfo), QueryParserError> { @@ -58,10 +60,16 @@ pub(crate) fn build_query( let _: Result<(), Infallible> = range_query_fields.visit(query_ast); let fast_field_names: HashSet = range_query_fields.range_query_field_names; - let query = query_ast.build_tantivy_query(&schema, search_fields, with_validation)?; + let query = query_ast.build_tantivy_query( + &schema, + tokenizer_manager, + search_fields, + with_validation, + )?; let term_set_query_fields = extract_term_set_query_fields(query_ast); - let term_ranges_grouped_by_field = extract_phrase_prefix_term_ranges(query_ast, &schema)?; + let term_ranges_grouped_by_field = + extract_phrase_prefix_term_ranges(query_ast, &schema, tokenizer_manager)?; let mut terms_grouped_by_field: HashMap> = Default::default(); query.query_terms(&mut |term, need_position| { @@ -128,13 +136,15 @@ fn prefix_term_to_range(prefix: Term) -> (Bound, Bound) { struct ExtractPhrasePrefixTermRanges<'a> { schema: &'a Schema, + tokenizer_manager: &'a TokenizerManager, term_ranges_to_warm_up: HashMap>, } impl<'a> ExtractPhrasePrefixTermRanges<'a> { - fn with_schema(schema: &'a Schema) -> Self { + fn with_schema(schema: &'a Schema, tokenizer_manager: &'a TokenizerManager) -> Self { ExtractPhrasePrefixTermRanges { schema, + tokenizer_manager, term_ranges_to_warm_up: HashMap::new(), } } @@ -147,7 +157,7 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPhrasePrefixTermRanges<'b> { &mut self, phrase_prefix: &'a PhrasePrefixQuery, ) -> Result<(), Self::Err> { - let (field, terms) = phrase_prefix.get_terms(self.schema)?; + let (field, terms) = phrase_prefix.get_terms(self.schema, self.tokenizer_manager)?; if let Some((_, term)) = terms.last() { let (start, end) = prefix_term_to_range(term.clone()); let term_range = TermRange { @@ -167,8 +177,9 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPhrasePrefixTermRanges<'b> { fn extract_phrase_prefix_term_ranges( query_ast: &QueryAst, schema: &Schema, + tokenizer_manager: &TokenizerManager, ) -> anyhow::Result>> { - let mut visitor = ExtractPhrasePrefixTermRanges::with_schema(schema); + let mut visitor = ExtractPhrasePrefixTermRanges::with_schema(schema, tokenizer_manager); visitor.visit(query_ast)?; Ok(visitor.term_ranges_to_warm_up) } @@ -176,6 +187,7 @@ fn extract_phrase_prefix_term_ranges( #[cfg(test)] mod test { use quickwit_proto::query_ast_from_user_text; + use quickwit_query::create_default_quickwit_tokenizer_manager; use tantivy::schema::{Schema, FAST, INDEXED, STORED, TEXT}; use super::build_query; @@ -234,7 +246,13 @@ mod test { .parse_user_query(&[]) .map_err(|err| err.to_string())?; let schema = make_schema(dynamic_mode); - let query_result = build_query(&query_ast, schema, &[], true); + let query_result = build_query( + 
&query_ast, + schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ); query_result .map(|query| format!("{:?}", query)) .map_err(|err| err.to_string()) @@ -508,14 +526,27 @@ mod test { .parse_user_query(&[]) .unwrap(); - let (_, warmup_info) = build_query(&query_with_set, make_schema(true), &[], true).unwrap(); + let (_, warmup_info) = build_query( + &query_with_set, + make_schema(true), + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) + .unwrap(); assert_eq!(warmup_info.term_dict_field_names.len(), 1); assert_eq!(warmup_info.posting_field_names.len(), 1); assert!(warmup_info.term_dict_field_names.contains("title")); assert!(warmup_info.posting_field_names.contains("title")); - let (_, warmup_info) = - build_query(&query_without_set, make_schema(true), &[], true).unwrap(); + let (_, warmup_info) = build_query( + &query_without_set, + make_schema(true), + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) + .unwrap(); assert!(warmup_info.term_dict_field_names.is_empty()); assert!(warmup_info.posting_field_names.is_empty()); } diff --git a/quickwit/quickwit-indexing/src/actors/indexer.rs b/quickwit/quickwit-indexing/src/actors/indexer.rs index e78963c7143..b6eea483691 100644 --- a/quickwit/quickwit-indexing/src/actors/indexer.rs +++ b/quickwit/quickwit-indexing/src/actors/indexer.rs @@ -39,10 +39,11 @@ use quickwit_config::IndexingSettings; use quickwit_doc_mapper::DocMapper; use quickwit_metastore::checkpoint::{IndexCheckpointDelta, SourceCheckpointDelta}; use quickwit_metastore::Metastore; -use quickwit_query::{get_quickwit_fastfield_normalizer_manager, get_quickwit_tokenizer_manager}; +use quickwit_query::get_quickwit_fastfield_normalizer_manager; use serde::Serialize; use tantivy::schema::Schema; use tantivy::store::{Compressor, ZstdCompressor}; +use tantivy::tokenizer::TokenizerManager; use tantivy::{DateTime, IndexBuilder, IndexSettings}; use tokio::runtime::Handle; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; @@ -83,6 +84,7 @@ struct IndexerState { indexing_settings: IndexingSettings, publish_lock: PublishLock, schema: Schema, + tokenizer_manager: TokenizerManager, max_num_partitions: NonZeroU32, index_settings: IndexSettings, cooperative_indexing_permits: Option>, @@ -98,7 +100,7 @@ impl IndexerState { let index_builder = IndexBuilder::new() .settings(self.index_settings.clone()) .schema(self.schema.clone()) - .tokenizers(get_quickwit_tokenizer_manager().clone()) + .tokenizers(self.tokenizer_manager.clone()) .fast_field_tokenizers(get_quickwit_fastfield_normalizer_manager().clone()); let io_controls = IoControls::default() @@ -466,6 +468,7 @@ impl Indexer { index_serializer_mailbox: Mailbox, ) -> Self { let schema = doc_mapper.schema(); + let tokenizer_manager = doc_mapper.tokenizer_manager().clone(); let docstore_compression = Compressor::Zstd(ZstdCompressor { compression_level: Some(indexing_settings.docstore_compression_level), }); @@ -483,6 +486,7 @@ impl Indexer { indexing_settings, publish_lock: PublishLock::default(), schema, + tokenizer_manager, index_settings, max_num_partitions: doc_mapper.max_num_partitions(), cooperative_indexing_permits, diff --git a/quickwit/quickwit-indexing/src/actors/merge_executor.rs b/quickwit/quickwit-indexing/src/actors/merge_executor.rs index 8092059444f..ae4042baa88 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_executor.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_executor.rs @@ -35,9 +35,10 @@ use quickwit_directories::UnionDirectory; use 
quickwit_doc_mapper::DocMapper; use quickwit_metastore::{Metastore, SplitMetadata}; use quickwit_proto::metastore_api::DeleteTask; +use quickwit_query::get_quickwit_fastfield_normalizer_manager; use quickwit_query::query_ast::QueryAst; -use quickwit_query::{get_quickwit_fastfield_normalizer_manager, get_quickwit_tokenizer_manager}; use tantivy::directory::{DirectoryClone, MmapDirectory, RamDirectory}; +use tantivy::tokenizer::TokenizerManager; use tantivy::{Advice, DateTime, Directory, Index, IndexMeta, SegmentId, SegmentReader}; use tokio::runtime::Handle; use tracing::{debug, info, instrument, warn}; @@ -154,13 +155,14 @@ fn combine_index_meta(mut index_metas: Vec) -> anyhow::Result], + tokenizer_manager: &TokenizerManager, ) -> anyhow::Result<(IndexMeta, Vec>)> { let mut directories: Vec> = Vec::new(); let mut index_metas = Vec::new(); for tantivy_dir in tantivy_dirs { directories.push(tantivy_dir.clone()); - let index_meta = open_index(tantivy_dir.clone())?.load_metas()?; + let index_meta = open_index(tantivy_dir.clone(), tokenizer_manager)?.load_metas()?; index_metas.push(index_meta); } let union_index_meta = combine_index_meta(index_metas)?; @@ -289,7 +291,8 @@ impl MergeExecutor { merge_scratch_directory: TempDirectory, ctx: &ActorContext, ) -> anyhow::Result { - let (union_index_meta, split_directories) = open_split_directories(&tantivy_dirs)?; + let (union_index_meta, split_directories) = + open_split_directories(&tantivy_dirs, self.doc_mapper.tokenizer_manager())?; // TODO it would be nice if tantivy could let us run the merge in the current thread. fail_point!("before-merge-split"); let controlled_directory = self @@ -306,7 +309,10 @@ impl MergeExecutor { // This will have the side effect of deleting the directory containing the downloaded // splits. - let merged_index = open_index(controlled_directory.clone())?; + let merged_index = open_index( + controlled_directory.clone(), + self.doc_mapper.tokenizer_manager(), + )?; ctx.record_progress(); let split_attrs = merge_split_attrs(merge_split_id, &self.pipeline_id, &splits); @@ -351,7 +357,8 @@ impl MergeExecutor { num_delete_tasks = delete_tasks.len() ); - let (union_index_meta, split_directories) = open_split_directories(&tantivy_dirs)?; + let (union_index_meta, split_directories) = + open_split_directories(&tantivy_dirs, self.doc_mapper.tokenizer_manager())?; let controlled_directory = self .merge_split_directories( union_index_meta, @@ -366,7 +373,7 @@ impl MergeExecutor { // This will have the side effect of deleting the directory containing the downloaded split. 
let mut merged_index = Index::open(controlled_directory.clone())?; ctx.record_progress(); - merged_index.set_tokenizers(get_quickwit_tokenizer_manager().clone()); + merged_index.set_tokenizers(self.doc_mapper.tokenizer_manager().clone()); merged_index.set_fast_field_tokenizers(get_quickwit_fastfield_normalizer_manager().clone()); ctx.record_progress(); @@ -454,7 +461,7 @@ impl MergeExecutor { ]; directory_stack.extend(split_directories.into_iter()); let union_directory = UnionDirectory::union_of(directory_stack); - let union_index = open_index(union_directory)?; + let union_index = open_index(union_directory, self.doc_mapper.tokenizer_manager())?; ctx.record_progress(); let _protect_guard = ctx.protect_zone(); @@ -510,9 +517,12 @@ impl MergeExecutor { } } -fn open_index>>(directory: T) -> tantivy::Result { +fn open_index>>( + directory: T, + tokenizer_manager: &TokenizerManager, +) -> tantivy::Result { let mut index = Index::open(directory)?; - index.set_tokenizers(get_quickwit_tokenizer_manager().clone()); + index.set_tokenizers(tokenizer_manager.clone()); index.set_fast_field_tokenizers(get_quickwit_fastfield_normalizer_manager().clone()); Ok(index) } diff --git a/quickwit/quickwit-indexing/src/actors/packager.rs b/quickwit/quickwit-indexing/src/actors/packager.rs index f4849eab5f1..3e3e1327969 100644 --- a/quickwit/quickwit-indexing/src/actors/packager.rs +++ b/quickwit/quickwit-indexing/src/actors/packager.rs @@ -363,7 +363,7 @@ mod tests { schema_builder.add_bool_field("tag_bool", NumericOptions::default().set_indexed()); let schema = schema_builder.build(); let mut index = Index::create_in_dir(split_scratch_directory.path(), schema)?; - index.set_tokenizers(quickwit_query::get_quickwit_tokenizer_manager().clone()); + index.set_tokenizers(quickwit_query::create_default_quickwit_tokenizer_manager()); index.set_fast_field_tokenizers( quickwit_query::get_quickwit_fastfield_normalizer_manager().clone(), ); diff --git a/quickwit/quickwit-metastore/src/backward_compatibility_tests/mod.rs b/quickwit/quickwit-metastore/src/backward_compatibility_tests/mod.rs index 7eaf9c6ee12..edaf21021af 100644 --- a/quickwit/quickwit-metastore/src/backward_compatibility_tests/mod.rs +++ b/quickwit/quickwit-metastore/src/backward_compatibility_tests/mod.rs @@ -69,17 +69,19 @@ where for<'a> T: Deserialize<'a> { } fn test_backward_compatibility_single_case(path: &Path) -> anyhow::Result<()> -where T: TestableForRegression { +where T: TestableForRegression + std::fmt::Debug { println!("---\nTest deserialization of {}", path.display()); let deserialized: T = deserialize_json_file(path)?; let expected_path = path.to_string_lossy().replace(".json", ".expected.json"); let expected: T = deserialize_json_file(Path::new(&expected_path))?; + println!("---\nTest equality of {:?}", expected); + println!("---\nwith {:?}", deserialized); deserialized.test_equality(&expected); Ok(()) } fn test_backward_compatibility(test_dir: &Path) -> anyhow::Result<()> -where T: TestableForRegression { +where T: TestableForRegression + std::fmt::Debug { for entry in fs::read_dir(test_dir).with_context(|| format!("Failed to read {}", test_dir.display()))? 
{ @@ -95,7 +97,7 @@ where T: TestableForRegression { } fn test_and_update_expected_files_single_case(expected_path: &Path) -> anyhow::Result -where for<'a> T: Serialize + Deserialize<'a> { +where for<'a> T: std::fmt::Debug + Serialize + Deserialize<'a> { let expected: T = deserialize_json_file(Path::new(&expected_path))?; let expected_old_json_value: JsonValue = deserialize_json_file(Path::new(&expected_path))?; let expected_new_json_value: JsonValue = serde_json::to_value(&expected)?; @@ -104,6 +106,9 @@ where for<'a> T: Serialize + Deserialize<'a> { // No modification return Ok(false); } + println!("---\nTest deserialization of {}", expected_path.display()); + println!("---\nexpected {:?}", expected); + println!("---\nwith {:?}", expected_new_json_value); let mut expected_new_json = serde_json::to_string_pretty(&expected_new_json_value)?; expected_new_json.push('\n'); std::fs::write(expected_path, expected_new_json.as_bytes())?; @@ -111,7 +116,7 @@ where for<'a> T: Serialize + Deserialize<'a> { } fn test_and_update_expected_files(test_dir: &Path) -> anyhow::Result<()> -where for<'a> T: Deserialize<'a> + Serialize { +where for<'a> T: std::fmt::Debug + Deserialize<'a> + Serialize { let mut updated_expected_files = Vec::new(); for entry in fs::read_dir(test_dir)? { let entry = entry?; @@ -189,7 +194,7 @@ where for<'a> T: Serialize { /// - `test` is a function asserting the equality of the deserialized version /// and the expected version. pub(crate) fn test_json_backward_compatibility_helper(test_name: &str) -> anyhow::Result<()> -where T: TestableForRegression { +where T: TestableForRegression + std::fmt::Debug { let sample_instance: T = T::sample_for_regression(); let test_dir = Path::new("test-data").join(test_name); test_global_version(&sample_instance).context("Version is not the global version.")?; diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.4.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.4.expected.json index 0fee081d4fc..d8ce831f14f 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.4.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.4.expected.json @@ -67,7 +67,8 @@ "log_level", "tenant_id" ], - "timestamp_field": "timestamp" + "timestamp_field": "timestamp", + "tokenizers": [] }, "index_id": "my-index", "index_uri": "s3://quickwit-indexes/my-index", diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.5.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.5.expected.json index 9458dab0a5d..dc755b90f0c 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.5.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.5.expected.json @@ -67,7 +67,8 @@ "log_level", "tenant_id" ], - "timestamp_field": "timestamp" + "timestamp_field": "timestamp", + "tokenizers": [] }, "index_id": "my-index", "index_uri": "s3://quickwit-indexes/my-index", diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.expected.json index 4946958b31f..24533065e2d 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.expected.json @@ -67,7 +67,15 @@ "log_level", "tenant_id" ], - "timestamp_field": "timestamp" + "timestamp_field": "timestamp", + "tokenizers": [ + { + "filters": [], + "name": "custom_tokenizer", + "pattern": 
"[^\\p{L}\\p{N}]+", + "type": "regex" + } + ] }, "index_id": "my-index", "index_uri": "s3://quickwit-indexes/my-index", diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.json index 4946958b31f..24533065e2d 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.json @@ -67,7 +67,15 @@ "log_level", "tenant_id" ], - "timestamp_field": "timestamp" + "timestamp_field": "timestamp", + "tokenizers": [ + { + "filters": [], + "name": "custom_tokenizer", + "pattern": "[^\\p{L}\\p{N}]+", + "type": "regex" + } + ] }, "index_id": "my-index", "index_uri": "s3://quickwit-indexes/my-index", diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.4.expected.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.4.expected.json index 6ac7c0c2761..c4bd9f937e9 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.4.expected.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.4.expected.json @@ -56,7 +56,8 @@ "log_level", "tenant_id" ], - "timestamp_field": "timestamp" + "timestamp_field": "timestamp", + "tokenizers": [] }, "index_id": "my-index", "index_uri": "s3://quickwit-indexes/my-index", diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.5.expected.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.5.expected.json index 6ac7c0c2761..c4bd9f937e9 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.5.expected.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.5.expected.json @@ -56,7 +56,8 @@ "log_level", "tenant_id" ], - "timestamp_field": "timestamp" + "timestamp_field": "timestamp", + "tokenizers": [] }, "index_id": "my-index", "index_uri": "s3://quickwit-indexes/my-index", diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.expected.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.expected.json index 73dde1cd200..b865fcb421f 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.expected.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.expected.json @@ -56,7 +56,15 @@ "log_level", "tenant_id" ], - "timestamp_field": "timestamp" + "timestamp_field": "timestamp", + "tokenizers": [ + { + "filters": [], + "name": "custom_tokenizer", + "pattern": "[^\\p{L}\\p{N}]+", + "type": "regex" + } + ] }, "index_id": "my-index", "index_uri": "s3://quickwit-indexes/my-index", diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.json index 73dde1cd200..b865fcb421f 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.json @@ -56,7 +56,15 @@ "log_level", "tenant_id" ], - "timestamp_field": "timestamp" + "timestamp_field": "timestamp", + "tokenizers": [ + { + "filters": [], + "name": "custom_tokenizer", + "pattern": "[^\\p{L}\\p{N}]+", + "type": "regex" + } + ] }, "index_id": "my-index", "index_uri": "s3://quickwit-indexes/my-index", diff --git a/quickwit/quickwit-query/src/lib.rs b/quickwit/quickwit-query/src/lib.rs index a2a1e5e295c..d76fb425818 100644 --- a/quickwit/quickwit-query/src/lib.rs +++ b/quickwit/quickwit-query/src/lib.rs @@ -43,7 +43,10 @@ pub(crate) use not_nan_f32::NotNaNf32; pub use query_ast::utils::find_field_or_hit_dynamic; use serde::{Deserialize, Serialize}; pub use 
tantivy::query::Query as TantivyQuery; -pub use tokenizers::{get_quickwit_fastfield_normalizer_manager, get_quickwit_tokenizer_manager}; +pub use tokenizers::{ + create_default_quickwit_tokenizer_manager, get_quickwit_fastfield_normalizer_manager, + DEFAULT_REMOVE_TOKEN_LENGTH, +}; #[derive(Serialize, Deserialize, Debug, Default, Copy, Clone, Eq, PartialEq)] pub enum BooleanOperand { diff --git a/quickwit/quickwit-query/src/query_ast/bool_query.rs b/quickwit/quickwit-query/src/query_ast/bool_query.rs index ed79344fd81..30ec9cac473 100644 --- a/quickwit/quickwit-query/src/query_ast/bool_query.rs +++ b/quickwit/quickwit-query/src/query_ast/bool_query.rs @@ -19,6 +19,7 @@ use serde::{Deserialize, Serialize}; use tantivy::schema::Schema as TantivySchema; +use tantivy::tokenizer::TokenizerManager; use super::{BuildTantivyAst, TantivyQueryAst}; use crate::query_ast::QueryAst; @@ -59,27 +60,45 @@ impl BuildTantivyAst for BoolQuery { fn build_tantivy_ast_impl( &self, schema: &TantivySchema, + tokenizer_manager: &TokenizerManager, search_fields: &[String], with_validation: bool, ) -> Result { let mut boolean_query = super::tantivy_query_ast::TantivyBoolQuery::default(); for must in &self.must { - let must_leaf = must.build_tantivy_ast_call(schema, search_fields, with_validation)?; + let must_leaf = must.build_tantivy_ast_call( + schema, + tokenizer_manager, + search_fields, + with_validation, + )?; boolean_query.must.push(must_leaf); } for must_not in &self.must_not { - let must_not_leaf = - must_not.build_tantivy_ast_call(schema, search_fields, with_validation)?; + let must_not_leaf = must_not.build_tantivy_ast_call( + schema, + tokenizer_manager, + search_fields, + with_validation, + )?; boolean_query.must_not.push(must_not_leaf); } for should in &self.should { - let should_leaf = - should.build_tantivy_ast_call(schema, search_fields, with_validation)?; + let should_leaf = should.build_tantivy_ast_call( + schema, + tokenizer_manager, + search_fields, + with_validation, + )?; boolean_query.should.push(should_leaf); } for filter in &self.filter { - let filter_leaf = - filter.build_tantivy_ast_call(schema, search_fields, with_validation)?; + let filter_leaf = filter.build_tantivy_ast_call( + schema, + tokenizer_manager, + search_fields, + with_validation, + )?; boolean_query.filter.push(filter_leaf); } Ok(TantivyQueryAst::Bool(boolean_query)) diff --git a/quickwit/quickwit-query/src/query_ast/full_text_query.rs b/quickwit/quickwit-query/src/query_ast/full_text_query.rs index 4b43b1891f5..5703a0e458b 100644 --- a/quickwit/quickwit-query/src/query_ast/full_text_query.rs +++ b/quickwit/quickwit-query/src/query_ast/full_text_query.rs @@ -24,13 +24,13 @@ use tantivy::query::{PhraseQuery as TantivyPhraseQuery, TermQuery as TantivyTerm use tantivy::schema::{ Field, IndexRecordOption, JsonObjectOptions, Schema as TantivySchema, TextFieldIndexing, }; -use tantivy::tokenizer::{BoxTokenStream, TextAnalyzer}; +use tantivy::tokenizer::{TextAnalyzer, TokenStream, TokenizerManager}; use tantivy::Term; use crate::query_ast::tantivy_query_ast::{TantivyBoolQuery, TantivyQueryAst}; use crate::query_ast::utils::full_text_query; use crate::query_ast::{BuildTantivyAst, QueryAst}; -use crate::{get_quickwit_tokenizer_manager, BooleanOperand, InvalidQuery, MatchAllOrNone}; +use crate::{BooleanOperand, InvalidQuery, MatchAllOrNone}; #[derive(Serialize, Deserialize, Debug, Eq, PartialEq, Clone)] #[serde(deny_unknown_fields)] @@ -48,12 +48,13 @@ impl FullTextParams { fn text_analyzer( &self, text_field_indexing: 
&TextFieldIndexing, + tokenizer_manager: &TokenizerManager, ) -> anyhow::Result { let tokenizer_name: &str = self .tokenizer .as_deref() .unwrap_or(text_field_indexing.tokenizer()); - get_quickwit_tokenizer_manager() + tokenizer_manager .get(tokenizer_name) .with_context(|| format!("No tokenizer named `{}` is registered.", tokenizer_name)) } @@ -64,12 +65,14 @@ impl FullTextParams { json_path: &str, text: &str, json_options: &JsonObjectOptions, + tokenizer_manager: &TokenizerManager, ) -> anyhow::Result> { let text_indexing_options = json_options .get_text_indexing_options() .with_context(|| format!("Json field text `{}` is not indexed", json_path))?; - let mut text_analyzer: TextAnalyzer = self.text_analyzer(text_indexing_options)?; - let mut token_stream: BoxTokenStream = text_analyzer.token_stream(text); + let mut text_analyzer: TextAnalyzer = + self.text_analyzer(text_indexing_options, tokenizer_manager)?; + let mut token_stream = text_analyzer.token_stream(text); let mut tokens = Vec::new(); let mut term = Term::with_capacity(100); let mut json_term_writer = JsonTermWriter::from_field_and_json_path( @@ -90,9 +93,11 @@ impl FullTextParams { field: Field, text: &str, text_field_indexing: &TextFieldIndexing, + tokenizer_manager: &TokenizerManager, ) -> anyhow::Result> { - let mut text_analyzer: TextAnalyzer = self.text_analyzer(text_field_indexing)?; - let mut token_stream: BoxTokenStream = text_analyzer.token_stream(text); + let mut text_analyzer: TextAnalyzer = + self.text_analyzer(text_field_indexing, tokenizer_manager)?; + let mut token_stream = text_analyzer.token_stream(text); let mut tokens = Vec::new(); token_stream.process(&mut |token| { let term: Term = Term::from_field_text(field, &token.text); @@ -205,10 +210,17 @@ impl BuildTantivyAst for FullTextQuery { fn build_tantivy_ast_impl( &self, schema: &TantivySchema, + tokenizer_manager: &TokenizerManager, _search_fields: &[String], _with_validation: bool, ) -> Result { - full_text_query(&self.field, &self.text, &self.params, schema) + full_text_query( + &self.field, + &self.text, + &self.params, + schema, + tokenizer_manager, + ) } } @@ -218,7 +230,7 @@ mod tests { use crate::query_ast::tantivy_query_ast::TantivyQueryAst; use crate::query_ast::{BuildTantivyAst, FullTextMode, FullTextQuery}; - use crate::BooleanOperand; + use crate::{create_default_quickwit_tokenizer_manager, BooleanOperand}; #[test] fn test_zero_terms() { @@ -235,7 +247,12 @@ mod tests { schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); let ast: TantivyQueryAst = full_text_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap(); assert_eq!(ast.const_predicate(), Some(crate::MatchAllOrNone::MatchAll)); } @@ -255,7 +272,12 @@ mod tests { schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); let ast: TantivyQueryAst = full_text_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap(); let leaf = ast.as_leaf().unwrap(); assert_eq!( @@ -280,7 +302,12 @@ mod tests { schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); let ast: TantivyQueryAst = full_text_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap(); let leaf = ast.as_leaf().unwrap(); assert_eq!( 
@@ -304,7 +331,12 @@ mod tests { schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); let ast: TantivyQueryAst = full_text_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap(); let bool_query = ast.as_bool_query().unwrap(); assert_eq!(bool_query.must.len(), 2); diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index adfd5783e2b..befc679892c 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -20,6 +20,7 @@ use serde::{Deserialize, Serialize}; use tantivy::query::BoostQuery as TantivyBoostQuery; use tantivy::schema::Schema as TantivySchema; +use tantivy::tokenizer::TokenizerManager; mod bool_query; mod full_text_query; @@ -141,6 +142,7 @@ trait BuildTantivyAst { fn build_tantivy_ast_impl( &self, schema: &TantivySchema, + tokenizer_manager: &TokenizerManager, search_fields: &[String], with_validation: bool, ) -> Result; @@ -149,10 +151,12 @@ trait BuildTantivyAst { fn build_tantivy_ast_call( &self, schema: &TantivySchema, + tokenizer_manager: &TokenizerManager, search_fields: &[String], with_validation: bool, ) -> Result { - let tantivy_ast_res = self.build_tantivy_ast_impl(schema, search_fields, with_validation); + let tantivy_ast_res = + self.build_tantivy_ast_impl(schema, tokenizer_manager, search_fields, with_validation); if !with_validation && tantivy_ast_res.is_err() { return match tantivy_ast_res { res @ Ok(_) | res @ Err(InvalidQuery::UserQueryNotParsed) => res, @@ -167,39 +171,61 @@ impl BuildTantivyAst for QueryAst { fn build_tantivy_ast_impl( &self, schema: &TantivySchema, + tokenizer_manager: &TokenizerManager, search_fields: &[String], with_validation: bool, ) -> Result { match self { - QueryAst::Bool(bool_query) => { - bool_query.build_tantivy_ast_call(schema, search_fields, with_validation) - } - QueryAst::Term(term_query) => { - term_query.build_tantivy_ast_call(schema, search_fields, with_validation) - } - QueryAst::Range(range_query) => { - range_query.build_tantivy_ast_call(schema, search_fields, with_validation) - } + QueryAst::Bool(bool_query) => bool_query.build_tantivy_ast_call( + schema, + tokenizer_manager, + search_fields, + with_validation, + ), + QueryAst::Term(term_query) => term_query.build_tantivy_ast_call( + schema, + tokenizer_manager, + search_fields, + with_validation, + ), + QueryAst::Range(range_query) => range_query.build_tantivy_ast_call( + schema, + tokenizer_manager, + search_fields, + with_validation, + ), QueryAst::MatchAll => Ok(TantivyQueryAst::match_all()), QueryAst::MatchNone => Ok(TantivyQueryAst::match_none()), QueryAst::Boost { boost, underlying } => { - let underlying = - underlying.build_tantivy_ast_call(schema, search_fields, with_validation)?; + let underlying = underlying.build_tantivy_ast_call( + schema, + tokenizer_manager, + search_fields, + with_validation, + )?; let boost_query = TantivyBoostQuery::new(underlying.into(), (*boost).into()); Ok(boost_query.into()) } - QueryAst::TermSet(term_set) => { - term_set.build_tantivy_ast_call(schema, search_fields, with_validation) - } - QueryAst::FullText(full_text_query) => { - full_text_query.build_tantivy_ast_call(schema, search_fields, with_validation) - } - QueryAst::PhrasePrefix(phrase_prefix_query) => { - phrase_prefix_query.build_tantivy_ast_call(schema, search_fields, with_validation) - } - QueryAst::UserInput(user_text_query) => { 
- user_text_query.build_tantivy_ast_call(schema, search_fields, with_validation) - } + QueryAst::TermSet(term_set) => term_set.build_tantivy_ast_call( + schema, + tokenizer_manager, + search_fields, + with_validation, + ), + QueryAst::FullText(full_text_query) => full_text_query.build_tantivy_ast_call( + schema, + tokenizer_manager, + search_fields, + with_validation, + ), + QueryAst::PhrasePrefix(phrase_prefix_query) => phrase_prefix_query + .build_tantivy_ast_call(schema, tokenizer_manager, search_fields, with_validation), + QueryAst::UserInput(user_text_query) => user_text_query.build_tantivy_ast_call( + schema, + tokenizer_manager, + search_fields, + with_validation, + ), } } } @@ -208,11 +234,12 @@ impl QueryAst { pub fn build_tantivy_query( &self, schema: &TantivySchema, + tokenizer_manager: &TokenizerManager, search_fields: &[String], with_validation: bool, ) -> Result, InvalidQuery> { let tantivy_query_ast = - self.build_tantivy_ast_call(schema, search_fields, with_validation)?; + self.build_tantivy_ast_call(schema, tokenizer_manager, search_fields, with_validation)?; Ok(tantivy_query_ast.simplify().into()) } } @@ -230,7 +257,7 @@ fn parse_user_query_in_asts( mod tests { use crate::query_ast::tantivy_query_ast::TantivyQueryAst; use crate::query_ast::{BoolQuery, BuildTantivyAst, QueryAst, UserInputQuery}; - use crate::InvalidQuery; + use crate::{create_default_quickwit_tokenizer_manager, InvalidQuery}; #[test] fn test_user_query_not_parsed() { @@ -242,7 +269,12 @@ mod tests { .into(); let schema = tantivy::schema::Schema::builder().build(); let build_tantivy_ast_err: InvalidQuery = query_ast - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap_err(); assert!(matches!( build_tantivy_ast_err, @@ -261,7 +293,12 @@ mod tests { let query_ast_with_parsed_user_query: QueryAst = query_ast.parse_user_query(&[]).unwrap(); let schema = tantivy::schema::Schema::builder().build(); let tantivy_query_ast = query_ast_with_parsed_user_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap(); assert_eq!(&tantivy_query_ast, &TantivyQueryAst::match_all(),); } @@ -283,7 +320,12 @@ mod tests { bool_query_ast.parse_user_query(&[]).unwrap(); let schema = tantivy::schema::Schema::builder().build(); let tantivy_query_ast = query_ast_with_parsed_user_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap(); let tantivy_query_ast_simplified = tantivy_query_ast.simplify(); // This does not get more simplified than this, because we need the boost 0 score. 
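
The mod.rs hunks above make the tokenizer manager an explicit argument of `build_tantivy_ast_call` and `build_tantivy_query`, replacing the former `get_quickwit_tokenizer_manager()` global. A minimal sketch of the resulting call pattern, assuming `QueryAst`, `TermQuery` and `InvalidQuery` are exported roughly as they are used in the crate's own tests; the schema and term values are illustrative only, not part of this patch:

    use quickwit_query::create_default_quickwit_tokenizer_manager;
    use quickwit_query::query_ast::{QueryAst, TermQuery};
    use tantivy::schema::{Schema, TEXT};

    fn build_query_with_explicit_tokenizers() -> Result<(), quickwit_query::InvalidQuery> {
        // Minimal schema with one indexed text field, mirroring the tests above.
        let mut schema_builder = Schema::builder();
        schema_builder.add_text_field("body", TEXT);
        let schema = schema_builder.build();

        // After this patch the tokenizer manager is passed explicitly instead of
        // being fetched from a process-wide singleton.
        let tokenizer_manager = create_default_quickwit_tokenizer_manager();
        let query_ast = QueryAst::Term(TermQuery {
            field: "body".to_string(),
            value: "hello".to_string(),
        });
        let _tantivy_query =
            query_ast.build_tantivy_query(&schema, &tokenizer_manager, &[], true)?;
        Ok(())
    }

In practice the searcher passes `doc_mapper.tokenizer_manager()` here (as the quickwit-search hunks below do), so indexes with custom ngram or regex tokenizers resolve them at query time.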
diff --git a/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs b/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs index 08628415a91..8279563acc0 100644 --- a/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs +++ b/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs @@ -20,6 +20,7 @@ use serde::{Deserialize, Serialize}; use tantivy::query::PhrasePrefixQuery as TantivyPhrasePrefixQuery; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; +use tantivy::tokenizer::TokenizerManager; use tantivy::Term; use crate::query_ast::tantivy_query_ast::TantivyQueryAst; @@ -43,6 +44,7 @@ impl PhrasePrefixQuery { pub fn get_terms( &self, schema: &TantivySchema, + tokenizer_manager: &TokenizerManager, ) -> Result<(Field, Vec<(usize, Term)>), InvalidQuery> { let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?; let field_type = field_entry.field_type(); @@ -67,6 +69,7 @@ impl PhrasePrefixQuery { field, &self.phrase, text_field_indexing, + tokenizer_manager, )?; Ok((field, terms)) } @@ -90,6 +93,7 @@ impl PhrasePrefixQuery { json_path, &self.phrase, json_options, + tokenizer_manager, )?; Ok((field, terms)) } @@ -110,10 +114,11 @@ impl BuildTantivyAst for PhrasePrefixQuery { fn build_tantivy_ast_impl( &self, schema: &TantivySchema, + tokenizer_manager: &TokenizerManager, _search_fields: &[String], _with_validation: bool, ) -> Result { - let (_, terms) = self.get_terms(schema)?; + let (_, terms) = self.get_terms(schema, tokenizer_manager)?; if terms.is_empty() { if self.params.zero_terms_query.is_none() { diff --git a/quickwit/quickwit-query/src/query_ast/range_query.rs b/quickwit/quickwit-query/src/query_ast/range_query.rs index 12ce4e50d53..71e29eff18c 100644 --- a/quickwit/quickwit-query/src/query_ast/range_query.rs +++ b/quickwit/quickwit-query/src/query_ast/range_query.rs @@ -24,6 +24,7 @@ use tantivy::query::{ FastFieldRangeWeight as TantivyFastFieldRangeQuery, RangeQuery as TantivyRangeQuery, }; use tantivy::schema::Schema as TantivySchema; +use tantivy::tokenizer::TokenizerManager; use super::QueryAst; use crate::json_literal::InterpretUserInput; @@ -222,6 +223,7 @@ impl BuildTantivyAst for RangeQuery { fn build_tantivy_ast_impl( &self, schema: &TantivySchema, + _tokenizer_manager: &TokenizerManager, _search_fields: &[String], _with_validation: bool, ) -> Result { @@ -340,7 +342,9 @@ mod tests { use super::RangeQuery; use crate::query_ast::tantivy_query_ast::TantivyBoolQuery; use crate::query_ast::BuildTantivyAst; - use crate::{InvalidQuery, JsonLiteral, MatchAllOrNone}; + use crate::{ + create_default_quickwit_tokenizer_manager, InvalidQuery, JsonLiteral, MatchAllOrNone, + }; fn make_schema(dynamic_mode: bool) -> Schema { let mut schema_builder = Schema::builder(); @@ -363,7 +367,12 @@ mod tests { upper_bound: Bound::Included(JsonLiteral::String("1989".to_string())), }; let tantivy_ast = range_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap() .simplify(); let leaf = tantivy_ast.as_leaf().unwrap(); @@ -402,7 +411,12 @@ mod tests { }; // with validation let invalid_query: InvalidQuery = range_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap_err(); assert!( matches!(invalid_query, InvalidQuery::FieldDoesNotExist { full_path } if full_path == "missing_field.toto") @@ -410,7 +424,12 @@ mod 
tests { // without validation assert_eq!( range_query - .build_tantivy_ast_call(&schema, &[], false) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + false + ) .unwrap() .const_predicate(), Some(MatchAllOrNone::MatchNone) @@ -427,7 +446,12 @@ mod tests { }; // with validation let invalid_query: InvalidQuery = range_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap_err(); assert!(matches!( invalid_query, @@ -436,7 +460,12 @@ mod tests { // without validation assert_eq!( range_query - .build_tantivy_ast_call(&schema, &[], false) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + false + ) .unwrap() .const_predicate(), Some(MatchAllOrNone::MatchNone) @@ -452,7 +481,12 @@ mod tests { }; let schema = make_schema(true); let tantivy_ast = range_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap(); let TantivyBoolQuery { must, @@ -495,7 +529,12 @@ mod tests { }; let schema = make_schema(false); let err = range_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap_err(); assert!(matches!(err, InvalidQuery::SchemaError { .. })); } diff --git a/quickwit/quickwit-query/src/query_ast/term_query.rs b/quickwit/quickwit-query/src/query_ast/term_query.rs index 3b19e1b6289..0f62da9544d 100644 --- a/quickwit/quickwit-query/src/query_ast/term_query.rs +++ b/quickwit/quickwit-query/src/query_ast/term_query.rs @@ -21,6 +21,7 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; use tantivy::schema::Schema as TantivySchema; +use tantivy::tokenizer::TokenizerManager; use super::{BuildTantivyAst, QueryAst}; use crate::query_ast::{FullTextParams, TantivyQueryAst}; @@ -54,6 +55,7 @@ impl BuildTantivyAst for TermQuery { fn build_tantivy_ast_impl( &self, schema: &TantivySchema, + tokenizer_manager: &TokenizerManager, _search_fields: &[String], _with_validation: bool, ) -> Result { @@ -68,6 +70,7 @@ impl BuildTantivyAst for TermQuery { &self.value, &full_text_params, schema, + tokenizer_manager, ) } } @@ -124,6 +127,7 @@ impl From for HashMap { mod tests { use tantivy::schema::{Schema, INDEXED}; + use crate::create_default_quickwit_tokenizer_manager; use crate::query_ast::{BuildTantivyAst, TermQuery}; #[test] @@ -136,7 +140,12 @@ mod tests { schema_builder.add_ip_addr_field("ip", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); assert_eq!( @@ -155,7 +164,12 @@ mod tests { schema_builder.add_ip_addr_field("ip", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); assert_eq!( @@ -174,7 +188,12 @@ mod tests { schema_builder.add_bytes_field("bytes", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + 
&schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); assert_eq!( @@ -193,7 +212,12 @@ mod tests { schema_builder.add_bytes_field("bytes", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); assert_eq!( diff --git a/quickwit/quickwit-query/src/query_ast/term_set_query.rs b/quickwit/quickwit-query/src/query_ast/term_set_query.rs index 9b53f1680ed..d27597e37bc 100644 --- a/quickwit/quickwit-query/src/query_ast/term_set_query.rs +++ b/quickwit/quickwit-query/src/query_ast/term_set_query.rs @@ -21,6 +21,7 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use serde::{Deserialize, Serialize}; use tantivy::schema::Schema as TantivySchema; +use tantivy::tokenizer::TokenizerManager; use tantivy::Term; use crate::query_ast::{BuildTantivyAst, QueryAst, TantivyQueryAst, TermQuery}; @@ -36,7 +37,11 @@ pub struct TermSetQuery { } impl TermSetQuery { - fn make_term_iterator(&self, schema: &TantivySchema) -> Result, InvalidQuery> { + fn make_term_iterator( + &self, + schema: &TantivySchema, + tokenizer_manager: &TokenizerManager, + ) -> Result, InvalidQuery> { let mut terms: HashSet = HashSet::default(); for (full_path, values) in &self.terms_per_field { for value in values { @@ -51,7 +56,8 @@ impl TermSetQuery { field: full_path.to_string(), value: value.to_string(), }; - let ast = term_query.build_tantivy_ast_call(schema, &[], false)?; + let ast = + term_query.build_tantivy_ast_call(schema, tokenizer_manager, &[], false)?; let tantivy_query: Box = ast.simplify().into(); tantivy_query.query_terms(&mut |term, _| { terms.insert(term.clone()); @@ -66,10 +72,11 @@ impl BuildTantivyAst for TermSetQuery { fn build_tantivy_ast_impl( &self, schema: &TantivySchema, + tokenizer_manager: &TokenizerManager, _search_fields: &[String], _with_validation: bool, ) -> Result { - let terms_it = self.make_term_iterator(schema)?; + let terms_it = self.make_term_iterator(schema, tokenizer_manager)?; let term_set_query = tantivy::query::TermSetQuery::new(terms_it); Ok(term_set_query.into()) } diff --git a/quickwit/quickwit-query/src/query_ast/user_input_query.rs b/quickwit/quickwit-query/src/query_ast/user_input_query.rs index 89d2b856e1e..1ea12eb0c71 100644 --- a/quickwit/quickwit-query/src/query_ast/user_input_query.rs +++ b/quickwit/quickwit-query/src/query_ast/user_input_query.rs @@ -26,6 +26,7 @@ use tantivy::query_grammar::{ Delimiter, Occur, UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral, }; use tantivy::schema::Schema as TantivySchema; +use tantivy::tokenizer::TokenizerManager; use crate::not_nan_f32::NotNaNf32; use crate::query_ast::tantivy_query_ast::TantivyQueryAst; @@ -84,6 +85,7 @@ impl BuildTantivyAst for UserInputQuery { fn build_tantivy_ast_impl( &self, _schema: &TantivySchema, + _tokenizer_manager: &TokenizerManager, _default_search_fields: &[String], _with_validation: bool, ) -> Result { @@ -249,7 +251,7 @@ mod tests { use crate::query_ast::{ BoolQuery, BuildTantivyAst, FullTextMode, FullTextQuery, QueryAst, UserInputQuery, }; - use crate::{BooleanOperand, InvalidQuery}; + use crate::{create_default_quickwit_tokenizer_manager, BooleanOperand, InvalidQuery}; #[test] fn test_user_input_query_not_parsed_error() { @@ -261,13 +263,23 @@ mod tests { let schema = 
tantivy::schema::Schema::builder().build(); { let invalid_query = user_input_query - .build_tantivy_ast_call(&schema, &[], true) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + true, + ) .unwrap_err(); assert!(matches!(invalid_query, InvalidQuery::UserQueryNotParsed)); } { let invalid_query = user_input_query - .build_tantivy_ast_call(&schema, &[], false) + .build_tantivy_ast_call( + &schema, + &create_default_quickwit_tokenizer_manager(), + &[], + false, + ) .unwrap_err(); assert!(matches!(invalid_query, InvalidQuery::UserQueryNotParsed)); } diff --git a/quickwit/quickwit-query/src/query_ast/utils.rs b/quickwit/quickwit-query/src/query_ast/utils.rs index d1a5bfdffe6..7ac12a4c2b0 100644 --- a/quickwit/quickwit-query/src/query_ast/utils.rs +++ b/quickwit/quickwit-query/src/query_ast/utils.rs @@ -23,7 +23,7 @@ use tantivy::schema::{ Field, FieldEntry, FieldType, IndexRecordOption, JsonObjectOptions, Schema as TantivySchema, Type, }; -use tantivy::Term; +use tantivy::{tokenizer, Term}; use crate::json_literal::InterpretUserInput; use crate::query_ast::full_text_query::FullTextParams; @@ -75,9 +75,17 @@ pub(crate) fn full_text_query( text_query: &str, full_text_params: &FullTextParams, schema: &TantivySchema, + tokenizer_manager: &tokenizer::TokenizerManager, ) -> Result { let (field, field_entry, path) = find_field_or_hit_dynamic(full_path, schema)?; - compute_query_with_field(field, field_entry, path, text_query, full_text_params) + compute_query_with_field( + field, + field_entry, + path, + text_query, + full_text_params, + tokenizer_manager, + ) } fn parse_value_from_user_text<'a, T: InterpretUserInput<'a>>( @@ -100,6 +108,7 @@ fn compute_query_with_field( json_path: &str, value: &str, full_text_params: &FullTextParams, + tokenizer_manager: &tokenizer::TokenizerManager, ) -> Result { let field_type = field_entry.field_type(); match field_type { @@ -135,8 +144,12 @@ fn compute_query_with_field( field_entry.name() )) })?; - let terms = - full_text_params.tokenize_text_into_terms(field, value, text_field_indexing)?; + let terms = full_text_params.tokenize_text_into_terms( + field, + value, + text_field_indexing, + tokenizer_manager, + )?; full_text_params.make_query(terms, text_field_indexing.index_option()) } FieldType::IpAddr(_) => { @@ -150,6 +163,7 @@ fn compute_query_with_field( value, full_text_params, json_options, + tokenizer_manager, ), FieldType::Facet(_) => Err(InvalidQuery::SchemaError( "Facets are not supported in Quickwit.".to_string(), @@ -168,6 +182,7 @@ fn compute_tantivy_ast_query_for_json( text: &str, full_text_params: &FullTextParams, json_options: &JsonObjectOptions, + tokenizer_manager: &tokenizer::TokenizerManager, ) -> Result { let mut bool_query = TantivyBoolQuery::default(); let mut term = Term::with_capacity(100); @@ -182,8 +197,13 @@ fn compute_tantivy_ast_query_for_json( .should .push(TantivyTermQuery::new(term, IndexRecordOption::Basic).into()); } - let position_terms: Vec<(usize, Term)> = - full_text_params.tokenize_text_into_terms_json(field, json_path, text, json_options)?; + let position_terms: Vec<(usize, Term)> = full_text_params.tokenize_text_into_terms_json( + field, + json_path, + text, + json_options, + tokenizer_manager, + )?; let index_record_option = json_options .get_text_indexing_options() .map(|text_indexing_options| text_indexing_options.index_option()) diff --git a/quickwit/quickwit-query/src/tokenizers.rs b/quickwit/quickwit-query/src/tokenizers.rs index a406c3b39b9..cb4de09f6b8 100644 --- 
a/quickwit/quickwit-query/src/tokenizers.rs +++ b/quickwit/quickwit-query/src/tokenizers.rs @@ -25,13 +25,15 @@ use tantivy::tokenizer::{ TokenizerManager, }; -fn create_quickwit_tokenizer_manager() -> TokenizerManager { +pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255; + +pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default()) - .filter(RemoveLongFilter::limit(255)) + .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) .build(); let chinese_tokenizer = TextAnalyzer::builder(ChineseTokenizer) - .filter(RemoveLongFilter::limit(255)) + .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) .filter(LowerCaser) .build(); @@ -42,14 +44,14 @@ fn create_quickwit_tokenizer_manager() -> TokenizerManager { tokenizer_manager.register( "default", TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default()) - .filter(RemoveLongFilter::limit(255)) + .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) .filter(LowerCaser) .build(), ); tokenizer_manager.register( "en_stem", TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default()) - .filter(RemoveLongFilter::limit(255)) + .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) .filter(LowerCaser) .filter(tantivy::tokenizer::Stemmer::new( tantivy::tokenizer::Language::English, @@ -62,12 +64,12 @@ fn create_quickwit_tokenizer_manager() -> TokenizerManager { fn create_quickwit_fastfield_normalizer_manager() -> TokenizerManager { let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default()) - .filter(RemoveLongFilter::limit(255)) + .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) .build(); let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer::default()) .filter(LowerCaser) - .filter(RemoveLongFilter::limit(255)) + .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) .build(); let tokenizer_manager = TokenizerManager::new(); @@ -182,13 +184,6 @@ impl<'a> TokenStream for ChineseTokenStream<'a> { } } -pub fn get_quickwit_tokenizer_manager() -> &'static TokenizerManager { - /// Quickwits default tokenizer - static QUICKWIT_TOKENIZER_MANAGER: Lazy = - Lazy::new(create_quickwit_tokenizer_manager); - &QUICKWIT_TOKENIZER_MANAGER -} - pub fn get_quickwit_fastfield_normalizer_manager() -> &'static TokenizerManager { static QUICKWIT_FAST_FIELD_NORMALIZER_MANAGER: Lazy = Lazy::new(create_quickwit_fastfield_normalizer_manager); @@ -199,7 +194,7 @@ pub fn get_quickwit_fastfield_normalizer_manager() -> &'static TokenizerManager mod tests { use tantivy::tokenizer::Token; - use super::get_quickwit_tokenizer_manager; + use super::create_default_quickwit_tokenizer_manager; #[test] fn test_raw_tokenizer() { @@ -209,7 +204,9 @@ mod tests { sand in my face "#; - let mut tokenizer = get_quickwit_tokenizer_manager().get("raw").unwrap(); + let mut tokenizer = create_default_quickwit_tokenizer_manager() + .get("raw") + .unwrap(); { let mut haiku_stream = tokenizer.token_stream(my_haiku); assert!(haiku_stream.advance()); @@ -229,7 +226,7 @@ mod tests { fn test_chinese_tokenizer() { let text = "Hello world, 你好世界, bonjour monde"; - let mut tokenizer = get_quickwit_tokenizer_manager() + let mut tokenizer = create_default_quickwit_tokenizer_manager() .get("chinese_compatible") .unwrap(); let mut text_stream = tokenizer.token_stream(text); @@ -306,7 +303,7 @@ mod tests { fn test_chinese_tokenizer_no_space() { let text = "Hello你好bonjour"; - let mut tokenizer = get_quickwit_tokenizer_manager() + let mut tokenizer = 
create_default_quickwit_tokenizer_manager() .get("chinese_compatible") .unwrap(); let mut text_stream = tokenizer.token_stream(text); @@ -353,8 +350,8 @@ mod tests { proptest::proptest! { #[test] fn test_proptest_ascii_default_chinese_equal(text in "[ -~]{0,64}") { - let mut cn_tok = get_quickwit_tokenizer_manager().get("chinese_compatible").unwrap(); - let mut default_tok = get_quickwit_tokenizer_manager().get("default").unwrap(); + let mut cn_tok = create_default_quickwit_tokenizer_manager().get("chinese_compatible").unwrap(); + let mut default_tok = create_default_quickwit_tokenizer_manager().get("default").unwrap(); let mut text_stream = cn_tok.token_stream(&text); diff --git a/quickwit/quickwit-search/Cargo.toml b/quickwit/quickwit-search/Cargo.toml index 20915ab3534..82a5576af9e 100644 --- a/quickwit/quickwit-search/Cargo.toml +++ b/quickwit/quickwit-search/Cargo.toml @@ -59,6 +59,7 @@ tempfile = { workspace = true } quickwit-indexing = { workspace = true, features = ["testsuite"] } quickwit-metastore = { workspace = true, features = ["testsuite"] } +quickwit-storage = { workspace = true, features = ["testsuite"] } [features] testsuite = [] diff --git a/quickwit/quickwit-search/src/fetch_docs.rs b/quickwit/quickwit-search/src/fetch_docs.rs index 3c4d7ae84dc..8124a0fe89a 100644 --- a/quickwit/quickwit-search/src/fetch_docs.rs +++ b/quickwit/quickwit-search/src/fetch_docs.rs @@ -171,9 +171,15 @@ async fn fetch_docs_in_split( global_doc_addrs.sort_by_key(|doc| doc.doc_addr); // Opens the index without the ephemeral unbounded cache, this cache is indeed not useful // when fetching docs as we will fetch them only once. - let index = open_index_with_caches(&searcher_context, index_storage, split, false) - .await - .with_context(|| "open-index-for-split")?; + let index = open_index_with_caches( + &searcher_context, + index_storage, + split, + Some(doc_mapper.tokenizer_manager()), + false, + ) + .await + .context("open-index-for-split")?; let index_reader = index .reader_builder() // the docs are presorted so a cache size of NUM_CONCURRENT_REQUESTS is fine diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index c79d6b684ec..658e6b34a0d 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -39,6 +39,7 @@ use tantivy::collector::Collector; use tantivy::directory::FileSlice; use tantivy::fastfield::FastFieldReaders; use tantivy::schema::{Field, FieldType}; +use tantivy::tokenizer::TokenizerManager; use tantivy::{Index, ReloadPolicy, Searcher, Term}; use tracing::*; @@ -86,11 +87,12 @@ async fn get_split_footer_from_cache_or_fetch( /// - A split footer cache given by `SearcherContext.split_footer_cache`. /// - A fast fields cache given by `SearcherContext.storage_long_term_cache`. /// - An ephemeral unbounded cache directory whose lifetime is tied to the returned `Index`. -#[instrument(skip(searcher_context, index_storage))] +#[instrument(skip(searcher_context, index_storage, tokenizer_manager))] pub(crate) async fn open_index_with_caches( searcher_context: &SearcherContext, index_storage: Arc, split_and_footer_offsets: &SplitIdAndFooterOffsets, + tokenizer_manager: Option<&TokenizerManager>, ephemeral_unbounded_cache: bool, ) -> anyhow::Result { let split_file = PathBuf::from(format!("{}.split", split_and_footer_offsets.split_id)); @@ -118,7 +120,9 @@ pub(crate) async fn open_index_with_caches( HotDirectory::open(directory, hotcache_bytes.read_bytes()?)? 
}; let mut index = Index::open(hot_directory)?; - index.set_tokenizers(quickwit_query::get_quickwit_tokenizer_manager().clone()); + if let Some(tokenizer_manager) = tokenizer_manager { + index.set_tokenizers(tokenizer_manager.clone()); + } index.set_fast_field_tokenizers( quickwit_query::get_quickwit_fastfield_normalizer_manager().clone(), ); @@ -339,7 +343,14 @@ async fn leaf_search_single_split( } let split_id = split.split_id.to_string(); - let index = open_index_with_caches(searcher_context, storage, &split, true).await?; + let index = open_index_with_caches( + searcher_context, + storage, + &split, + Some(doc_mapper.tokenizer_manager()), + true, + ) + .await?; let split_schema = index.schema(); let quickwit_collector = make_collector_for_split( @@ -509,7 +520,7 @@ async fn leaf_list_terms_single_split( storage: Arc, split: SplitIdAndFooterOffsets, ) -> crate::Result { - let index = open_index_with_caches(searcher_context, storage, &split, true).await?; + let index = open_index_with_caches(searcher_context, storage, &split, None, true).await?; let split_schema = index.schema(); let reader = index .reader_builder() diff --git a/quickwit/quickwit-search/src/search_stream/leaf.rs b/quickwit/quickwit-search/src/search_stream/leaf.rs index 5e72c1a02bb..4a676e9ec32 100644 --- a/quickwit/quickwit-search/src/search_stream/leaf.rs +++ b/quickwit/quickwit-search/src/search_stream/leaf.rs @@ -124,7 +124,14 @@ async fn leaf_search_stream_single_split( &split, ); - let index = open_index_with_caches(&searcher_context, storage, &split, true).await?; + let index = open_index_with_caches( + &searcher_context, + storage, + &split, + Some(doc_mapper.tokenizer_manager()), + true, + ) + .await?; let split_schema = index.schema(); let request_fields = Arc::new(SearchStreamRequestFields::from_request( diff --git a/quickwit/quickwit-search/src/tests.rs b/quickwit/quickwit-search/src/tests.rs index b8ceaf8fedb..b247e10f687 100644 --- a/quickwit/quickwit-search/src/tests.rs +++ b/quickwit/quickwit-search/src/tests.rs @@ -852,7 +852,7 @@ async fn test_single_node_split_pruning_by_tags() -> anyhow::Result<()> { Ok(()) } -async fn test_search_dynamic_util(test_sandbox: &TestSandbox, query: &str) -> Vec { +async fn test_search_util(test_sandbox: &TestSandbox, query: &str) -> Vec { let splits = test_sandbox .metastore() .list_all_splits(test_sandbox.index_uid()) @@ -908,11 +908,11 @@ async fn test_search_dynamic_mode() -> anyhow::Result<()> { ]; test_sandbox.add_documents(docs).await.unwrap(); { - let docs = test_search_dynamic_util(&test_sandbox, "body:hello").await; + let docs = test_search_util(&test_sandbox, "body:hello").await; assert_eq!(&docs[..], &[1u32, 0u32]); } { - let docs = test_search_dynamic_util(&test_sandbox, "body_dynamic:hello").await; + let docs = test_search_util(&test_sandbox, "body_dynamic:hello").await; assert_eq!(&docs[..], &[3u32]); // 1 is not matched due to the raw tokenizer } test_sandbox.assert_quit().await; @@ -938,12 +938,11 @@ async fn test_search_dynamic_mode_expand_dots() -> anyhow::Result<()> { let docs = vec![json!({"k8s.component.name": "quickwit"})]; test_sandbox.add_documents(docs).await.unwrap(); { - let docs = test_search_dynamic_util(&test_sandbox, "k8s.component.name:quickwit").await; + let docs = test_search_util(&test_sandbox, "k8s.component.name:quickwit").await; assert_eq!(&docs[..], &[0u32]); } { - let docs = - test_search_dynamic_util(&test_sandbox, r#"k8s\.component\.name:quickwit"#).await; + let docs = test_search_util(&test_sandbox, 
r#"k8s\.component\.name:quickwit"#).await; assert_eq!(&docs[..], &[0u32]); } test_sandbox.assert_quit().await; @@ -969,12 +968,11 @@ async fn test_search_dynamic_mode_do_not_expand_dots() -> anyhow::Result<()> { let docs = vec![json!({"k8s.component.name": "quickwit"})]; test_sandbox.add_documents(docs).await.unwrap(); { - let docs = - test_search_dynamic_util(&test_sandbox, r#"k8s\.component\.name:quickwit"#).await; + let docs = test_search_util(&test_sandbox, r#"k8s\.component\.name:quickwit"#).await; assert_eq!(&docs[..], &[0u32]); } { - let docs = test_search_dynamic_util(&test_sandbox, r#"k8s.component.name:quickwit"#).await; + let docs = test_search_util(&test_sandbox, r#"k8s.component.name:quickwit"#).await; assert!(docs.is_empty()); } test_sandbox.assert_quit().await; @@ -1664,3 +1662,35 @@ async fn test_single_node_find_trace_ids_collector() { } test_sandbox.assert_quit().await; } + +#[tokio::test] +async fn test_search_in_text_field_with_custom_tokenizer() -> anyhow::Result<()> { + let doc_mapping_yaml = r#" + tokenizers: + - name: custom_tokenizer + type: ngram + min_gram: 3 + max_gram: 5 + prefix_only: true + field_mappings: + - name: body + type: text + tokenizer: custom_tokenizer + indexed: true + "#; + let test_sandbox = TestSandbox::create("search_custom_tokenizer", doc_mapping_yaml, "{}", &[]) + .await + .unwrap(); + let docs = vec![json!({"body": "hellohappy"})]; + test_sandbox.add_documents(docs).await.unwrap(); + { + let docs = test_search_util(&test_sandbox, "body:happy").await; + assert!(&docs.is_empty()); + } + { + let docs = test_search_util(&test_sandbox, "body:hel").await; + assert_eq!(&docs[..], &[0u32]); + } + test_sandbox.assert_quit().await; + Ok(()) +} diff --git a/quickwit/quickwit-serve/src/index_api/rest_handler.rs b/quickwit/quickwit-serve/src/index_api/rest_handler.rs index e205c2ae1c7..093d2c39a20 100644 --- a/quickwit/quickwit-serve/src/index_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/index_api/rest_handler.rs @@ -28,6 +28,7 @@ use quickwit_config::{ CLI_INGEST_SOURCE_ID, INGEST_API_SOURCE_ID, }; use quickwit_core::{IndexService, IndexServiceError}; +use quickwit_doc_mapper::{analyze_text, TokenizerConfig}; use quickwit_metastore::{ IndexMetadata, ListSplitsQuery, Metastore, MetastoreError, Split, SplitState, }; @@ -82,6 +83,8 @@ pub fn index_management_handlers( .or(create_source_handler(index_service.clone())) .or(get_source_handler(index_service.metastore())) .or(delete_source_handler(index_service.metastore())) + // Tokenizer handlers. + .or(analyze_request_handler()) } fn json_body( @@ -718,6 +721,47 @@ async fn delete_source( Ok(()) } +#[derive(Debug, Deserialize, utoipa::IntoParams, utoipa::ToSchema)] +struct AnalyzeRequest { + /// The tokenizer to use. + #[serde(flatten)] + pub tokenizer_config: TokenizerConfig, + /// The text to analyze. + pub text: String, +} + +fn analyze_request_filter() -> impl Filter + Clone { + warp::path!("analyze") + .and(warp::post()) + .and(warp::body::json()) +} + +fn analyze_request_handler() -> impl Filter + Clone +{ + analyze_request_filter() + .then(analyze_request) + .and(extract_format_from_qs()) + .map(make_json_api_response) +} + +/// Analyzes text with given tokenizer config and returns the list of tokens. 
+#[utoipa::path( + post, + tag = "analyze", + path = "/analyze", + request_body = AnalyzeRequest, + responses( + (status = 200, description = "Successfully analyze text.") + ), +)] +async fn analyze_request(request: AnalyzeRequest) -> Result { + let tokens = analyze_text(&request.text, &request.tokenizer_config) + .map_err(|err| IndexServiceError::Internal(format!("{err:?}")))?; + let json_value = serde_json::to_value(tokens) + .map_err(|err| IndexServiceError::Internal(format!("Cannot serialize tokens: {err}")))?; + Ok(json_value) +} + #[cfg(test)] mod tests { use std::ops::{Bound, RangeInclusive}; @@ -1642,4 +1686,45 @@ mod tests { assert_eq!(resp.status(), 405); Ok(()) } + + #[tokio::test] + async fn test_analyze_request() { + let mut metastore = MockMetastore::new(); + metastore + .expect_index_metadata() + .return_once(|_index_id: &str| { + Ok(IndexMetadata::for_test( + "test-index", + "ram:///indexes/test-index", + )) + }); + let index_service = IndexService::new(Arc::new(metastore), StorageResolver::unconfigured()); + let index_management_handler = super::index_management_handlers( + Arc::new(index_service), + Arc::new(QuickwitConfig::for_test()), + ) + .recover(recover_fn); + let resp = warp::test::request() + .path("/analyze") + .method("POST") + .json(&true) + .body(r#"{"type": "ngram", "min_gram": 3, "max_gram": 3, "text": "Hel", "filters": ["lower_caser"]}"#) + .reply(&index_management_handler) + .await; + assert_eq!(resp.status(), 200); + let actual_response_json: JsonValue = serde_json::from_slice(resp.body()).unwrap(); + let expected_response_json = serde_json::json!([ + { + "offset_from": 0, + "offset_to": 3, + "position": 0, + "position_length": 1, + "text": "hel" + } + ]); + assert_json_include!( + actual: actual_response_json, + expected: expected_response_json + ); + } }
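
For reference, the `/analyze` handler added above is a thin wrapper around `quickwit_doc_mapper::analyze_text`. A hedged sketch of calling it directly, reusing the ngram configuration exercised by `test_analyze_request`; the concrete error type returned by `analyze_text` is not visible in this patch, so it is mapped through `anyhow` here:

    use quickwit_doc_mapper::{analyze_text, TokenizerConfig};

    fn analyze_sketch() -> anyhow::Result<()> {
        // Same tokenizer settings as the request body in `test_analyze_request`,
        // minus the `text` field that the REST handler flattens alongside them.
        let tokenizer_config: TokenizerConfig = serde_json::from_str(
            r#"{"type": "ngram", "min_gram": 3, "max_gram": 3, "filters": ["lower_caser"]}"#,
        )?;
        let tokens = analyze_text("Hel", &tokenizer_config)
            .map_err(|err| anyhow::anyhow!("analyze failed: {err:?}"))?;
        // With this config the test above expects a single lower-cased 3-gram: "hel".
        println!("{}", serde_json::to_string_pretty(&tokens)?);
        Ok(())
    }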