diff --git a/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.json b/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.json
index 738162998cb..c4a61cf79a5 100644
--- a/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.json
+++ b/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.json
@@ -4,6 +4,13 @@
   "index_id": "hdfs-logs",
   "index_uri": "s3://quickwit-indexes/hdfs-logs",
   "doc_mapping": {
+    "tokenizers": [
+      {
+        "name": "service_regex",
+        "type": "regex",
+        "pattern": "\\w*"
+      }
+    ],
     "field_mappings": [
       {
         "name": "tenant_id",
@@ -33,7 +40,7 @@
       {
         "name": "service",
         "type": "text",
-        "tokenizer": "raw"
+        "tokenizer": "service_regex"
       }
     ]
   }
diff --git a/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.toml b/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.toml
index dc5ddcefb3d..4c177c3c513 100644
--- a/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.toml
+++ b/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.toml
@@ -3,12 +3,15 @@ index_id = "hdfs-logs"
 index_uri = "s3://quickwit-indexes/hdfs-logs"
 
 [doc_mapping]
+tokenizers = [
+    { name = "service_regex", type = "regex", pattern = "\\w*" },
+]
 field_mappings = [
     { name = "tenant_id", type = "u64", fast = true },
     { name = "timestamp", type = "datetime", fast = true },
     { name = "severity_text", type = "text", tokenizer = "raw" },
     { name = "body", type = "text", tokenizer = "default", record = "position" },
-    { name = "resource", type = "object", field_mappings = [ { name = "service", type = "text", tokenizer = "raw" } ] },
+    { name = "resource", type = "object", field_mappings = [ { name = "service", type = "text", tokenizer = "service_regex" } ] },
 ]
 tag_fields = [ "tenant_id" ]
 store_source = true
diff --git a/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.yaml b/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.yaml
index c8658f9107a..caa390111d3 100644
--- a/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.yaml
+++ b/quickwit/quickwit-config/resources/tests/index_config/hdfs-logs.yaml
@@ -3,6 +3,10 @@ index_id: hdfs-logs
 index_uri: s3://quickwit-indexes/hdfs-logs
 
 doc_mapping:
+  tokenizers:
+    - name: service_regex
+      type: regex
+      pattern: "\\w*"
   field_mappings:
     - name: tenant_id
       type: u64
@@ -22,7 +26,7 @@ doc_mapping:
       field_mappings:
         - name: service
           type: text
-          tokenizer: raw
+          tokenizer: service_regex
 tag_fields: [tenant_id]
 timestamp_field: timestamp
 store_source: true
diff --git a/quickwit/quickwit-config/src/index_config/mod.rs b/quickwit/quickwit-config/src/index_config/mod.rs
index 0d86d62c407..ac290688431 100644
--- a/quickwit/quickwit-config/src/index_config/mod.rs
+++ b/quickwit/quickwit-config/src/index_config/mod.rs
@@ -551,6 +551,8 @@ mod tests {
             &Uri::from_well_formed("s3://defaultbucket/"),
         )
         .unwrap();
+        assert_eq!(index_config.doc_mapping.tokenizers.len(), 1);
+        assert_eq!(index_config.doc_mapping.tokenizers[0].name, "service_regex");
         assert_eq!(index_config.doc_mapping.field_mappings.len(), 5);
         assert_eq!(index_config.doc_mapping.field_mappings[0].name, "tenant_id");
         assert_eq!(index_config.doc_mapping.field_mappings[1].name, "timestamp");
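Note (reviewer sketch, not part of the patch): the three fixtures above all declare the same custom tokenizer, `service_regex`, and then reference it by name from the `resource.service` field mapping. A minimal way to see what such an entry does, assuming the `analyze_text` helper and the public `TokenizerConfig` re-export introduced later in this diff, plus `serde_json` and `anyhow` as dependencies:

    use quickwit_doc_mapper::{analyze_text, TokenizerConfig};

    fn main() -> anyhow::Result<()> {
        // Same JSON shape as one entry of the `tokenizers` arrays above,
        // minus the `name` field: `TokenizerConfig` is the flattened config.
        let config: TokenizerConfig =
            serde_json::from_str(r#"{ "type": "regex", "pattern": "\\w*" }"#)?;
        // Prints one line per token produced by the regex pattern.
        for token in analyze_text("quickwit-indexer", &config)? {
            println!("{}..{}: {}", token.offset_from, token.offset_to, token.text);
        }
        Ok(())
    }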
diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs
index bc6e2cf3357..0cde2fe48a2 100644
--- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs
+++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs
@@ -67,7 +67,7 @@ pub struct DefaultDocMapperBuilder {
     /// how the unmapped fields should be handled.
     #[serde(default)]
     pub dynamic_mapping: Option<QuickwitJsonOptions>,
-    /// Custom tokenizers.
+    /// User-defined tokenizers.
     #[serde(default)]
     pub tokenizers: Vec<TokenizerEntry>,
 }
diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs
index db6f9c8ba6c..a039dfe4bc3 100644
--- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs
+++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs
@@ -40,8 +40,8 @@ pub(crate) use self::field_mapping_entry::{
 };
 pub(crate) use self::field_mapping_type::FieldMappingType;
 pub use self::tokenizer_entry::{
-    NgramTokenizerOption, RegexTokenizerOption, TokenFilterType, TokenizerConfig, TokenizerEntry,
-    TokenizerType,
+    analyze_text, NgramTokenizerOption, RegexTokenizerOption, TokenFilterType, TokenizerConfig,
+    TokenizerEntry, TokenizerType,
 };
 
 use crate::QW_RESERVED_FIELD_NAMES;
diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs
index f4b041d86e4..14b5aa7eea4 100644
--- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs
+++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs
@@ -21,7 +21,8 @@ use anyhow::Context;
 use quickwit_query::DEFAULT_REMOVE_TOKEN_LENGTH;
 use serde::{Deserialize, Serialize};
 use tantivy::tokenizer::{
-    AsciiFoldingFilter, LowerCaser, NgramTokenizer, RegexTokenizer, RemoveLongFilter, TextAnalyzer, SimpleTokenizer,
+    AsciiFoldingFilter, LowerCaser, NgramTokenizer, RegexTokenizer, RemoveLongFilter,
+    SimpleTokenizer, TextAnalyzer, Token,
 };
 
 /// A `TokenizerEntry` defines a custom tokenizer with its name and configuration.
@@ -34,6 +35,7 @@ pub struct TokenizerEntry {
     pub(crate) config: TokenizerConfig,
 }
 
+/// Tokenizer configuration.
 #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)]
 pub struct TokenizerConfig {
     #[serde(flatten)]
@@ -43,6 +45,7 @@
 }
 
 impl TokenizerConfig {
+    /// Builds a `TextAnalyzer` from a `TokenizerConfig`.
     pub fn text_analyzer(&self) -> anyhow::Result<TextAnalyzer> {
         let mut text_analyzer_builder = match &self.tokenizer_type {
             TokenizerType::Simple => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
@@ -75,6 +78,17 @@ impl TokenizerConfig {
     }
 }
 
+/// Helper function to analyze text with a given `TokenizerConfig`.
+pub fn analyze_text(text: &str, tokenizer: &TokenizerConfig) -> anyhow::Result<Vec<Token>> {
+    let mut text_analyzer = tokenizer.text_analyzer()?;
+    let mut token_stream = text_analyzer.token_stream(text);
+    let mut tokens = Vec::new();
+    token_stream.process(&mut |token| {
+        tokens.push(token.clone());
+    });
+    Ok(tokens)
+}
+
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize, utoipa::ToSchema)]
 #[serde(rename_all = "snake_case")]
 pub enum TokenFilterType {
@@ -129,6 +143,7 @@ pub struct RegexTokenizerOption {
 #[cfg(test)]
 mod tests {
     use super::{NgramTokenizerOption, TokenizerType};
+    use crate::default_doc_mapper::RegexTokenizerOption;
     use crate::TokenizerEntry;
 
     #[test]
@@ -139,13 +154,13 @@ mod tests {
             {
                 "name": "my_tokenizer",
                 "type": "ngram",
+                "min_gram": 1,
+                "max_gram": 3,
                 "filters": [
                     "remove_long",
                     "lower_caser",
                     "ascii_folding"
-                ],
-                "min_gram": 1,
-                "max_gram": 3
+                ]
             }
             "#,
         );
@@ -175,13 +190,13 @@ mod tests {
             {
                 "name": "my_tokenizer",
                 "type": "ngram",
+                "min_gram": 1,
+                "max_gram": 3,
                 "filters": [
                     "remove_long",
                     "lower_caser",
                     "ascii_folding"
                 ],
-                "min_gram": 1,
-                "max_gram": 3,
                 "abc": 123
             }
             "#,
@@ -194,7 +209,7 @@ mod tests {
     }
 
     #[test]
-    fn test_deserialize_tokenizer_entry_regex() {
+    fn test_tokenizer_entry_regex() {
         let result: Result<TokenizerEntry, serde_json::Error> =
             serde_json::from_str::<TokenizerEntry>(
                 r#"
@@ -206,5 +221,18 @@ mod tests {
             "#,
             );
         assert!(result.is_ok());
+        let tokenizer_config_entry = result.unwrap();
+        assert_eq!(tokenizer_config_entry.config.filters.len(), 0);
+        match tokenizer_config_entry.config.tokenizer_type {
+            TokenizerType::Regex(options) => {
+                assert_eq!(
+                    options,
+                    RegexTokenizerOption {
+                        pattern: "(my_pattern)".to_string(),
+                    }
+                )
+            }
+            _ => panic!("Unexpected tokenizer type"),
+        }
     }
 }
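Note (reviewer sketch, not part of the patch): `TokenizerConfig::text_analyzer` above is essentially a declarative wrapper over tantivy's `TextAnalyzer` builder. Hand-built, a regex tokenizer with the three supported filters would look roughly as follows; the exact tantivy builder API is assumed from the imports in this file:

    use quickwit_query::DEFAULT_REMOVE_TOKEN_LENGTH;
    use tantivy::tokenizer::{
        AsciiFoldingFilter, LowerCaser, RegexTokenizer, RemoveLongFilter, TextAnalyzer,
    };

    fn build_analyzer(pattern: &str) -> anyhow::Result<TextAnalyzer> {
        // Mirrors the `remove_long` -> `lower_caser` -> `ascii_folding`
        // filter chain accepted by `TokenFilterType`.
        let analyzer = TextAnalyzer::builder(RegexTokenizer::new(pattern)?)
            .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
            .filter(LowerCaser)
            .filter(AsciiFoldingFilter)
            .build();
        Ok(analyzer)
    }

The real implementation goes through the `.dynamic()` builder instead, since the filter list is only known at runtime.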
diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs
index 5eb791edef5..de7410c3435 100644
--- a/quickwit/quickwit-doc-mapper/src/lib.rs
+++ b/quickwit/quickwit-doc-mapper/src/lib.rs
@@ -35,13 +35,13 @@ mod routing_expression;
 pub mod tag_pruning;
 
 pub use default_doc_mapper::{
-    DefaultDocMapper, DefaultDocMapperBuilder, FieldMappingEntry, ModeType, QuickwitJsonOptions,
-    TokenizerEntry,
+    analyze_text, DefaultDocMapper, DefaultDocMapperBuilder, FieldMappingEntry, ModeType,
+    QuickwitJsonOptions, TokenizerConfig, TokenizerEntry,
 };
 use default_doc_mapper::{
     FastFieldOptions, FieldMappingEntryForSerialization, IndexRecordOptionSchema,
     NgramTokenizerOption, QuickwitTextNormalizer, QuickwitTextTokenizer, RegexTokenizerOption,
-    TokenFilterType, TokenizerConfig, TokenizerType,
+    TokenFilterType, TokenizerType,
 };
 pub use doc_mapper::{DocMapper, JsonObject, NamedField, TermRange, WarmupInfo};
 pub use error::{DocParsingError, QueryParserError};
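Note (not part of the patch): the `lib.rs` change above does two things the REST handler below depends on: it re-exports the new `analyze_text` helper, and it moves `TokenizerConfig` from the crate-private `use` list into the public `pub use` list. That is what lets `quickwit-serve` flatten the tokenizer config directly into its request type, so an analyze request body is simply a tokenizer config plus a `text` field, e.g. `{"type": "ngram", "min_gram": 3, "max_gram": 3, "filters": ["lower_caser"], "text": "Hel"}`.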
diff --git a/quickwit/quickwit-serve/src/index_api/rest_handler.rs b/quickwit/quickwit-serve/src/index_api/rest_handler.rs
index e205c2ae1c7..7e8327bcbf6 100644
--- a/quickwit/quickwit-serve/src/index_api/rest_handler.rs
+++ b/quickwit/quickwit-serve/src/index_api/rest_handler.rs
@@ -28,6 +28,7 @@ use quickwit_config::{
     CLI_INGEST_SOURCE_ID, INGEST_API_SOURCE_ID,
 };
 use quickwit_core::{IndexService, IndexServiceError};
+use quickwit_doc_mapper::{analyze_text, TokenizerConfig};
 use quickwit_metastore::{
     IndexMetadata, ListSplitsQuery, Metastore, MetastoreError, Split, SplitState,
 };
@@ -82,6 +83,8 @@ pub fn index_management_handlers(
         .or(create_source_handler(index_service.clone()))
         .or(get_source_handler(index_service.metastore()))
         .or(delete_source_handler(index_service.metastore()))
+        // Tokenizer handlers.
+        .or(analyze_request_handler())
 }
 
 fn json_body<T: DeserializeOwned + Send>(
@@ -718,6 +721,47 @@ async fn delete_source(
     Ok(())
 }
 
+#[derive(Debug, Deserialize, utoipa::IntoParams, utoipa::ToSchema)]
+struct AnalyzeRequest {
+    /// The tokenizer to use.
+    #[serde(flatten)]
+    pub tokenizer_config: TokenizerConfig,
+    /// The text to analyze.
+    pub text: String,
+}
+
+fn analyze_request_filter() -> impl Filter<Extract = (AnalyzeRequest,), Error = Rejection> + Clone {
+    warp::path!("analyze")
+        .and(warp::post())
+        .and(warp::body::json())
+}
+
+fn analyze_request_handler() -> impl Filter<Extract = (impl warp::Reply,), Error = Rejection> + Clone
+{
+    analyze_request_filter()
+        .then(analyze_request)
+        .and(extract_format_from_qs())
+        .map(make_json_api_response)
+}
+
+/// Analyzes text with the given tokenizer config and returns the list of tokens.
+#[utoipa::path(
+    post,
+    tag = "analyze",
+    path = "/analyze",
+    request_body = AnalyzeRequest,
+    responses(
+        (status = 200, description = "Successfully analyzed text.")
+    ),
+)]
+async fn analyze_request(request: AnalyzeRequest) -> Result<serde_json::Value, IndexServiceError> {
+    let tokens = analyze_text(&request.text, &request.tokenizer_config)
+        .map_err(|err| IndexServiceError::Internal(err.to_string()))?;
+    let json_value =
+        serde_json::to_value(tokens).map_err(|err| IndexServiceError::Internal(err.to_string()))?;
+    Ok(json_value)
+}
+
 #[cfg(test)]
 mod tests {
     use std::ops::{Bound, RangeInclusive};
@@ -1642,4 +1686,45 @@ mod tests {
         assert_eq!(resp.status(), 405);
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_analyze_request() {
+        let mut metastore = MockMetastore::new();
+        metastore
+            .expect_index_metadata()
+            .return_once(|_index_id: &str| {
+                Ok(IndexMetadata::for_test(
+                    "test-index",
+                    "ram:///indexes/test-index",
+                ))
+            });
+        let index_service = IndexService::new(Arc::new(metastore), StorageResolver::unconfigured());
+        let index_management_handler = super::index_management_handlers(
+            Arc::new(index_service),
+            Arc::new(QuickwitConfig::for_test()),
+        )
+        .recover(recover_fn);
+        let resp = warp::test::request()
+            .path("/analyze")
+            .method("POST")
+            .json(&true)
+            .body(r#"{"type": "ngram", "min_gram": 3, "max_gram": 3, "text": "Hel", "filters": ["lower_caser"]}"#)
+            .reply(&index_management_handler)
+            .await;
+        assert_eq!(resp.status(), 200);
+        let actual_response_json: JsonValue = serde_json::from_slice(resp.body()).unwrap();
+        let expected_response_json = serde_json::json!([
+            {
+                "offset_from": 0,
+                "offset_to": 3,
+                "position": 0,
+                "position_length": 1,
+                "text": "hel"
+            }
+        ]);
+        assert_json_include!(
+            actual: actual_response_json,
+            expected: expected_response_json
+        );
+    }
 }
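Note (not part of the patch): the expected JSON in `test_analyze_request` is just the serialized form of tantivy's `Token` struct. With `min_gram` = `max_gram` = 3, the ngram tokenizer emits a single gram for the three-character input "Hel", and the `lower_caser` filter folds it to "hel". `assert_json_include!` (from the assert-json-diff crate) only requires the expected document to be a subset of the actual response, so additional token fields would not fail the test.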