Improve tests and add endpoint to analyze text.
fmassot committed Jul 4, 2023
1 parent 0dd5aae commit a6cd4f4
Showing 9 changed files with 145 additions and 16 deletions.
@@ -4,6 +4,13 @@
"index_id": "hdfs-logs",
"index_uri": "s3://quickwit-indexes/hdfs-logs",
"doc_mapping": {
"tokenizers": [
{
"name": "service_regex",
"type": "regex",
"pattern": "\\w*"
}
],
"field_mappings": [
{
"name": "tenant_id",
@@ -33,7 +40,7 @@
{
"name": "service",
"type": "text",
"tokenizer": "raw"
"tokenizer": "service_regex"
}
]
}
@@ -3,12 +3,15 @@ index_id = "hdfs-logs"
index_uri = "s3://quickwit-indexes/hdfs-logs"

[doc_mapping]
tokenizers = [
{ name = "service_regex", type = "regex", pattern = "\\w*" },
]
field_mappings = [
{ name = "tenant_id", type = "u64", fast = true },
{ name = "timestamp", type = "datetime", fast = true },
{ name = "severity_text", type = "text", tokenizer = "raw" },
{ name = "body", type = "text", tokenizer = "default", record = "position" },
{ name = "resource", type = "object", field_mappings = [ { name = "service", type = "text", tokenizer = "raw" } ] },
{ name = "resource", type = "object", field_mappings = [ { name = "service", type = "text", tokenizer = "service_regex" } ] },
]
tag_fields = [ "tenant_id" ]
store_source = true
@@ -3,6 +3,10 @@ index_id: hdfs-logs
index_uri: s3://quickwit-indexes/hdfs-logs

doc_mapping:
tokenizers:
- name: service_regex
type: regex
pattern: "\\w*"
field_mappings:
- name: tenant_id
type: u64
@@ -22,7 +26,7 @@ doc_mapping:
field_mappings:
- name: service
type: text
tokenizer: raw
tokenizer: service_regex
tag_fields: [tenant_id]
timestamp_field: timestamp
store_source: true
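
As a side note, the service_regex tokenizer declared in the three configuration formats above can be tried out with the analyze endpoint added to quickwit-serve later in this commit. A hypothetical request body — the sample text is purely illustrative, and filters is omitted since it appears to default to an empty list — could look like:

{
  "type": "regex",
  "pattern": "\\w*",
  "text": "datanode-01 restarted"
}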
2 changes: 2 additions & 0 deletions quickwit/quickwit-config/src/index_config/mod.rs
@@ -551,6 +551,8 @@ mod tests {
&Uri::from_well_formed("s3://defaultbucket/"),
)
.unwrap();
assert_eq!(index_config.doc_mapping.tokenizers.len(), 1);
assert_eq!(index_config.doc_mapping.tokenizers[0].name, "service_regex");
assert_eq!(index_config.doc_mapping.field_mappings.len(), 5);
assert_eq!(index_config.doc_mapping.field_mappings[0].name, "tenant_id");
assert_eq!(index_config.doc_mapping.field_mappings[1].name, "timestamp");
@@ -67,7 +67,7 @@ pub struct DefaultDocMapperBuilder {
/// how the unmapped fields should be handled.
#[serde(default)]
pub dynamic_mapping: Option<QuickwitJsonOptions>,
/// Custom tokenizers.
/// User-defined tokenizers.
#[serde(default)]
pub tokenizers: Vec<TokenizerEntry>,
}
4 changes: 2 additions & 2 deletions quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs
@@ -40,8 +40,8 @@ pub(crate) use self::field_mapping_entry::{
};
pub(crate) use self::field_mapping_type::FieldMappingType;
pub use self::tokenizer_entry::{
NgramTokenizerOption, RegexTokenizerOption, TokenFilterType, TokenizerConfig, TokenizerEntry,
TokenizerType,
analyze_text, NgramTokenizerOption, RegexTokenizerOption, TokenFilterType, TokenizerConfig,
TokenizerEntry, TokenizerType,
};
use crate::QW_RESERVED_FIELD_NAMES;

@@ -21,7 +21,8 @@ use anyhow::Context;
use quickwit_query::DEFAULT_REMOVE_TOKEN_LENGTH;
use serde::{Deserialize, Serialize};
use tantivy::tokenizer::{
AsciiFoldingFilter, LowerCaser, NgramTokenizer, RegexTokenizer, RemoveLongFilter, TextAnalyzer, SimpleTokenizer,
AsciiFoldingFilter, LowerCaser, NgramTokenizer, RegexTokenizer, RemoveLongFilter,
SimpleTokenizer, TextAnalyzer, Token,
};

/// A `TokenizerEntry` defines a custom tokenizer with its name and configuration.
@@ -34,6 +35,7 @@ pub struct TokenizerEntry {
pub(crate) config: TokenizerConfig,
}

/// Tokenizer configuration.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)]
pub struct TokenizerConfig {
#[serde(flatten)]
@@ -43,6 +45,7 @@ pub struct TokenizerConfig {
}

impl TokenizerConfig {
/// Build a `TextAnalyzer` from a `TokenizerConfig`.
pub fn text_analyzer(&self) -> anyhow::Result<TextAnalyzer> {
let mut text_analyzer_builder = match &self.tokenizer_type {
TokenizerType::Simple => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
@@ -75,6 +78,17 @@ impl TokenizerConfig {
}
}

/// Helper function to analyze a text with a given `TokenizerConfig`.
pub fn analyze_text(text: &str, tokenizer: &TokenizerConfig) -> anyhow::Result<Vec<Token>> {
let mut text_analyzer = tokenizer.text_analyzer()?;
let mut token_stream = text_analyzer.token_stream(text);
let mut tokens = Vec::new();
token_stream.process(&mut |token| {
tokens.push(token.clone());
});
Ok(tokens)
}

#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, utoipa::ToSchema)]
#[serde(rename_all = "snake_case")]
pub enum TokenFilterType {
@@ -129,6 +143,7 @@ pub struct RegexTokenizerOption {
#[cfg(test)]
mod tests {
use super::{NgramTokenizerOption, TokenizerType};
use crate::default_doc_mapper::RegexTokenizerOption;
use crate::TokenizerEntry;

#[test]
@@ -139,13 +154,13 @@
{
"name": "my_tokenizer",
"type": "ngram",
"min_gram": 1,
"max_gram": 3,
"filters": [
"remove_long",
"lower_caser",
"ascii_folding"
],
"min_gram": 1,
"max_gram": 3
]
}
"#,
);
@@ -175,13 +190,13 @@
{
"name": "my_tokenizer",
"type": "ngram",
"min_gram": 1,
"max_gram": 3,
"filters": [
"remove_long",
"lower_caser",
"ascii_folding"
],
"min_gram": 1,
"max_gram": 3,
"abc": 123
}
"#,
@@ -194,7 +209,7 @@
}

#[test]
fn test_deserialize_tokenizer_entry_regex() {
fn test_tokenizer_entry_regex() {
let result: Result<TokenizerEntry, serde_json::Error> =
serde_json::from_str::<TokenizerEntry>(
r#"
@@ -206,5 +221,18 @@
"#,
);
assert!(result.is_ok());
let tokenizer_config_entry = result.unwrap();
assert_eq!(tokenizer_config_entry.config.filters.len(), 0);
match tokenizer_config_entry.config.tokenizer_type {
TokenizerType::Regex(options) => {
assert_eq!(
options,
RegexTokenizerOption {
pattern: "(my_pattern)".to_string(),
}
)
}
_ => panic!("Unexpected tokenizer type"),
}
}
}
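
For context, here is a minimal sketch, not part of this commit, of how the re-exported TokenizerConfig and the new analyze_text helper could be used programmatically. It assumes serde_json and anyhow are available as dependencies, and the sample input string is purely illustrative:

use quickwit_doc_mapper::{analyze_text, TokenizerConfig};

fn main() -> anyhow::Result<()> {
    // Deserialize a tokenizer config using the same JSON shape as the tests above;
    // the tokenizer type and its options are flattened into `TokenizerConfig`.
    let config: TokenizerConfig = serde_json::from_str(
        r#"{ "type": "regex", "pattern": "\\w*", "filters": ["lower_caser"] }"#,
    )?;
    // Tokenize a sample string and print each token with its byte offsets.
    for token in analyze_text("HDFS DataNode-02", &config)? {
        println!("{} [{}..{}]", token.text, token.offset_from, token.offset_to);
    }
    Ok(())
}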
6 changes: 3 additions & 3 deletions quickwit/quickwit-doc-mapper/src/lib.rs
@@ -35,13 +35,13 @@ mod routing_expression;
pub mod tag_pruning;

pub use default_doc_mapper::{
DefaultDocMapper, DefaultDocMapperBuilder, FieldMappingEntry, ModeType, QuickwitJsonOptions,
TokenizerEntry,
analyze_text, DefaultDocMapper, DefaultDocMapperBuilder, FieldMappingEntry, ModeType,
QuickwitJsonOptions, TokenizerConfig, TokenizerEntry,
};
use default_doc_mapper::{
FastFieldOptions, FieldMappingEntryForSerialization, IndexRecordOptionSchema,
NgramTokenizerOption, QuickwitTextNormalizer, QuickwitTextTokenizer, RegexTokenizerOption,
TokenFilterType, TokenizerConfig, TokenizerType,
TokenFilterType, TokenizerType,
};
pub use doc_mapper::{DocMapper, JsonObject, NamedField, TermRange, WarmupInfo};
pub use error::{DocParsingError, QueryParserError};
85 changes: 85 additions & 0 deletions quickwit/quickwit-serve/src/index_api/rest_handler.rs
@@ -28,6 +28,7 @@ use quickwit_config::{
CLI_INGEST_SOURCE_ID, INGEST_API_SOURCE_ID,
};
use quickwit_core::{IndexService, IndexServiceError};
use quickwit_doc_mapper::{analyze_text, TokenizerConfig};
use quickwit_metastore::{
IndexMetadata, ListSplitsQuery, Metastore, MetastoreError, Split, SplitState,
};
@@ -82,6 +83,8 @@
.or(create_source_handler(index_service.clone()))
.or(get_source_handler(index_service.metastore()))
.or(delete_source_handler(index_service.metastore()))
// Tokenizer handlers.
.or(analyze_request_handler())
}

fn json_body<T: DeserializeOwned + Send>(
@@ -718,6 +721,47 @@ async fn delete_source(
Ok(())
}

#[derive(Debug, Deserialize, utoipa::IntoParams, utoipa::ToSchema)]
struct AnalyzeRequest {
/// The tokenizer to use.
#[serde(flatten)]
pub tokenizer_config: TokenizerConfig,
/// The text to analyze.
pub text: String,
}

fn analyze_request_filter() -> impl Filter<Extract = (AnalyzeRequest,), Error = Rejection> + Clone {
warp::path!("analyze")
.and(warp::post())
.and(warp::body::json())
}

fn analyze_request_handler() -> impl Filter<Extract = (impl warp::Reply,), Error = Rejection> + Clone
{
analyze_request_filter()
.then(analyze_request)
.and(extract_format_from_qs())
.map(make_json_api_response)
}

/// Analyzes text with given tokenizer config and returns the list of tokens.
#[utoipa::path(
post,
tag = "analyze",
path = "/analyze",
request_body = AnalyzeRequest,
responses(
(status = 200, description = "Successfully analyze text.")
),
)]
async fn analyze_request(request: AnalyzeRequest) -> Result<serde_json::Value, IndexServiceError> {
let tokens = analyze_text(&request.text, &request.tokenizer_config)
.map_err(|err| IndexServiceError::Internal(err.to_string()))?;
let json_value =
serde_json::to_value(tokens).map_err(|err| IndexServiceError::Internal(err.to_string()))?;
Ok(json_value)
}

#[cfg(test)]
mod tests {
use std::ops::{Bound, RangeInclusive};
@@ -1642,4 +1686,45 @@ mod tests {
assert_eq!(resp.status(), 405);
Ok(())
}

#[tokio::test]
async fn test_analyze_request() {
let mut metastore = MockMetastore::new();
metastore
.expect_index_metadata()
.return_once(|_index_id: &str| {
Ok(IndexMetadata::for_test(
"test-index",
"ram:///indexes/test-index",
))
});
let index_service = IndexService::new(Arc::new(metastore), StorageResolver::unconfigured());
let index_management_handler = super::index_management_handlers(
Arc::new(index_service),
Arc::new(QuickwitConfig::for_test()),
)
.recover(recover_fn);
let resp = warp::test::request()
.path("/analyze")
.method("POST")
.json(&true)
.body(r#"{"type": "ngram", "min_gram": 3, "max_gram": 3, "text": "Hel", "filters": ["lower_caser"]}"#)
.reply(&index_management_handler)
.await;
assert_eq!(resp.status(), 200);
let actual_response_json: JsonValue = serde_json::from_slice(resp.body()).unwrap();
let expected_response_json = serde_json::json!([
{
"offset_from": 0,
"offset_to": 3,
"position": 0,
"position_length": 1,
"text": "hel"
}
]);
assert_json_include!(
actual: actual_response_json,
expected: expected_response_json
);
}
}
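
Putting the new endpoint together, the request/response pair exercised by test_analyze_request above roughly corresponds to the following exchange. The externally visible path prefix is an assumption, since the test hits the handler directly at /analyze, and the actual response may contain additional fields because the test only asserts JSON inclusion:

POST /analyze
{
  "type": "ngram",
  "min_gram": 3,
  "max_gram": 3,
  "filters": ["lower_caser"],
  "text": "Hel"
}

HTTP 200
[
  {
    "offset_from": 0,
    "offset_to": 3,
    "position": 0,
    "position_length": 1,
    "text": "hel"
  }
]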
