Improve tests and add endpoint to analyze text.
fmassot committed Jul 4, 2023
1 parent 0dd5aae commit a6cd4f4
Showing 9 changed files with 145 additions and 16 deletions.
@@ -4,6 +4,13 @@
"index_id": "hdfs-logs",
"index_uri": "s3://quickwit-indexes/hdfs-logs",
"doc_mapping": {
"tokenizers": [
{
"name": "service_regex",
"type": "regex",
"pattern": "\\w*"
}
],
"field_mappings": [
{
"name": "tenant_id",
@@ -33,7 +40,7 @@
{
"name": "service",
"type": "text",
"tokenizer": "raw"
"tokenizer": "service_regex"
}
]
}
@@ -3,12 +3,15 @@ index_id = "hdfs-logs"
index_uri = "s3://quickwit-indexes/hdfs-logs"

[doc_mapping]
tokenizers = [
{ name = "service_regex", type = "regex", pattern = "\\w*" },
]
field_mappings = [
{ name = "tenant_id", type = "u64", fast = true },
{ name = "timestamp", type = "datetime", fast = true },
{ name = "severity_text", type = "text", tokenizer = "raw" },
{ name = "body", type = "text", tokenizer = "default", record = "position" },
{ name = "resource", type = "object", field_mappings = [ { name = "service", type = "text", tokenizer = "raw" } ] },
{ name = "resource", type = "object", field_mappings = [ { name = "service", type = "text", tokenizer = "service_regex" } ] },
]
tag_fields = [ "tenant_id" ]
store_source = true
@@ -3,6 +3,10 @@ index_id: hdfs-logs
index_uri: s3://quickwit-indexes/hdfs-logs

doc_mapping:
tokenizers:
- name: service_regex
type: regex
pattern: "\\w*"
field_mappings:
- name: tenant_id
type: u64
@@ -22,7 +26,7 @@ doc_mapping:
field_mappings:
- name: service
type: text
tokenizer: raw
tokenizer: service_regex
tag_fields: [tenant_id]
timestamp_field: timestamp
store_source: true
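
As a side note, the service_regex tokenizer declared in the three configuration formats above can be tried out with the analyze endpoint added to quickwit-serve later in this commit. A hypothetical request body — the sample text is purely illustrative, and filters is omitted since it appears to default to an empty list — could look like:

{
  "type": "regex",
  "pattern": "\\w*",
  "text": "datanode-01 restarted"
}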
2 changes: 2 additions & 0 deletions quickwit/quickwit-config/src/index_config/mod.rs
@@ -551,6 +551,8 @@ mod tests {
&Uri::from_well_formed("s3://defaultbucket/"),
)
.unwrap();
assert_eq!(index_config.doc_mapping.tokenizers.len(), 1);
assert_eq!(index_config.doc_mapping.tokenizers[0].name, "service_regex");
assert_eq!(index_config.doc_mapping.field_mappings.len(), 5);
assert_eq!(index_config.doc_mapping.field_mappings[0].name, "tenant_id");
assert_eq!(index_config.doc_mapping.field_mappings[1].name, "timestamp");
@@ -67,7 +67,7 @@ pub struct DefaultDocMapperBuilder {
/// how the unmapped fields should be handled.
#[serde(default)]
pub dynamic_mapping: Option<QuickwitJsonOptions>,
/// Custom tokenizers.
/// User-defined tokenizers.
#[serde(default)]
pub tokenizers: Vec<TokenizerEntry>,
}
4 changes: 2 additions & 2 deletions quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs
@@ -40,8 +40,8 @@ pub(crate) use self::field_mapping_entry::{
};
pub(crate) use self::field_mapping_type::FieldMappingType;
pub use self::tokenizer_entry::{
NgramTokenizerOption, RegexTokenizerOption, TokenFilterType, TokenizerConfig, TokenizerEntry,
TokenizerType,
analyze_text, NgramTokenizerOption, RegexTokenizerOption, TokenFilterType, TokenizerConfig,
TokenizerEntry, TokenizerType,
};
use crate::QW_RESERVED_FIELD_NAMES;

@@ -21,7 +21,8 @@ use anyhow::Context;
use quickwit_query::DEFAULT_REMOVE_TOKEN_LENGTH;
use serde::{Deserialize, Serialize};
use tantivy::tokenizer::{
AsciiFoldingFilter, LowerCaser, NgramTokenizer, RegexTokenizer, RemoveLongFilter, TextAnalyzer, SimpleTokenizer,
AsciiFoldingFilter, LowerCaser, NgramTokenizer, RegexTokenizer, RemoveLongFilter,
SimpleTokenizer, TextAnalyzer, Token,
};

/// A `TokenizerEntry` defines a custom tokenizer with its name and configuration.
@@ -34,6 +35,7 @@ pub struct TokenizerEntry {
pub(crate) config: TokenizerConfig,
}

/// Tokenizer configuration.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)]
pub struct TokenizerConfig {
#[serde(flatten)]
@@ -43,6 +45,7 @@ pub struct TokenizerConfig {
}

impl TokenizerConfig {
/// Build a `TextAnalyzer` from a `TokenizerConfig`.
pub fn text_analyzer(&self) -> anyhow::Result<TextAnalyzer> {
let mut text_analyzer_builder = match &self.tokenizer_type {
TokenizerType::Simple => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
@@ -75,6 +78,17 @@ impl TokenizerConfig {
}
}

/// Helper function to analyze a text with a given `TokenizerConfig`.
pub fn analyze_text(text: &str, tokenizer: &TokenizerConfig) -> anyhow::Result<Vec<Token>> {
let mut text_analyzer = tokenizer.text_analyzer()?;
let mut token_stream = text_analyzer.token_stream(text);
let mut tokens = Vec::new();
token_stream.process(&mut |token| {
tokens.push(token.clone());
});
Ok(tokens)
}

#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, utoipa::ToSchema)]
#[serde(rename_all = "snake_case")]
pub enum TokenFilterType {
@@ -129,6 +143,7 @@ pub struct RegexTokenizerOption {
#[cfg(test)]
mod tests {
use super::{NgramTokenizerOption, TokenizerType};
use crate::default_doc_mapper::RegexTokenizerOption;
use crate::TokenizerEntry;

#[test]
@@ -139,13 +154,13 @@
{
"name": "my_tokenizer",
"type": "ngram",
"min_gram": 1,
"max_gram": 3,
"filters": [
"remove_long",
"lower_caser",
"ascii_folding"
],
"min_gram": 1,
"max_gram": 3
]
}
"#,
);
@@ -175,13 +190,13 @@
{
"name": "my_tokenizer",
"type": "ngram",
"min_gram": 1,
"max_gram": 3,
"filters": [
"remove_long",
"lower_caser",
"ascii_folding"
],
"min_gram": 1,
"max_gram": 3,
"abc": 123
}
"#,
@@ -194,7 +209,7 @@
}

#[test]
fn test_deserialize_tokenizer_entry_regex() {
fn test_tokenizer_entry_regex() {
let result: Result<TokenizerEntry, serde_json::Error> =
serde_json::from_str::<TokenizerEntry>(
r#"
@@ -206,5 +221,18 @@
"#,
);
assert!(result.is_ok());
let tokenizer_config_entry = result.unwrap();
assert_eq!(tokenizer_config_entry.config.filters.len(), 0);
match tokenizer_config_entry.config.tokenizer_type {
TokenizerType::Regex(options) => {
assert_eq!(
options,
RegexTokenizerOption {
pattern: "(my_pattern)".to_string(),
}
)
}
_ => panic!("Unexpected tokenizer type"),
}
}
}
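
For context, here is a minimal sketch, not part of this commit, of how the re-exported TokenizerConfig and the new analyze_text helper could be used programmatically. It assumes serde_json and anyhow are available as dependencies, and the sample input string is purely illustrative:

use quickwit_doc_mapper::{analyze_text, TokenizerConfig};

fn main() -> anyhow::Result<()> {
    // Deserialize a tokenizer config using the same JSON shape as the tests above;
    // the tokenizer type and its options are flattened into `TokenizerConfig`.
    let config: TokenizerConfig = serde_json::from_str(
        r#"{ "type": "regex", "pattern": "\\w*", "filters": ["lower_caser"] }"#,
    )?;
    // Tokenize a sample string and print each token with its byte offsets.
    for token in analyze_text("HDFS DataNode-02", &config)? {
        println!("{} [{}..{}]", token.text, token.offset_from, token.offset_to);
    }
    Ok(())
}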
6 changes: 3 additions & 3 deletions quickwit/quickwit-doc-mapper/src/lib.rs
@@ -35,13 +35,13 @@ mod routing_expression;
pub mod tag_pruning;

pub use default_doc_mapper::{
DefaultDocMapper, DefaultDocMapperBuilder, FieldMappingEntry, ModeType, QuickwitJsonOptions,
TokenizerEntry,
analyze_text, DefaultDocMapper, DefaultDocMapperBuilder, FieldMappingEntry, ModeType,
QuickwitJsonOptions, TokenizerConfig, TokenizerEntry,
};
use default_doc_mapper::{
FastFieldOptions, FieldMappingEntryForSerialization, IndexRecordOptionSchema,
NgramTokenizerOption, QuickwitTextNormalizer, QuickwitTextTokenizer, RegexTokenizerOption,
TokenFilterType, TokenizerConfig, TokenizerType,
TokenFilterType, TokenizerType,
};
pub use doc_mapper::{DocMapper, JsonObject, NamedField, TermRange, WarmupInfo};
pub use error::{DocParsingError, QueryParserError};
85 changes: 85 additions & 0 deletions quickwit/quickwit-serve/src/index_api/rest_handler.rs
@@ -28,6 +28,7 @@ use quickwit_config::{
CLI_INGEST_SOURCE_ID, INGEST_API_SOURCE_ID,
};
use quickwit_core::{IndexService, IndexServiceError};
use quickwit_doc_mapper::{analyze_text, TokenizerConfig};
use quickwit_metastore::{
IndexMetadata, ListSplitsQuery, Metastore, MetastoreError, Split, SplitState,
};
@@ -82,6 +83,8 @@
.or(create_source_handler(index_service.clone()))
.or(get_source_handler(index_service.metastore()))
.or(delete_source_handler(index_service.metastore()))
// Tokenizer handlers.
.or(analyze_request_handler())
}

fn json_body<T: DeserializeOwned + Send>(
@@ -718,6 +721,47 @@ async fn delete_source(
Ok(())
}

#[derive(Debug, Deserialize, utoipa::IntoParams, utoipa::ToSchema)]
struct AnalyzeRequest {
/// The tokenizer to use.
#[serde(flatten)]
pub tokenizer_config: TokenizerConfig,
/// The text to analyze.
pub text: String,
}

fn analyze_request_filter() -> impl Filter<Extract = (AnalyzeRequest,), Error = Rejection> + Clone {
warp::path!("analyze")
.and(warp::post())
.and(warp::body::json())
}

fn analyze_request_handler() -> impl Filter<Extract = (impl warp::Reply,), Error = Rejection> + Clone
{
analyze_request_filter()
.then(analyze_request)
.and(extract_format_from_qs())
.map(make_json_api_response)
}

/// Analyzes text with given tokenizer config and returns the list of tokens.
#[utoipa::path(
post,
tag = "analyze",
path = "/analyze",
request_body = AnalyzeRequest,
responses(
(status = 200, description = "Successfully analyze text.")
),
)]
async fn analyze_request(request: AnalyzeRequest) -> Result<serde_json::Value, IndexServiceError> {
let tokens = analyze_text(&request.text, &request.tokenizer_config)
.map_err(|err| IndexServiceError::Internal(err.to_string()))?;
let json_value =
serde_json::to_value(tokens).map_err(|err| IndexServiceError::Internal(err.to_string()))?;
Ok(json_value)
}

#[cfg(test)]
mod tests {
use std::ops::{Bound, RangeInclusive};
@@ -1642,4 +1686,45 @@ mod tests {
assert_eq!(resp.status(), 405);
Ok(())
}

#[tokio::test]
async fn test_analyze_request() {
let mut metastore = MockMetastore::new();
metastore
.expect_index_metadata()
.return_once(|_index_id: &str| {
Ok(IndexMetadata::for_test(
"test-index",
"ram:///indexes/test-index",
))
});
let index_service = IndexService::new(Arc::new(metastore), StorageResolver::unconfigured());
let index_management_handler = super::index_management_handlers(
Arc::new(index_service),
Arc::new(QuickwitConfig::for_test()),
)
.recover(recover_fn);
let resp = warp::test::request()
.path("/analyze")
.method("POST")
.json(&true)
.body(r#"{"type": "ngram", "min_gram": 3, "max_gram": 3, "text": "Hel", "filters": ["lower_caser"]}"#)
.reply(&index_management_handler)
.await;
assert_eq!(resp.status(), 200);
let actual_response_json: JsonValue = serde_json::from_slice(resp.body()).unwrap();
let expected_response_json = serde_json::json!([
{
"offset_from": 0,
"offset_to": 3,
"position": 0,
"position_length": 1,
"text": "hel"
}
]);
assert_json_include!(
actual: actual_response_json,
expected: expected_response_json
);
}
}
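
Putting the new endpoint together, the request/response pair exercised by test_analyze_request above roughly corresponds to the following exchange. The externally visible path prefix is an assumption, since the test hits the handler directly at /analyze, and the actual response may contain additional fields because the test only asserts JSON inclusion:

POST /analyze
{
  "type": "ngram",
  "min_gram": 3,
  "max_gram": 3,
  "filters": ["lower_caser"],
  "text": "Hel"
}

HTTP 200
[
  {
    "offset_from": 0,
    "offset_to": 3,
    "position": 0,
    "position_length": 1,
    "text": "hel"
  }
]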
