Skip to content

Commit

Permalink
Clean up DefaultDocMapperBuilder (#5085)
Browse files Browse the repository at this point in the history
  • Loading branch information
guilload authored Jun 6, 2024
1 parent a3e912a commit e69d3c5
Show file tree
Hide file tree
Showing 28 changed files with 455 additions and 434 deletions.
1 change: 0 additions & 1 deletion quickwit/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion quickwit/quickwit-config/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ vrl = { workspace = true, optional = true }

quickwit-common = { workspace = true }
quickwit-doc-mapper = { workspace = true }
quickwit-macros = { workspace = true }
quickwit-proto = { workspace = true }

[dev-dependencies]
Expand Down
104 changes: 11 additions & 93 deletions quickwit/quickwit-config/src/index_config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,7 @@ use chrono::Utc;
use cron::Schedule;
use humantime::parse_duration;
use quickwit_common::uri::Uri;
use quickwit_doc_mapper::{
DefaultDocMapper, DefaultDocMapperBuilder, DocMapper, FieldMappingEntry, Mode, ModeType,
QuickwitJsonOptions, TokenizerEntry,
};
use quickwit_doc_mapper::{DefaultDocMapperBuilder, DocMapper, DocMapping, Mode};
use quickwit_proto::types::IndexId;
use serde::{Deserialize, Serialize};
pub use serialize::load_index_config_from_user_config;
Expand All @@ -44,57 +41,6 @@ use crate::index_config::serialize::VersionedIndexConfig;
use crate::merge_policy_config::{MergePolicyConfig, StableLogMergePolicyConfig};
use crate::TestableForRegression;

// Note(fmassot): `DocMapping` is a struct only used for
// serialization/deserialization of `DocMapper` parameters.
// This is partly a duplicate of the `DefaultDocMapper` and
// can be viewed as a temporary hack for 0.2 release before
// refactoring.
#[quickwit_macros::serde_multikey]
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, utoipa::ToSchema)]
#[serde(deny_unknown_fields)]
pub struct DocMapping {
#[serde(default)]
#[schema(value_type = Vec<FieldMappingEntryForSerialization>)]
/// The mapping of the index schema fields.
///
/// This defines the name, type and other information about the field(s).
///
/// Properties are determined by the specified type, for more information
/// please see: <https://quickwit.io/docs/configuration/index-config#field-types>
pub field_mappings: Vec<FieldMappingEntry>,
#[schema(value_type = Vec<String>)]
#[serde(default)]
pub tag_fields: BTreeSet<String>,
#[serde(default)]
pub store_source: bool,
#[serde(default)]
pub index_field_presence: bool,
#[serde(default)]
pub timestamp_field: Option<String>,
#[serde_multikey(
deserializer = Mode::from_parts,
serializer = Mode::into_parts,
fields = (
#[serde(default)]
mode: ModeType,
#[serde(skip_serializing_if = "Option::is_none")]
dynamic_mapping: Option<QuickwitJsonOptions>
),
)]
pub mode: Mode,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub partition_key: Option<String>,
#[schema(value_type = u32)]
#[serde(default = "DefaultDocMapper::default_max_num_partitions")]
pub max_num_partitions: NonZeroU32,
#[serde(default)]
pub tokenizers: Vec<TokenizerEntry>,
/// Record document length
#[serde(default)]
pub document_length: bool,
}

#[derive(Clone, Debug, Serialize, Deserialize, utoipa::ToSchema)]
#[serde(deny_unknown_fields)]
pub struct IndexingResources {
Expand Down Expand Up @@ -440,24 +386,21 @@ impl TestableForRegression for IndexConfig {
)
.unwrap();
let doc_mapping = DocMapping {
index_field_presence: true,
mode: Mode::default(),
field_mappings: vec![
tenant_id_mapping,
timestamp_mapping,
log_level_mapping,
message_mapping,
],
tag_fields: ["tenant_id", "log_level"]
.into_iter()
.map(|tag_field| tag_field.to_string())
.collect::<BTreeSet<String>>(),
store_source: true,
mode: Mode::default(),
timestamp_field: Some("timestamp".to_string()),
tag_fields: BTreeSet::from_iter(["tenant_id".to_string(), "log_level".to_string()]),
partition_key: Some("tenant_id".to_string()),
max_num_partitions: NonZeroU32::new(100).unwrap(),
timestamp_field: Some("timestamp".to_string()),
index_field_presence: true,
store_document_size: false,
store_source: true,
tokenizers: vec![tokenizer],
document_length: false,
};
let retention_policy = Some(RetentionPolicy {
retention_period: "90 days".to_string(),
Expand Down Expand Up @@ -496,46 +439,20 @@ impl TestableForRegression for IndexConfig {
fn assert_equality(&self, other: &Self) {
assert_eq!(self.index_id, other.index_id);
assert_eq!(self.index_uri, other.index_uri);
assert_eq!(
self.doc_mapping
.field_mappings
.iter()
.map(|field_mapping| &field_mapping.name)
.collect::<Vec<_>>(),
other
.doc_mapping
.field_mappings
.iter()
.map(|field_mapping| &field_mapping.name)
.collect::<Vec<_>>(),
);
assert_eq!(self.doc_mapping.tag_fields, other.doc_mapping.tag_fields,);
assert_eq!(
self.doc_mapping.store_source,
other.doc_mapping.store_source,
);
assert_eq!(self.doc_mapping, other.doc_mapping);
assert_eq!(self.indexing_settings, other.indexing_settings);
assert_eq!(self.search_settings, other.search_settings);
}
}

/// Builds and returns the doc mapper associated with index.
/// Builds and returns the doc mapper associated with an index.
pub fn build_doc_mapper(
doc_mapping: &DocMapping,
search_settings: &SearchSettings,
) -> anyhow::Result<Arc<dyn DocMapper>> {
let builder = DefaultDocMapperBuilder {
store_source: doc_mapping.store_source,
index_field_presence: doc_mapping.index_field_presence,
doc_mapping: doc_mapping.clone(),
default_search_fields: search_settings.default_search_fields.clone(),
timestamp_field: doc_mapping.timestamp_field.clone(),
field_mappings: doc_mapping.field_mappings.clone(),
tag_fields: doc_mapping.tag_fields.iter().cloned().collect(),
mode: doc_mapping.mode.clone(),
partition_key: doc_mapping.partition_key.clone(),
max_num_partitions: doc_mapping.max_num_partitions,
tokenizers: doc_mapping.tokenizers.clone(),
document_length: doc_mapping.document_length,
};
Ok(Arc::new(builder.try_build()?))
}
Expand Down Expand Up @@ -571,6 +488,7 @@ pub(super) fn validate_index_config(
mod tests {

use cron::TimeUnitSpec;
use quickwit_doc_mapper::ModeType;

use super::*;
use crate::merge_policy_config::MergePolicyConfig;
Expand Down
9 changes: 5 additions & 4 deletions quickwit/quickwit-config/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ pub use cluster_config::ClusterConfig;
// See #2048
use index_config::serialize::{IndexConfigV0_8, VersionedIndexConfig};
pub use index_config::{
build_doc_mapper, load_index_config_from_user_config, DocMapping, IndexConfig,
IndexingResources, IndexingSettings, RetentionPolicy, SearchSettings,
build_doc_mapper, load_index_config_from_user_config, IndexConfig, IndexingResources,
IndexingSettings, RetentionPolicy, SearchSettings,
};
pub use quickwit_doc_mapper::DocMapping;
use serde::de::DeserializeOwned;
use serde::Serialize;
use serde_json::Value as JsonValue;
Expand Down Expand Up @@ -225,7 +226,7 @@ impl ConfigFormat {
serde_json::from_reader(StripComments::new(payload))?;
let version_value = json_value.get_mut("version").context("missing version")?;
if let Some(version_number) = version_value.as_u64() {
warn!(version_value=?version_value, "`version` is supposed to be a string");
warn!(version_value=?version_value, "`version` should be a string");
*version_value = JsonValue::String(version_number.to_string());
}
serde_json::from_value(json_value).context("failed to parse JSON file")
Expand All @@ -237,7 +238,7 @@ impl ConfigFormat {
toml::from_str(payload_str).context("failed to parse TOML file")?;
let version_value = toml_value.get_mut("version").context("missing version")?;
if let Some(version_number) = version_value.as_integer() {
warn!(version_value=?version_value, "`version` is supposed to be a string");
warn!(version_value=?version_value, "`version` should be a string");
*version_value = toml::Value::String(version_number.to_string());
let reserialized = toml::to_string(version_value)
.context("failed to reserialize toml config")?;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ mod tests {
assert_eq!(field_mapping_entry.name, "updated_at");

let date_time_options = match field_mapping_entry.mapping_type {
FieldMappingType::DateTime(date_time_options, Cardinality::SingleValue) => {
FieldMappingType::DateTime(date_time_options, Cardinality::SingleValued) => {
date_time_options
}
_ => panic!("Expected a date time field mapping"),
Expand Down Expand Up @@ -226,7 +226,7 @@ mod tests {
assert_eq!(field_mapping_entry.name, "updated_at");

let date_time_options = match field_mapping_entry.mapping_type {
FieldMappingType::DateTime(date_time_options, Cardinality::MultiValues) => {
FieldMappingType::DateTime(date_time_options, Cardinality::MultiValued) => {
date_time_options
}
_ => panic!("Expected a date time field mapping."),
Expand Down
Loading

0 comments on commit e69d3c5

Please sign in to comment.