Skip to content

Commit

Permalink
Add coerce and output format options for numeric fields (#3704)
Browse files Browse the repository at this point in the history
  • Loading branch information
guilload authored Aug 3, 2023
1 parent dcbe67a commit 8c2caf5
Show file tree
Hide file tree
Showing 16 changed files with 315 additions and 75 deletions.
16 changes: 9 additions & 7 deletions docs/configuration/index-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ Indexing with position is required to run phrase queries.

Quickwit handles three numeric types: `i64`, `u64`, and `f64`.

Numeric values can be stored in a fast field (the equivalent of Lucene's `DocValues`) which is a column-oriented storage.
Numeric values can be stored in a fast field (the equivalent of Lucene's `DocValues`), which is a column-oriented storage used for range queries and aggregations.

Example of a mapping for a u64 field:

Expand All @@ -179,12 +179,14 @@ fast: true

**Parameters for i64, u64, and f64 fields**

| Variable | Description | Default value |
| ------------- | ------------- | ------------- |
| `description` | Optional description for the field. | `None` |
| `stored` | Whether the field values are stored in the document store | `true` |
| `indexed` | Whether the field values are indexed | `true` |
| `fast` | Whether the field values are stored in a fast field | `false` |
| Variable | Description | Default value |
| --------------- | ------------- | ------------- |
| `description` | Optional description for the field. | `None` |
| `stored` | Whether the field values are stored in the document store. | `true` |
| `indexed` | Whether the field values are indexed. | `true` |
| `fast` | Whether the field values are stored in a fast field. | `false` |
| `coerce` | Whether to convert numbers passed as strings to integers or floats. | `true` |
| `output_format` | JSON type used to return numbers in search results. Possible values are `number` or `string`. | `number` |

#### `datetime` type

Expand Down
5 changes: 3 additions & 2 deletions quickwit/quickwit-doc-mapper/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@ quickwit-query = { workspace = true }
criterion = { workspace = true }
matches = { workspace = true }
proptest = { workspace = true }
quickwit-proto = { workspace = true }
quickwit-query = { workspace = true, features = ["testsuite"] }
serde_yaml = { workspace = true }
time = { workspace = true }

quickwit-proto = { workspace = true }
quickwit-query = { workspace = true, features = ["testsuite"] }

[features]
multilang = ["quickwit-query/multilang"]
testsuite = ["multilang"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,40 @@ pub struct QuickwitNumericOptions {
pub indexed: bool,
#[serde(default)]
pub fast: bool,
#[serde(default = "default_as_true")]
pub coerce: bool,
#[serde(default)]
pub output_format: NumericOutputFormat,
}

impl Default for QuickwitNumericOptions {
fn default() -> Self {
Self {
description: None,
indexed: true,
stored: true,
fast: false,
coerce: true,
output_format: NumericOutputFormat::default(),
}
}
}

/// Options accepted by a `bool` field mapping.
///
/// Unknown keys are rejected at deserialization time (`deny_unknown_fields`),
/// so numeric-only parameters such as `coerce` or `output_format` are not
/// accepted here.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)]
#[serde(deny_unknown_fields)]
pub struct QuickwitBoolOptions {
/// Optional description for the field. Defaults to `None` when absent and
/// is omitted from the serialized output when `None`.
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// Whether the field values are stored. When the key is absent, the value
/// is supplied by `default_as_true`.
// NOTE(review): presumably "stored" means the document store, as documented
// for numeric fields; confirm for bool fields.
#[serde(default = "default_as_true")]
pub stored: bool,
/// Whether the field values are indexed. When the key is absent, the value
/// is supplied by `default_as_true`.
#[serde(default = "default_as_true")]
pub indexed: bool,
/// Whether the field values are stored in a fast field. Falls back to
/// `bool::default()` (`false`) when the key is absent.
#[serde(default)]
pub fast: bool,
}

impl Default for QuickwitBoolOptions {
fn default() -> Self {
Self {
description: None,
Expand Down Expand Up @@ -150,15 +181,15 @@ pub enum BinaryFormat {
impl BinaryFormat {
pub fn as_str(&self) -> &str {
match self {
BinaryFormat::Base64 => "base64",
BinaryFormat::Hex => "hex",
Self::Base64 => "base64",
Self::Hex => "hex",
}
}

pub fn format_to_json(&self, value: &[u8]) -> JsonValue {
match self {
BinaryFormat::Base64 => BASE64_STANDARD.encode(value).into(),
BinaryFormat::Hex => hex::encode(value).into(),
Self::Base64 => BASE64_STANDARD.encode(value).into(),
Self::Hex => hex::encode(value).into(),
}
}

Expand All @@ -172,21 +203,27 @@ impl BinaryFormat {
));
};
let payload = match self {
BinaryFormat::Base64 => {
BASE64_STANDARD
.decode(&byte_str)
.map_err(|base64_decode_err| {
format!("Expected base64 string, got `{byte_str}`: {base64_decode_err}")
})?
}
BinaryFormat::Hex => hex::decode(&byte_str).map_err(|hex_decode_err| {
Self::Base64 => BASE64_STANDARD
.decode(&byte_str)
.map_err(|base64_decode_err| {
format!("Expected base64 string, got `{byte_str}`: {base64_decode_err}")
})?,
Self::Hex => hex::decode(&byte_str).map_err(|hex_decode_err| {
format!("Expected hex string, got `{byte_str}`: {hex_decode_err}")
})?,
};
Ok(TantivyValue::Bytes(payload))
}
}

/// JSON type used to return numbers in search results.
///
/// Variants are (de)serialized in snake case, i.e. `"number"` and `"string"`.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum NumericOutputFormat {
/// Return the value as a JSON number. This is the default format.
#[default]
Number,
/// Return the value as a JSON string.
String,
}

#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, utoipa::ToSchema)]
#[serde(deny_unknown_fields)]
pub struct QuickwitIpAddrOptions {
Expand Down Expand Up @@ -618,8 +655,8 @@ fn deserialize_mapping_type(
Ok(FieldMappingType::F64(numeric_options, cardinality))
}
Type::Bool => {
let numeric_options: QuickwitNumericOptions = serde_json::from_value(json)?;
Ok(FieldMappingType::Bool(numeric_options, cardinality))
let bool_options: QuickwitBoolOptions = serde_json::from_value(json)?;
Ok(FieldMappingType::Bool(bool_options, cardinality))
}
Type::IpAddr => {
let ip_addr_options: QuickwitIpAddrOptions = serde_json::from_value(json)?;
Expand Down Expand Up @@ -685,8 +722,8 @@ fn typed_mapping_to_json_params(
FieldMappingType::Text(text_options, _) => serialize_to_map(&text_options),
FieldMappingType::U64(options, _)
| FieldMappingType::I64(options, _)
| FieldMappingType::F64(options, _)
| FieldMappingType::Bool(options, _) => serialize_to_map(&options),
| FieldMappingType::F64(options, _) => serialize_to_map(&options),
FieldMappingType::Bool(options, _) => serialize_to_map(&options),
FieldMappingType::Bytes(options, _) => serialize_to_map(&options),
FieldMappingType::IpAddr(options, _) => serialize_to_map(&options),
FieldMappingType::DateTime(date_time_options, _) => serialize_to_map(&date_time_options),
Expand Down Expand Up @@ -1062,20 +1099,21 @@ mod tests {

#[test]
fn test_deserialize_i64_parsing_error_with_text_options() {
let result = serde_json::from_str::<FieldMappingEntry>(
let error = serde_json::from_str::<FieldMappingEntry>(
r#"
{
"name": "my_field_name",
"type": "i64",
"tokenizer": "basic"
}
"#,
);
let error = result.unwrap_err();
)
.unwrap_err();

assert_eq!(
error.to_string(),
"Error while parsing field `my_field_name`: unknown field `tokenizer`, expected one \
of `description`, `stored`, `indexed`, `fast`"
of `description`, `stored`, `indexed`, `fast`, `coerce`, `output_format`"
);
}

Expand Down Expand Up @@ -1146,6 +1184,8 @@ mod tests {
"stored": true,
"fast": false,
"indexed": true,
"coerce": true,
"output_format": "number"
})
);
Ok(())
Expand All @@ -1165,7 +1205,7 @@ mod tests {
.unwrap_err()
.to_string(),
"Error while parsing field `my_field_name`: unknown field `tokenizer`, expected one \
of `description`, `stored`, `indexed`, `fast`"
of `description`, `stored`, `indexed`, `fast`, `coerce`, `output_format`"
);
}

Expand Down Expand Up @@ -1232,6 +1272,8 @@ mod tests {
"stored": true,
"fast": false,
"indexed": true,
"coerce": true,
"output_format": "number"
})
);
}
Expand All @@ -1256,6 +1298,8 @@ mod tests {
"stored": true,
"fast": false,
"indexed": true,
"coerce": true,
"output_format": "number"
})
);
}
Expand Down Expand Up @@ -1616,6 +1660,8 @@ mod tests {
"stored": true,
"fast": false,
"indexed": true,
"coerce": true,
"output_format": "number"
})
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
use tantivy::schema::Type;

use super::date_time_type::QuickwitDateTimeOptions;
use super::field_mapping_entry::QuickwitBoolOptions;
use crate::default_doc_mapper::field_mapping_entry::{
QuickwitBytesOptions, QuickwitIpAddrOptions, QuickwitJsonOptions, QuickwitNumericOptions,
QuickwitObjectOptions, QuickwitTextOptions,
Expand All @@ -41,7 +42,7 @@ pub(crate) enum FieldMappingType {
/// 64-bit float mapping type configuration.
F64(QuickwitNumericOptions, Cardinality),
/// Bool mapping type configuration.
Bool(QuickwitNumericOptions, Cardinality),
Bool(QuickwitBoolOptions, Cardinality),
/// IP Address mapping type configuration.
IpAddr(QuickwitIpAddrOptions, Cardinality),
/// Bytes mapping type configuration.
Expand Down
Loading

0 comments on commit 8c2caf5

Please sign in to comment.