Skip to content

Commit

Permalink
Faster document validation (#5163)
Browse files Browse the repository at this point in the history
* Faster document validation

This PR just removes the need to build documents when validating
documents.

* Apply suggestions from code review

Co-authored-by: Adrien Guillo <[email protected]>

* minor change

---------

Co-authored-by: Adrien Guillo <[email protected]>
  • Loading branch information
fulmicoton and guilload authored Jun 27, 2024
1 parent 568561c commit 6ac0ef8
Show file tree
Hide file tree
Showing 7 changed files with 327 additions and 60 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ impl Default for QuickwitDateTimeOptions {
}

impl QuickwitDateTimeOptions {
pub(crate) fn parse_json(&self, json_value: JsonValue) -> Result<TantivyValue, String> {
pub(crate) fn parse_json(&self, json_value: &JsonValue) -> Result<TantivyValue, String> {
let date_time = match json_value {
JsonValue::Number(timestamp) => {
// `.as_f64()` actually converts floats to integers, so we must check for integers
Expand All @@ -87,7 +87,7 @@ impl QuickwitDateTimeOptions {
}
}
JsonValue::String(date_time_str) => {
quickwit_datetime::parse_date_time_str(&date_time_str, &self.input_formats.0)?
quickwit_datetime::parse_date_time_str(date_time_str, &self.input_formats.0)?
}
_ => {
return Err(format!(
Expand Down Expand Up @@ -383,7 +383,7 @@ mod tests {
let expected_timestamp = datetime!(2012-05-21 12:09:14 UTC).unix_timestamp();
{
let json_value = serde_json::json!("2012-05-21T12:09:14-00:00");
let tantivy_value = date_time_options.parse_json(json_value).unwrap();
let tantivy_value = date_time_options.parse_json(&json_value).unwrap();
let date_time = match tantivy_value {
TantivyValue::Date(date_time) => date_time,
other => panic!("Expected a tantivy date time, got `{other:?}`."),
Expand All @@ -392,7 +392,7 @@ mod tests {
}
{
let json_value = serde_json::json!(expected_timestamp);
let tantivy_value = date_time_options.parse_json(json_value).unwrap();
let tantivy_value = date_time_options.parse_json(&json_value).unwrap();
let date_time = match tantivy_value {
TantivyValue::Date(date_time) => date_time,
other => panic!("Expected a tantivy date time, got `{other:?}`."),
Expand All @@ -401,7 +401,7 @@ mod tests {
}
{
let json_value = serde_json::json!(expected_timestamp as f64);
let tantivy_value = date_time_options.parse_json(json_value).unwrap();
let tantivy_value = date_time_options.parse_json(&json_value).unwrap();
let date_time = match tantivy_value {
TantivyValue::Date(date_time) => date_time,
other => panic!("Expected a tantivy date time, got `{other:?}`."),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ use crate::doc_mapper::{JsonObject, Partition};
use crate::query_builder::build_query;
use crate::routing_expression::RoutingExpr;
use crate::{
Cardinality, DocMapper, DocMapping, DocParsingError, Mode, QueryParserError, TokenizerEntry,
WarmupInfo, DOCUMENT_SIZE_FIELD_NAME, DYNAMIC_FIELD_NAME, FIELD_PRESENCE_FIELD_NAME,
SOURCE_FIELD_NAME,
Cardinality, DocMapper, DocMapping, DocParsingError, Mode, ModeType, QueryParserError,
TokenizerEntry, WarmupInfo, DOCUMENT_SIZE_FIELD_NAME, DYNAMIC_FIELD_NAME,
FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME,
};

const FIELD_PRESENCE_FIELD: Field = Field::from_field_id(0u32);
Expand Down Expand Up @@ -507,6 +507,14 @@ impl DocMapper for DefaultDocMapper {
self.doc_mapping_uid
}

fn validate_json_obj(&self, json_obj: &JsonObject) -> Result<(), DocParsingError> {
let is_strict = self.mode.mode_type() == ModeType::Strict;
let mut field_path = Vec::new();
self.field_mappings
.validate_from_json(json_obj, is_strict, &mut field_path)?;
Ok(())
}

fn doc_from_json_obj(
&self,
json_obj: JsonObject,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ impl BinaryFormat {
}

/// Parses the `serde_json::Value` into `tantivy::schema::Value`.
pub fn parse_json(&self, json_val: JsonValue) -> Result<TantivyValue, String> {
pub fn parse_json(&self, json_val: &JsonValue) -> Result<TantivyValue, String> {
let byte_str = if let JsonValue::String(byte_str) = json_val {
byte_str
} else {
Expand All @@ -217,11 +217,11 @@ impl BinaryFormat {
};
let payload = match self {
Self::Base64 => BASE64_STANDARD
.decode(&byte_str)
.decode(byte_str)
.map_err(|base64_decode_err| {
format!("expected base64 string, got `{byte_str}`: {base64_decode_err}")
})?,
Self::Hex => hex::decode(&byte_str).map_err(|hex_decode_err| {
Self::Hex => hex::decode(byte_str).map_err(|hex_decode_err| {
format!("expected hex string, got `{byte_str}`: {hex_decode_err}")
})?,
};
Expand Down
138 changes: 131 additions & 7 deletions quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,55 @@ pub(crate) fn map_primitive_json_to_tantivy(value: JsonValue) -> Option<TantivyV
}

impl LeafType {
fn validate_from_json(&self, json_val: &JsonValue) -> Result<(), String> {
match self {
LeafType::Text(_) => {
if let JsonValue::String(_) = json_val {
Ok(())
} else {
Err(format!("expected string, got `{json_val}`"))
}
}
LeafType::I64(numeric_options) => {
i64::from_json_to_self(json_val, numeric_options.coerce).map(|_| ())
}
LeafType::U64(numeric_options) => {
u64::from_json_to_self(json_val, numeric_options.coerce).map(|_| ())
}
LeafType::F64(numeric_options) => {
f64::from_json_to_self(json_val, numeric_options.coerce).map(|_| ())
}
LeafType::Bool(_) => {
if let JsonValue::Bool(_) = json_val {
Ok(())
} else {
Err(format!("expected boolean, got `{json_val}`"))
}
}
LeafType::IpAddr(_) => {
let JsonValue::String(ip_address) = json_val else {
return Err(format!("expected string, got `{json_val}`"));
};
IpAddr::from_str(ip_address.as_str())
.map_err(|err| format!("failed to parse IP address `{ip_address}`: {err}"))?;
Ok(())
}
LeafType::DateTime(date_time_options) => {
date_time_options.parse_json(json_val).map(|_| ())
}
LeafType::Bytes(binary_options) => {
binary_options.input_format.parse_json(json_val).map(|_| ())
}
LeafType::Json(_) => {
if let JsonValue::Object(_) = json_val {
Ok(())
} else {
Err(format!("expected object, got `{json_val}`"))
}
}
}
}

fn value_from_json(&self, json_val: JsonValue) -> Result<TantivyValue, String> {
match self {
LeafType::Text(_) => {
Expand Down Expand Up @@ -187,8 +236,8 @@ impl LeafType {
Err(format!("expected string, got `{json_val}`"))
}
}
LeafType::DateTime(date_time_options) => date_time_options.parse_json(json_val),
LeafType::Bytes(binary_options) => binary_options.input_format.parse_json(json_val),
LeafType::DateTime(date_time_options) => date_time_options.parse_json(&json_val),
LeafType::Bytes(binary_options) => binary_options.input_format.parse_json(&json_val),
LeafType::Json(_) => {
if let JsonValue::Object(json_obj) = json_val {
Ok(TantivyValue::Object(
Expand Down Expand Up @@ -217,11 +266,11 @@ impl LeafType {
}
}
LeafType::I64(numeric_options) => {
let val = i64::from_json_to_self(json_val, numeric_options.coerce)?;
let val = i64::from_json_to_self(&json_val, numeric_options.coerce)?;
Ok(OneOrIter::one((val).into()))
}
LeafType::U64(numeric_options) => {
let val = u64::from_json_to_self(json_val, numeric_options.coerce)?;
let val = u64::from_json_to_self(&json_val, numeric_options.coerce)?;
Ok(OneOrIter::one((val).into()))
}
LeafType::F64(_) => Err("unsuported concat type: f64".to_string()),
Expand Down Expand Up @@ -276,6 +325,38 @@ pub(crate) struct MappingLeaf {
}

impl MappingLeaf {
fn validate_from_json(
&self,
json_value: &JsonValue,
path: &[&str],
) -> Result<(), DocParsingError> {
if json_value.is_null() {
// We just ignore `null`.
return Ok(());
}
if let JsonValue::Array(els) = json_value {
if self.cardinality == Cardinality::SingleValued {
return Err(DocParsingError::MultiValuesNotSupported(path.join(".")));
}
for el_json_val in els {
if el_json_val.is_null() {
// We just ignore `null`.
continue;
}
self.typ
.validate_from_json(el_json_val)
.map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?;
}
return Ok(());
}

self.typ
.validate_from_json(json_value)
.map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?;

Ok(())
}

pub fn doc_from_json(
&self,
json_val: JsonValue,
Expand Down Expand Up @@ -610,9 +691,9 @@ fn insert_json_val(
trait NumVal: Sized + FromStr + ToString + Into<TantivyValue> {
fn from_json_number(num: &serde_json::Number) -> Option<Self>;

fn from_json_to_self(json_val: JsonValue, coerce: bool) -> Result<Self, String> {
fn from_json_to_self(json_val: &JsonValue, coerce: bool) -> Result<Self, String> {
match json_val {
JsonValue::Number(num_val) => Self::from_json_number(&num_val).ok_or_else(|| {
JsonValue::Number(num_val) => Self::from_json_number(num_val).ok_or_else(|| {
format!(
"expected {}, got inconvertible JSON number `{}`",
type_name::<Self>(),
Expand Down Expand Up @@ -647,7 +728,7 @@ trait NumVal: Sized + FromStr + ToString + Into<TantivyValue> {
}

fn from_json(json_val: JsonValue, coerce: bool) -> Result<TantivyValue, String> {
Self::from_json_to_self(json_val, coerce).map(Self::into)
Self::from_json_to_self(&json_val, coerce).map(Self::into)
}

fn to_json(&self, output_format: NumericOutputFormat) -> Option<JsonValue>;
Expand Down Expand Up @@ -790,6 +871,26 @@ impl MappingNode {
field_mapping_entries
}

pub fn validate_from_json<'a>(
&self,
json_obj: &'a serde_json::Map<String, JsonValue>,
strict_mode: bool,
path: &mut Vec<&'a str>,
) -> Result<(), DocParsingError> {
for (field_name, json_val) in json_obj {
if let Some(child_tree) = self.branches.get(field_name) {
path.push(field_name.as_str());
child_tree.validate_from_json(json_val, path, strict_mode)?;
path.pop();
} else if strict_mode {
path.push(field_name);
let field_path = path.join(".");
return Err(DocParsingError::NoSuchFieldInSchema(field_path));
}
}
Ok(())
}

pub fn doc_from_json(
&self,
json_obj: serde_json::Map<String, JsonValue>,
Expand Down Expand Up @@ -878,6 +979,29 @@ pub(crate) enum MappingTree {
}

impl MappingTree {
fn validate_from_json<'a>(
&self,
json_value: &'a JsonValue,
field_path: &mut Vec<&'a str>,
strict_mode: bool,
) -> Result<(), DocParsingError> {
match self {
MappingTree::Leaf(mapping_leaf) => {
mapping_leaf.validate_from_json(json_value, field_path)
}
MappingTree::Node(mapping_node) => {
if let JsonValue::Object(json_obj) = json_value {
mapping_node.validate_from_json(json_obj, strict_mode, field_path)
} else {
Err(DocParsingError::ValueError(
field_path.join("."),
format!("expected an JSON object, got {json_value}"),
))
}
}
}
}

fn doc_from_json(
&self,
json_value: JsonValue,
Expand Down
Loading

0 comments on commit 6ac0ef8

Please sign in to comment.