From 0c6a5f462ee70a686627d3e050476a60b1c4ee51 Mon Sep 17 00:00:00 2001 From: Vladimir Kuznichenkov Date: Sat, 28 Sep 2024 23:21:08 +0300 Subject: [PATCH] Replace regexp tokenizer with recursion parser In the current implementation, the RegexTokenizer uses regular expressions to tokenize the format string. However, regular expressions are not well-suited for parsing nested structures like nested brackets because they cannot match patterns with recursive depth. To address this, we'll write a custom recursive parser that can handle nested optional brackets. This parser will process the format string character by character, building the format items and managing the nesting of optional components. That change allowed me to cover `strict_date_optional_time` which has a nested depth according to the examples in [OS repo][2] Additional tests taken from [the doc][1] [1]: https://opensearch.org/docs/latest/field-types/supported-field-types/date/ [2]: https://github.com/opensearch-project/OpenSearch/blob/main/server/src/test/java/org/opensearch/common/time/DateFormattersTests.java --- quickwit/Cargo.lock | 1 - quickwit/quickwit-datetime/Cargo.toml | 1 - .../quickwit-datetime/src/date_time_format.rs | 315 ++++++++++++++---- quickwit/quickwit-datetime/src/lib.rs | 2 - .../quickwit-datetime/src/regex_tokenizer.rs | 131 -------- 5 files changed, 249 insertions(+), 201 deletions(-) delete mode 100644 quickwit/quickwit-datetime/src/regex_tokenizer.rs diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 888948304f3..3ce5526bd7a 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -5894,7 +5894,6 @@ version = "0.8.0" dependencies = [ "anyhow", "itertools 0.13.0", - "regex", "serde", "serde_json", "tantivy", diff --git a/quickwit/quickwit-datetime/Cargo.toml b/quickwit/quickwit-datetime/Cargo.toml index a4943d375e1..004e959a348 100644 --- a/quickwit/quickwit-datetime/Cargo.toml +++ b/quickwit/quickwit-datetime/Cargo.toml @@ -13,7 +13,6 @@ license.workspace = true [dependencies] anyhow = { workspace = true } itertools = { workspace = true } -regex = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } tantivy = { workspace = true } diff --git a/quickwit/quickwit-datetime/src/date_time_format.rs b/quickwit/quickwit-datetime/src/date_time_format.rs index 2b4d4e23270..47d18008399 100644 --- a/quickwit/quickwit-datetime/src/date_time_format.rs +++ b/quickwit/quickwit-datetime/src/date_time_format.rs @@ -19,6 +19,7 @@ use std::collections::HashMap; use std::fmt::Display; +use std::num::NonZeroU8; use std::str::FromStr; use std::sync::OnceLock; @@ -36,7 +37,45 @@ use time::parsing::Parsed; use time::{Month, OffsetDateTime, PrimitiveDateTime, UtcOffset}; use time_fmt::parse::time_format_item::parse_to_format_item; -use crate::{RegexTokenizer, TantivyDateTime}; +use crate::TantivyDateTime; + +const JAVA_DATE_FORMAT_TOKENS: &[&str] = &[ + "yyyy", + "xxxx", + "xx[xx]", + "SSSSSSSSS", // For nanoseconds + "SSSSSSS", // For microseconds + "SSSSSS", // For fractional seconds up to six digits + "SSSSS", + "SSSS", + "SSS", + "SS", + "ZZ", + "xx", + "ww", + "w[w]", + "yy", + "MM", + "dd", + "HH", + "hh", + "kk", + "mm", + "ss", + "aa", + "a", + "w", + "M", + "d", + "H", + "h", + "k", + "m", + "s", + "S", + "Z", + "e", +]; fn literal(s: &[u8]) -> OwnedFormatItem { // builds a boxed slice from a slice @@ -53,16 +92,6 @@ fn get_padding(ptn: &str) -> Padding { } } -fn build_optional_item(java_datetime_format: &str) -> Option { - assert!(java_datetime_format.len() >= 2); - let optional_date_format = &java_datetime_format[1..java_datetime_format.len() - 1]; - let format_items: Box<[OwnedFormatItem]> = - parse_java_datetime_format_items(optional_date_format).ok()?; - Some(OwnedFormatItem::Optional(Box::new( - OwnedFormatItem::Compound(format_items), - ))) -} - fn build_zone_offset(_: &str) -> Option { // 'Z' literal to represent UTC offset let z_literal = OwnedFormatItem::Literal(Box::from(b"Z".as_ref())); @@ -90,17 +119,26 @@ fn build_zone_offset(_: &str) -> Option { } fn build_year_item(ptn: &str) -> Option { - let mut year = Year::default(); - let year_repr = if ptn.len() == 4 { - YearRepr::Full + let mut full_year = Year::default(); + full_year.repr = YearRepr::Full; + let full_year_component = OwnedFormatItem::Component(Component::Year(full_year)); + + let mut short_year = Year::default(); + short_year.repr = YearRepr::LastTwo; + let short_year_component = OwnedFormatItem::Component(Component::Year(short_year)); + + if ptn.len() == 4 { + Some(full_year_component) + } else if ptn.len() == 2 { + Some(short_year_component) } else { - YearRepr::LastTwo - }; - year.repr = year_repr; - Some(OwnedFormatItem::Component(Component::Year(year))) + Some(OwnedFormatItem::First( + vec![full_year_component, short_year_component].into_boxed_slice(), + )) + } } -fn build_week_year_item(ptn: &str) -> Option { +fn build_week_based_year_item(ptn: &str) -> Option { // TODO no `Component` for that build_year_item(ptn) } @@ -117,14 +155,14 @@ fn build_day_item(ptn: &str) -> Option { Some(OwnedFormatItem::Component(Component::Day(day))) } -fn build_weekday_item(_: &str) -> Option { +fn build_day_of_week_item(_: &str) -> Option { let mut weekday = Weekday::default(); weekday.repr = WeekdayRepr::Monday; weekday.one_indexed = false; Some(OwnedFormatItem::Component(Component::Weekday(weekday))) } -fn build_week_number_item(ptn: &str) -> Option { +fn build_week_of_year_item(ptn: &str) -> Option { let mut week_number = WeekNumber::default(); week_number.repr = WeekNumberRepr::Monday; week_number.padding = get_padding(ptn); @@ -158,32 +196,92 @@ fn build_fraction_of_second_item(_ptn: &str) -> Option { Some(OwnedFormatItem::Component(Component::Subsecond(subsecond))) } +fn parse_java_datetime_format_items_recursive( + chars: &mut std::iter::Peekable, +) -> Result, String> { + let mut items = Vec::new(); + + while let Some(&c) = chars.peek() { + match c { + '[' => { + chars.next(); + let optional_items = parse_java_datetime_format_items_recursive(chars)?; + items.push(OwnedFormatItem::Optional(Box::new( + OwnedFormatItem::Compound(optional_items.into_boxed_slice()), + ))); + } + ']' => { + chars.next(); + break; + } + '\'' => { + chars.next(); + let mut literal_str = String::new(); + while let Some(&next_c) = chars.peek() { + if next_c == '\'' { + chars.next(); + break; + } else { + literal_str.push(next_c); + chars.next(); + } + } + items.push(literal(literal_str.as_bytes())); + } + _ => { + if let Some(format_item) = match_java_date_format_token(chars)? { + items.push(format_item); + } else { + // Treat as a literal character + items.push(literal(c.to_string().as_bytes())); + chars.next(); + } + } + } + } + + Ok(items) +} + // Elasticsearch/OpenSearch uses a set of preconfigured formats, more information could be found // here https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html -fn java_date_format_tokenizer() -> &'static RegexTokenizer { - static JAVA_DATE_FORMAT_TOKENIZER: OnceLock> = OnceLock::new(); - JAVA_DATE_FORMAT_TOKENIZER.get_or_init(|| { - RegexTokenizer::new(vec![ - (r#"yy(yy)?"#, build_year_item), - (r#"MM?"#, build_month_item), - (r#"dd?"#, build_day_item), - (r#"HH?"#, build_hour_item), - (r#"mm?"#, build_minute_item), - (r#"ss?"#, build_second_item), - (r#"S+"#, build_fraction_of_second_item), - (r#"Z"#, build_zone_offset), - (r#"''"#, |_| Some(literal(b"'"))), - (r#"'[^']+'"#, |s| { - Some(literal(s[1..s.len() - 1].as_bytes())) - }), - (r#"[^\w\[\]{}]"#, |s| Some(literal(s.as_bytes()))), - (r#"\[.*\]"#, build_optional_item), - (r#"xx(xx)?"#, build_week_year_item), - (r#"ww?"#, build_week_number_item), - (r#"e?"#, build_weekday_item), - ]) - .unwrap() - }) +fn match_java_date_format_token( + chars: &mut std::iter::Peekable, +) -> Result, String> { + if chars.peek().is_none() { + return Ok(None); + } + + let remaining: String = chars.clone().collect(); + + // Try to match the longest possible token + for token in JAVA_DATE_FORMAT_TOKENS { + if remaining.starts_with(token) { + for _ in 0..token.len() { + chars.next(); + } + + let format_item = match *token { + "yyyy" | "yy" => build_year_item(token), + "xxxx" | "xx[xx]" | "xx" => build_week_based_year_item(token), + "MM" | "M" => build_month_item(token), + "dd" | "d" => build_day_item(token), + "HH" | "H" => build_hour_item(token), + "mm" | "m" => build_minute_item(token), + "ss" | "s" => build_second_item(token), + "SSSSSSSSS" | "SSSSSSS" | "SSSSSS" | "SSSSS" | "SSSS" | "SSS" | "SS" | "S" => { + build_fraction_of_second_item(token) + } + "Z" => build_zone_offset(token), + "ww" | "w[w]" | "w" => build_week_of_year_item(token), + "e" => build_day_of_week_item(token), + _ => return Err(format!("Unrecognized token '{}'", token)), + }; + return Ok(format_item); + } + } + + Ok(None) } // Check if the given date time format is a common alias and replace it with the @@ -195,8 +293,11 @@ fn resolve_java_datetime_format_alias(java_datetime_format: &str) -> &str { OnceLock::new(); let java_datetime_format_map = JAVA_DATE_FORMAT_ALIASES.get_or_init(|| { let mut m = HashMap::new(); - m.insert("date_optional_time", "yyyy-MM-dd['T'HH:mm:ss.SSSZ]"); - m.insert("strict_date_optional_time", "yyyy-MM-dd['T'HH:mm:ss.SSSZ]"); + m.insert("date_optional_time", "yyyy-MM-dd['T'HH:mm:ss.SSSZ]"); + m.insert( + "strict_date_optional_time", + "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSS[Z]]]]]]]", + ); m.insert( "strict_date_optional_time_nanos", "yyyy-MM-dd['T'HH:mm:ss.SSSSSSZ]", @@ -235,17 +336,9 @@ pub struct StrptimeParser { fn parse_java_datetime_format_items( java_datetime_format: &str, ) -> Result, String> { - let java_datetime_format_resolved = resolve_java_datetime_format_alias(java_datetime_format); - let items = java_date_format_tokenizer() - .tokenize(java_datetime_format_resolved) - .map_err(|pos| { - format!( - "failed to parse date format `{java_datetime_format}`. Pattern at pos {pos} is \ - not recognized." - ) - })? - .into_boxed_slice(); - Ok(items) + let mut chars = java_datetime_format.chars().peekable(); + let items = parse_java_datetime_format_items_recursive(&mut chars)?; + Ok(items.into_boxed_slice()) } impl StrptimeParser { @@ -290,6 +383,14 @@ impl StrptimeParser { parsed.set_year(year); } + if parsed.day().is_none() && parsed.monday_week_number().is_none() { + parsed.set_day(NonZeroU8::try_from(1u8).unwrap()); + } + + if parsed.month().is_none() && parsed.monday_week_number().is_none() { + parsed.set_month(Month::January); + } + if parsed.offset_hour().is_some() { let offset_datetime: OffsetDateTime = parsed .try_into() @@ -424,14 +525,18 @@ impl FromStr for DateTimeInputFormat { impl Serialize for DateTimeInputFormat { fn serialize(&self, serializer: S) -> Result - where S: serde::Serializer { + where + S: serde::Serializer, + { serializer.serialize_str(self.as_str()) } } impl<'de> Deserialize<'de> for DateTimeInputFormat { fn deserialize(deserializer: D) -> Result - where D: Deserializer<'de> { + where + D: Deserializer<'de>, + { let date_time_format_str: String = Deserialize::deserialize(deserializer)?; let date_time_format = date_time_format_str.parse().map_err(D::Error::custom)?; Ok(date_time_format) @@ -526,14 +631,18 @@ impl FromStr for DateTimeOutputFormat { impl Serialize for DateTimeOutputFormat { fn serialize(&self, serializer: S) -> Result - where S: serde::Serializer { + where + S: serde::Serializer, + { serializer.serialize_str(self.as_str()) } } impl<'de> Deserialize<'de> for DateTimeOutputFormat { fn deserialize(deserializer: D) -> Result - where D: Deserializer<'de> { + where + D: Deserializer<'de>, + { let date_time_format_str: String = Deserialize::deserialize(deserializer)?; let date_time_format = date_time_format_str.parse().map_err(D::Error::custom)?; Ok(date_time_format) @@ -558,7 +667,6 @@ pub(super) fn infer_year( #[cfg(test)] mod tests { - use time::macros::datetime; use time::Month; @@ -721,6 +829,7 @@ mod tests { #[test] fn test_parse_java_datetime_format() { + test_parse_java_datetime_aux("yyyyMMdd", "20210101", datetime!(2021-01-01 00:00:00 UTC)); test_parse_java_datetime_aux( "yyyy MM dd", "2021 01 01", @@ -737,12 +846,17 @@ mod tests { datetime!(2021-01-01 13:00:00 UTC), ); test_parse_java_datetime_aux( - "yyyy!MM?dd['T'HH:]", + "yyyy!MM?dd['T'[HH:]]", "2021!01?01", datetime!(2021-01-01 00:00:00 UTC), ); test_parse_java_datetime_aux( - "yyyy!MM?dd['T'HH:]", + "yyyy!MM?dd['T'[HH:]", + "2021!01?01T", + datetime!(2021-01-01 00:00:00 UTC), + ); + test_parse_java_datetime_aux( + "yyyy!MM?dd['T'[HH:]]", "2021!01?01T13:", datetime!(2021-01-01 13:00:00 UTC), ); @@ -780,7 +894,7 @@ mod tests { ); test_parse_java_datetime_aux( "date_optional_time", - "2021W313", + "2021-01-21T03:01:22.312+01:00", datetime!(2021-01-21 03:01:22.312 +1), ); } @@ -879,6 +993,40 @@ mod tests { ); } + #[test] + fn test_parse_strict_date_optional_time() { + let parser = + StrptimeParser::from_java_datetime_format("strict_date_optional_time").unwrap(); + let dates = [ + "2019", + "2019-03", + "2019-03-23", + "2019-03-23T21:34", + "2019-03-23T21:34:46", + "2019-03-23T21:34:46.123Z", + "2019-03-23T21:35:46.123+00:00", + "2019-03-23T21:36:46.123+03:00", + "2019-03-23T21:37:46.123+0300", + ]; + let expected = [ + datetime!(2019-01-01 00:00:00 UTC), + datetime!(2019-03-01 00:00:00 UTC), + datetime!(2019-03-23 00:00:00 UTC), + datetime!(2019-03-23 21:34 UTC), + datetime!(2019-03-23 21:34:46 UTC), + datetime!(2019-03-23 21:34:46.123 UTC), + datetime!(2019-03-23 21:35:46.123 UTC), + datetime!(2019-03-23 21:36:46.123 +03:00:00), + datetime!(2019-03-23 21:37:46.123 +03:00:00), + ]; + for (date_str, &expected_dt) in dates.iter().zip(expected.iter()) { + let parsed_dt = parser + .parse_date_time(date_str) + .unwrap_or_else(|e| panic!("Failed to parse {}: {}", date_str, e)); + assert_eq!(parsed_dt, expected_dt); + } + } + #[test] fn test_infer_year() { let inferred_year = infer_year(None, Month::January, 2024); @@ -902,4 +1050,39 @@ mod tests { let inferred_year = infer_year(Some(Month::May), Month::January, 2024); assert_eq!(inferred_year, 2023); } + + #[test] + fn test_parse_java_datetime_format_items() { + let format_str = "xx[xx]'W'wwe"; + let result = parse_java_datetime_format_items(format_str).unwrap(); + + // We expect the tokens to be parsed as: + // - 'xx[xx]' (week-based year) + // - 'W' (literal) + // - 'ww' (week of year) + // - 'e' (day of week) + + assert_eq!(result.len(), 4); + + // Verify each token + match &result[0] { + OwnedFormatItem::Component(Component::Year(_)) => {} + _ => panic!("Expected WeekBasedYear component"), + } + + match &result[1] { + OwnedFormatItem::Literal(lit) => assert_eq!(lit.as_ref(), b"W"), + _ => panic!("Expected literal 'W'"), + } + + match &result[2] { + OwnedFormatItem::Component(Component::WeekNumber(_)) => {} + _ => panic!("Expected WeekNumber component"), + } + + match &result[3] { + OwnedFormatItem::Component(Component::Weekday(_)) => {} + _ => panic!("Expected Weekday component"), + } + } } diff --git a/quickwit/quickwit-datetime/src/lib.rs b/quickwit/quickwit-datetime/src/lib.rs index 5e9b4edc59c..eb4d8c940ba 100644 --- a/quickwit/quickwit-datetime/src/lib.rs +++ b/quickwit/quickwit-datetime/src/lib.rs @@ -19,11 +19,9 @@ mod date_time_format; mod date_time_parsing; -mod regex_tokenizer; pub use date_time_format::{DateTimeInputFormat, DateTimeOutputFormat, StrptimeParser}; pub use date_time_parsing::{ parse_date_time_str, parse_timestamp, parse_timestamp_float, parse_timestamp_int, }; -pub(crate) use regex_tokenizer::RegexTokenizer; pub use tantivy::DateTime as TantivyDateTime; diff --git a/quickwit/quickwit-datetime/src/regex_tokenizer.rs b/quickwit/quickwit-datetime/src/regex_tokenizer.rs deleted file mode 100644 index 77294f1a9a8..00000000000 --- a/quickwit/quickwit-datetime/src/regex_tokenizer.rs +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright (C) 2024 Quickwit, Inc. -// -// Quickwit is offered under the AGPL v3.0 and as commercial software. -// For commercial licensing, contact us at hello@quickwit.io. -// -// AGPL: -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as -// published by the Free Software Foundation, either version 3 of the -// License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . - -use regex::{Regex, RegexSet}; - -type TokenMatcherFn = fn(&str) -> Option; - -pub(crate) struct RegexTokenizer { - regexset: RegexSet, - regexs: Vec, - token_matchers: Vec>, -} - -impl RegexTokenizer { - pub fn new(ptns_router: Vec<(&str, TokenMatcherFn)>) -> Result { - let ptns: Vec = ptns_router - .iter() - .map(|(ptn, _)| format!("^{ptn}")) - .collect(); - let regexset = RegexSet::new(ptns)?; - let regexs: Vec = regexset - .patterns() - .iter() - .map(|ptn| Regex::new(ptn)) - .collect::>()?; - let token_matchers = ptns_router - .into_iter() - .map(|(_, token_matcher)| token_matcher) - .collect(); - Ok(RegexTokenizer { - regexset, - regexs, - token_matchers, - }) - } - - fn match_token(&self, text: &mut &str) -> Option { - let matches = self.regexset.matches(text); - for pattern_id in matches { - // unfortunately regexset does not give us the length of the match, so we need to rerun - // the targeted ptn. - let m = self.regexs[pattern_id].find(text)?; - let match_len = m.len(); - if match_len == 0 { - return None; - } - let match_str = &text[0..match_len]; - if let Some(token) = self.token_matchers[pattern_id](match_str) { - *text = &text[match_len..]; - return Some(token); - } - } - None - } - - /// Tokenize the input text. If no pattern matches, returns the position of the error. - pub fn tokenize(&self, mut text: &str) -> Result, usize> { - let len = text.len(); - let mut tokens = Vec::new(); - while !text.is_empty() { - let token = self - .match_token(&mut text) - .ok_or_else(|| len - text.len())?; - tokens.push(token); - } - Ok(tokens) - } -} - -#[cfg(test)] -mod tests { - use super::RegexTokenizer; - - #[derive(Eq, PartialEq, Debug)] - enum Token { - Number(u64), - Ip(String), - Dot, - } - - #[test] - fn test_regex_tokenizer_simple_priority() { - use std::str::FromStr; - let regex_tokenizer = RegexTokenizer::new(vec![ - (r#"\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}"#, |s| { - Some(Token::Ip(s.to_string())) - }), - (r#"\d{1,10}"#, |s| { - Some(Token::Number(u64::from_str(s).unwrap())) - }), - (r#"\."#, |_s| Some(Token::Dot)), - ]) - .unwrap(); - let tokens = regex_tokenizer.tokenize("128.1.").unwrap(); - assert_eq!(tokens.len(), 4); - assert_eq!(tokens[0], Token::Number(128)); - assert_eq!(tokens[1], Token::Dot); - assert_eq!(tokens[2], Token::Number(1)); - assert_eq!(tokens[3], Token::Dot); - let tokens = regex_tokenizer.tokenize("128.1.1.12").unwrap(); - assert_eq!(tokens.len(), 1); - assert_eq!(tokens[0], Token::Ip("128.1.1.12".to_string())); - } - - #[test] - fn test_regex_tokenizer_invalid() { - use std::str::FromStr; - let regex_tokenizer = RegexTokenizer::new(vec![(r#"\d+"#, |s| { - Some(Token::Number(u64::from_str(s).unwrap())) - })]) - .unwrap(); - let error_position = regex_tokenizer.tokenize("993s3").unwrap_err(); - assert_eq!(error_position, 3); - } -}