diff --git a/quickwit/quickwit-datetime/src/date_time_format.rs b/quickwit/quickwit-datetime/src/date_time_format.rs index 2b4d4e23270..47d18008399 100644 --- a/quickwit/quickwit-datetime/src/date_time_format.rs +++ b/quickwit/quickwit-datetime/src/date_time_format.rs @@ -19,6 +19,7 @@ use std::collections::HashMap; use std::fmt::Display; +use std::num::NonZeroU8; use std::str::FromStr; use std::sync::OnceLock; @@ -36,7 +37,45 @@ use time::parsing::Parsed; use time::{Month, OffsetDateTime, PrimitiveDateTime, UtcOffset}; use time_fmt::parse::time_format_item::parse_to_format_item; -use crate::{RegexTokenizer, TantivyDateTime}; +use crate::TantivyDateTime; + +const JAVA_DATE_FORMAT_TOKENS: &[&str] = &[ + "yyyy", + "xxxx", + "xx[xx]", + "SSSSSSSSS", // For nanoseconds + "SSSSSSS", // For microseconds + "SSSSSS", // For fractional seconds up to six digits + "SSSSS", + "SSSS", + "SSS", + "SS", + "ZZ", + "xx", + "ww", + "w[w]", + "yy", + "MM", + "dd", + "HH", + "hh", + "kk", + "mm", + "ss", + "aa", + "a", + "w", + "M", + "d", + "H", + "h", + "k", + "m", + "s", + "S", + "Z", + "e", +]; fn literal(s: &[u8]) -> OwnedFormatItem { // builds a boxed slice from a slice @@ -53,16 +92,6 @@ fn get_padding(ptn: &str) -> Padding { } } -fn build_optional_item(java_datetime_format: &str) -> Option { - assert!(java_datetime_format.len() >= 2); - let optional_date_format = &java_datetime_format[1..java_datetime_format.len() - 1]; - let format_items: Box<[OwnedFormatItem]> = - parse_java_datetime_format_items(optional_date_format).ok()?; - Some(OwnedFormatItem::Optional(Box::new( - OwnedFormatItem::Compound(format_items), - ))) -} - fn build_zone_offset(_: &str) -> Option { // 'Z' literal to represent UTC offset let z_literal = OwnedFormatItem::Literal(Box::from(b"Z".as_ref())); @@ -90,17 +119,26 @@ fn build_zone_offset(_: &str) -> Option { } fn build_year_item(ptn: &str) -> Option { - let mut year = Year::default(); - let year_repr = if ptn.len() == 4 { - YearRepr::Full + let mut full_year = Year::default(); + full_year.repr = YearRepr::Full; + let full_year_component = OwnedFormatItem::Component(Component::Year(full_year)); + + let mut short_year = Year::default(); + short_year.repr = YearRepr::LastTwo; + let short_year_component = OwnedFormatItem::Component(Component::Year(short_year)); + + if ptn.len() == 4 { + Some(full_year_component) + } else if ptn.len() == 2 { + Some(short_year_component) } else { - YearRepr::LastTwo - }; - year.repr = year_repr; - Some(OwnedFormatItem::Component(Component::Year(year))) + Some(OwnedFormatItem::First( + vec![full_year_component, short_year_component].into_boxed_slice(), + )) + } } -fn build_week_year_item(ptn: &str) -> Option { +fn build_week_based_year_item(ptn: &str) -> Option { // TODO no `Component` for that build_year_item(ptn) } @@ -117,14 +155,14 @@ fn build_day_item(ptn: &str) -> Option { Some(OwnedFormatItem::Component(Component::Day(day))) } -fn build_weekday_item(_: &str) -> Option { +fn build_day_of_week_item(_: &str) -> Option { let mut weekday = Weekday::default(); weekday.repr = WeekdayRepr::Monday; weekday.one_indexed = false; Some(OwnedFormatItem::Component(Component::Weekday(weekday))) } -fn build_week_number_item(ptn: &str) -> Option { +fn build_week_of_year_item(ptn: &str) -> Option { let mut week_number = WeekNumber::default(); week_number.repr = WeekNumberRepr::Monday; week_number.padding = get_padding(ptn); @@ -158,32 +196,92 @@ fn build_fraction_of_second_item(_ptn: &str) -> Option { Some(OwnedFormatItem::Component(Component::Subsecond(subsecond))) } +fn parse_java_datetime_format_items_recursive( + chars: &mut std::iter::Peekable, +) -> Result, String> { + let mut items = Vec::new(); + + while let Some(&c) = chars.peek() { + match c { + '[' => { + chars.next(); + let optional_items = parse_java_datetime_format_items_recursive(chars)?; + items.push(OwnedFormatItem::Optional(Box::new( + OwnedFormatItem::Compound(optional_items.into_boxed_slice()), + ))); + } + ']' => { + chars.next(); + break; + } + '\'' => { + chars.next(); + let mut literal_str = String::new(); + while let Some(&next_c) = chars.peek() { + if next_c == '\'' { + chars.next(); + break; + } else { + literal_str.push(next_c); + chars.next(); + } + } + items.push(literal(literal_str.as_bytes())); + } + _ => { + if let Some(format_item) = match_java_date_format_token(chars)? { + items.push(format_item); + } else { + // Treat as a literal character + items.push(literal(c.to_string().as_bytes())); + chars.next(); + } + } + } + } + + Ok(items) +} + // Elasticsearch/OpenSearch uses a set of preconfigured formats, more information could be found // here https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html -fn java_date_format_tokenizer() -> &'static RegexTokenizer { - static JAVA_DATE_FORMAT_TOKENIZER: OnceLock> = OnceLock::new(); - JAVA_DATE_FORMAT_TOKENIZER.get_or_init(|| { - RegexTokenizer::new(vec![ - (r#"yy(yy)?"#, build_year_item), - (r#"MM?"#, build_month_item), - (r#"dd?"#, build_day_item), - (r#"HH?"#, build_hour_item), - (r#"mm?"#, build_minute_item), - (r#"ss?"#, build_second_item), - (r#"S+"#, build_fraction_of_second_item), - (r#"Z"#, build_zone_offset), - (r#"''"#, |_| Some(literal(b"'"))), - (r#"'[^']+'"#, |s| { - Some(literal(s[1..s.len() - 1].as_bytes())) - }), - (r#"[^\w\[\]{}]"#, |s| Some(literal(s.as_bytes()))), - (r#"\[.*\]"#, build_optional_item), - (r#"xx(xx)?"#, build_week_year_item), - (r#"ww?"#, build_week_number_item), - (r#"e?"#, build_weekday_item), - ]) - .unwrap() - }) +fn match_java_date_format_token( + chars: &mut std::iter::Peekable, +) -> Result, String> { + if chars.peek().is_none() { + return Ok(None); + } + + let remaining: String = chars.clone().collect(); + + // Try to match the longest possible token + for token in JAVA_DATE_FORMAT_TOKENS { + if remaining.starts_with(token) { + for _ in 0..token.len() { + chars.next(); + } + + let format_item = match *token { + "yyyy" | "yy" => build_year_item(token), + "xxxx" | "xx[xx]" | "xx" => build_week_based_year_item(token), + "MM" | "M" => build_month_item(token), + "dd" | "d" => build_day_item(token), + "HH" | "H" => build_hour_item(token), + "mm" | "m" => build_minute_item(token), + "ss" | "s" => build_second_item(token), + "SSSSSSSSS" | "SSSSSSS" | "SSSSSS" | "SSSSS" | "SSSS" | "SSS" | "SS" | "S" => { + build_fraction_of_second_item(token) + } + "Z" => build_zone_offset(token), + "ww" | "w[w]" | "w" => build_week_of_year_item(token), + "e" => build_day_of_week_item(token), + _ => return Err(format!("Unrecognized token '{}'", token)), + }; + return Ok(format_item); + } + } + + Ok(None) } // Check if the given date time format is a common alias and replace it with the @@ -195,8 +293,11 @@ fn resolve_java_datetime_format_alias(java_datetime_format: &str) -> &str { OnceLock::new(); let java_datetime_format_map = JAVA_DATE_FORMAT_ALIASES.get_or_init(|| { let mut m = HashMap::new(); - m.insert("date_optional_time", "yyyy-MM-dd['T'HH:mm:ss.SSSZ]"); - m.insert("strict_date_optional_time", "yyyy-MM-dd['T'HH:mm:ss.SSSZ]"); + m.insert("date_optional_time", "yyyy-MM-dd['T'HH:mm:ss.SSSZ]"); + m.insert( + "strict_date_optional_time", + "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSS[Z]]]]]]]", + ); m.insert( "strict_date_optional_time_nanos", "yyyy-MM-dd['T'HH:mm:ss.SSSSSSZ]", @@ -235,17 +336,9 @@ pub struct StrptimeParser { fn parse_java_datetime_format_items( java_datetime_format: &str, ) -> Result, String> { - let java_datetime_format_resolved = resolve_java_datetime_format_alias(java_datetime_format); - let items = java_date_format_tokenizer() - .tokenize(java_datetime_format_resolved) - .map_err(|pos| { - format!( - "failed to parse date format `{java_datetime_format}`. Pattern at pos {pos} is \ - not recognized." - ) - })? - .into_boxed_slice(); - Ok(items) + let mut chars = java_datetime_format.chars().peekable(); + let items = parse_java_datetime_format_items_recursive(&mut chars)?; + Ok(items.into_boxed_slice()) } impl StrptimeParser { @@ -290,6 +383,14 @@ impl StrptimeParser { parsed.set_year(year); } + if parsed.day().is_none() && parsed.monday_week_number().is_none() { + parsed.set_day(NonZeroU8::try_from(1u8).unwrap()); + } + + if parsed.month().is_none() && parsed.monday_week_number().is_none() { + parsed.set_month(Month::January); + } + if parsed.offset_hour().is_some() { let offset_datetime: OffsetDateTime = parsed .try_into() @@ -424,14 +525,18 @@ impl FromStr for DateTimeInputFormat { impl Serialize for DateTimeInputFormat { fn serialize(&self, serializer: S) -> Result - where S: serde::Serializer { + where + S: serde::Serializer, + { serializer.serialize_str(self.as_str()) } } impl<'de> Deserialize<'de> for DateTimeInputFormat { fn deserialize(deserializer: D) -> Result - where D: Deserializer<'de> { + where + D: Deserializer<'de>, + { let date_time_format_str: String = Deserialize::deserialize(deserializer)?; let date_time_format = date_time_format_str.parse().map_err(D::Error::custom)?; Ok(date_time_format) @@ -526,14 +631,18 @@ impl FromStr for DateTimeOutputFormat { impl Serialize for DateTimeOutputFormat { fn serialize(&self, serializer: S) -> Result - where S: serde::Serializer { + where + S: serde::Serializer, + { serializer.serialize_str(self.as_str()) } } impl<'de> Deserialize<'de> for DateTimeOutputFormat { fn deserialize(deserializer: D) -> Result - where D: Deserializer<'de> { + where + D: Deserializer<'de>, + { let date_time_format_str: String = Deserialize::deserialize(deserializer)?; let date_time_format = date_time_format_str.parse().map_err(D::Error::custom)?; Ok(date_time_format) @@ -558,7 +667,6 @@ pub(super) fn infer_year( #[cfg(test)] mod tests { - use time::macros::datetime; use time::Month; @@ -721,6 +829,7 @@ mod tests { #[test] fn test_parse_java_datetime_format() { + test_parse_java_datetime_aux("yyyyMMdd", "20210101", datetime!(2021-01-01 00:00:00 UTC)); test_parse_java_datetime_aux( "yyyy MM dd", "2021 01 01", @@ -737,12 +846,17 @@ mod tests { datetime!(2021-01-01 13:00:00 UTC), ); test_parse_java_datetime_aux( - "yyyy!MM?dd['T'HH:]", + "yyyy!MM?dd['T'[HH:]]", "2021!01?01", datetime!(2021-01-01 00:00:00 UTC), ); test_parse_java_datetime_aux( - "yyyy!MM?dd['T'HH:]", + "yyyy!MM?dd['T'[HH:]", + "2021!01?01T", + datetime!(2021-01-01 00:00:00 UTC), + ); + test_parse_java_datetime_aux( + "yyyy!MM?dd['T'[HH:]]", "2021!01?01T13:", datetime!(2021-01-01 13:00:00 UTC), ); @@ -780,7 +894,7 @@ mod tests { ); test_parse_java_datetime_aux( "date_optional_time", - "2021W313", + "2021-01-21T03:01:22.312+01:00", datetime!(2021-01-21 03:01:22.312 +1), ); } @@ -879,6 +993,40 @@ mod tests { ); } + #[test] + fn test_parse_strict_date_optional_time() { + let parser = + StrptimeParser::from_java_datetime_format("strict_date_optional_time").unwrap(); + let dates = [ + "2019", + "2019-03", + "2019-03-23", + "2019-03-23T21:34", + "2019-03-23T21:34:46", + "2019-03-23T21:34:46.123Z", + "2019-03-23T21:35:46.123+00:00", + "2019-03-23T21:36:46.123+03:00", + "2019-03-23T21:37:46.123+0300", + ]; + let expected = [ + datetime!(2019-01-01 00:00:00 UTC), + datetime!(2019-03-01 00:00:00 UTC), + datetime!(2019-03-23 00:00:00 UTC), + datetime!(2019-03-23 21:34 UTC), + datetime!(2019-03-23 21:34:46 UTC), + datetime!(2019-03-23 21:34:46.123 UTC), + datetime!(2019-03-23 21:35:46.123 UTC), + datetime!(2019-03-23 21:36:46.123 +03:00:00), + datetime!(2019-03-23 21:37:46.123 +03:00:00), + ]; + for (date_str, &expected_dt) in dates.iter().zip(expected.iter()) { + let parsed_dt = parser + .parse_date_time(date_str) + .unwrap_or_else(|e| panic!("Failed to parse {}: {}", date_str, e)); + assert_eq!(parsed_dt, expected_dt); + } + } + #[test] fn test_infer_year() { let inferred_year = infer_year(None, Month::January, 2024); @@ -902,4 +1050,39 @@ mod tests { let inferred_year = infer_year(Some(Month::May), Month::January, 2024); assert_eq!(inferred_year, 2023); } + + #[test] + fn test_parse_java_datetime_format_items() { + let format_str = "xx[xx]'W'wwe"; + let result = parse_java_datetime_format_items(format_str).unwrap(); + + // We expect the tokens to be parsed as: + // - 'xx[xx]' (week-based year) + // - 'W' (literal) + // - 'ww' (week of year) + // - 'e' (day of week) + + assert_eq!(result.len(), 4); + + // Verify each token + match &result[0] { + OwnedFormatItem::Component(Component::Year(_)) => {} + _ => panic!("Expected WeekBasedYear component"), + } + + match &result[1] { + OwnedFormatItem::Literal(lit) => assert_eq!(lit.as_ref(), b"W"), + _ => panic!("Expected literal 'W'"), + } + + match &result[2] { + OwnedFormatItem::Component(Component::WeekNumber(_)) => {} + _ => panic!("Expected WeekNumber component"), + } + + match &result[3] { + OwnedFormatItem::Component(Component::Weekday(_)) => {} + _ => panic!("Expected Weekday component"), + } + } } diff --git a/quickwit/quickwit-datetime/src/lib.rs b/quickwit/quickwit-datetime/src/lib.rs index 5e9b4edc59c..eb4d8c940ba 100644 --- a/quickwit/quickwit-datetime/src/lib.rs +++ b/quickwit/quickwit-datetime/src/lib.rs @@ -19,11 +19,9 @@ mod date_time_format; mod date_time_parsing; -mod regex_tokenizer; pub use date_time_format::{DateTimeInputFormat, DateTimeOutputFormat, StrptimeParser}; pub use date_time_parsing::{ parse_date_time_str, parse_timestamp, parse_timestamp_float, parse_timestamp_int, }; -pub(crate) use regex_tokenizer::RegexTokenizer; pub use tantivy::DateTime as TantivyDateTime; diff --git a/quickwit/quickwit-datetime/src/regex_tokenizer.rs b/quickwit/quickwit-datetime/src/regex_tokenizer.rs deleted file mode 100644 index 77294f1a9a8..00000000000 --- a/quickwit/quickwit-datetime/src/regex_tokenizer.rs +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright (C) 2024 Quickwit, Inc. -// -// Quickwit is offered under the AGPL v3.0 and as commercial software. -// For commercial licensing, contact us at hello@quickwit.io. -// -// AGPL: -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as -// published by the Free Software Foundation, either version 3 of the -// License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . - -use regex::{Regex, RegexSet}; - -type TokenMatcherFn = fn(&str) -> Option; - -pub(crate) struct RegexTokenizer { - regexset: RegexSet, - regexs: Vec, - token_matchers: Vec>, -} - -impl RegexTokenizer { - pub fn new(ptns_router: Vec<(&str, TokenMatcherFn)>) -> Result { - let ptns: Vec = ptns_router - .iter() - .map(|(ptn, _)| format!("^{ptn}")) - .collect(); - let regexset = RegexSet::new(ptns)?; - let regexs: Vec = regexset - .patterns() - .iter() - .map(|ptn| Regex::new(ptn)) - .collect::>()?; - let token_matchers = ptns_router - .into_iter() - .map(|(_, token_matcher)| token_matcher) - .collect(); - Ok(RegexTokenizer { - regexset, - regexs, - token_matchers, - }) - } - - fn match_token(&self, text: &mut &str) -> Option { - let matches = self.regexset.matches(text); - for pattern_id in matches { - // unfortunately regexset does not give us the length of the match, so we need to rerun - // the targeted ptn. - let m = self.regexs[pattern_id].find(text)?; - let match_len = m.len(); - if match_len == 0 { - return None; - } - let match_str = &text[0..match_len]; - if let Some(token) = self.token_matchers[pattern_id](match_str) { - *text = &text[match_len..]; - return Some(token); - } - } - None - } - - /// Tokenize the input text. If no pattern matches, returns the position of the error. - pub fn tokenize(&self, mut text: &str) -> Result, usize> { - let len = text.len(); - let mut tokens = Vec::new(); - while !text.is_empty() { - let token = self - .match_token(&mut text) - .ok_or_else(|| len - text.len())?; - tokens.push(token); - } - Ok(tokens) - } -} - -#[cfg(test)] -mod tests { - use super::RegexTokenizer; - - #[derive(Eq, PartialEq, Debug)] - enum Token { - Number(u64), - Ip(String), - Dot, - } - - #[test] - fn test_regex_tokenizer_simple_priority() { - use std::str::FromStr; - let regex_tokenizer = RegexTokenizer::new(vec![ - (r#"\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}"#, |s| { - Some(Token::Ip(s.to_string())) - }), - (r#"\d{1,10}"#, |s| { - Some(Token::Number(u64::from_str(s).unwrap())) - }), - (r#"\."#, |_s| Some(Token::Dot)), - ]) - .unwrap(); - let tokens = regex_tokenizer.tokenize("128.1.").unwrap(); - assert_eq!(tokens.len(), 4); - assert_eq!(tokens[0], Token::Number(128)); - assert_eq!(tokens[1], Token::Dot); - assert_eq!(tokens[2], Token::Number(1)); - assert_eq!(tokens[3], Token::Dot); - let tokens = regex_tokenizer.tokenize("128.1.1.12").unwrap(); - assert_eq!(tokens.len(), 1); - assert_eq!(tokens[0], Token::Ip("128.1.1.12".to_string())); - } - - #[test] - fn test_regex_tokenizer_invalid() { - use std::str::FromStr; - let regex_tokenizer = RegexTokenizer::new(vec![(r#"\d+"#, |s| { - Some(Token::Number(u64::from_str(s).unwrap())) - })]) - .unwrap(); - let error_position = regex_tokenizer.tokenize("993s3").unwrap_err(); - assert_eq!(error_position, 3); - } -}