Skip to content

Commit

Permalink
Fix incompatible es java date format (#5462)
Browse files Browse the repository at this point in the history
* fix(es): add java date format support

* Adding rest integration test

* Bugfix: expressing date in rfc3339. unit test + integration test

* fix: add week patterns

* Add support for variable zone offset type

ES supports 3 formats at the same time. To do the same we will use
`First` format item with multiple options.

* Replace regexp tokenizer with recursion parser

In the current implementation, the RegexTokenizer uses regular
expressions to tokenize the format string. However, regular expressions
are not well-suited for parsing nested structures like nested brackets
because they cannot match patterns with recursive depth.

To address this, we'll write a custom recursive parser that can handle
nested optional brackets. This parser will process the format string
character by character, building the format items and managing the
nesting of optional components.

That change allowed me to cover `strict_date_optional_time` which has a
nested depth according to the examples in [OS repo][2]

Additional tests taken from [the doc][1]

[1]: https://opensearch.org/docs/latest/field-types/supported-field-types/date/
[2]: https://github.com/opensearch-project/OpenSearch/blob/main/server/src/test/java/org/opensearch/common/time/DateFormattersTests.java

* Move StrptimeParser and Java related parsers to a dedicated file

This way it is easier to keep it manageable and clearly separate QW API DatParser logic and ES API related parser.

Tests moved together with the code base.

* Add additional test to cover initially reported issue

This test covers that we actually can call ES API endpoint with named alias as a formatter.

#5460

---------

Co-authored-by: Eugene Tolbakov <[email protected]>
Co-authored-by: Paul Masurel <[email protected]>
  • Loading branch information
3 people authored Sep 30, 2024
1 parent efac619 commit 6373edb
Show file tree
Hide file tree
Showing 9 changed files with 908 additions and 195 deletions.
1 change: 0 additions & 1 deletion quickwit/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion quickwit/quickwit-datetime/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ license.workspace = true
[dependencies]
anyhow = { workspace = true }
itertools = { workspace = true }
ouroboros = "0.18.0"
serde = { workspace = true }
serde_json = { workspace = true }
tantivy = { workspace = true }
Expand Down
153 changes: 7 additions & 146 deletions quickwit/quickwit-datetime/src/date_time_format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,138 +20,14 @@
use std::fmt::Display;
use std::str::FromStr;

use ouroboros::self_referencing;
use serde::de::Error;
use serde::{Deserialize, Deserializer, Serialize};
use serde_json::Value as JsonValue;
use time::error::Format;
use time::format_description::well_known::{Iso8601, Rfc2822, Rfc3339};
use time::format_description::FormatItem;
use time::parsing::Parsed;
use time::{Month, OffsetDateTime, PrimitiveDateTime};
use time_fmt::parse::time_format_item::parse_to_format_item;

use crate::TantivyDateTime;

/// A date time parser that holds the format specification `Vec<FormatItem>`.
#[self_referencing]
pub struct StrptimeParser {
strptime_format: String,
with_timezone: bool,
#[borrows(strptime_format)]
#[covariant]
items: Vec<FormatItem<'this>>,
}

impl FromStr for StrptimeParser {
type Err = String;

fn from_str(strptime_format: &str) -> Result<Self, Self::Err> {
StrptimeParser::try_new(
strptime_format.to_string(),
strptime_format.to_lowercase().contains("%z"),
|strptime_format: &String| {
parse_to_format_item(strptime_format).map_err(|error| {
format!("invalid strptime format `{strptime_format}`: {error}")
})
},
)
}
}

impl StrptimeParser {
/// Parse a given date according to the datetime format specified during the StrptimeParser
/// creation. If the date format does not provide a specific a time, the time will be set to
/// 00:00:00.
fn parse_primitive_date_time(&self, date_time_str: &str) -> anyhow::Result<PrimitiveDateTime> {
let mut parsed = Parsed::new();
if !parsed
.parse_items(date_time_str.as_bytes(), self.borrow_items())?
.is_empty()
{
anyhow::bail!(
"datetime string `{}` does not match strptime format `{}`",
date_time_str,
self.borrow_strptime_format()
);
}
// The parsed datetime contains a date but seems to be missing "time".
// We complete it artificially with 00:00:00.
if parsed.hour_24().is_none()
&& !(parsed.hour_12().is_some() && parsed.hour_12_is_pm().is_some())
{
parsed.set_hour_24(0u8);
parsed.set_minute(0u8);
parsed.set_second(0u8);
}
if parsed.year().is_none() {
let now = OffsetDateTime::now_utc();
let year = infer_year(parsed.month(), now.month(), now.year());
parsed.set_year(year);
}
let date_time = parsed.try_into()?;
Ok(date_time)
}

pub fn parse_date_time(&self, date_time_str: &str) -> Result<OffsetDateTime, String> {
if *self.borrow_with_timezone() {
OffsetDateTime::parse(date_time_str, self.borrow_items()).map_err(|err| err.to_string())
} else {
self.parse_primitive_date_time(date_time_str)
.map(|date_time| date_time.assume_utc())
.map_err(|err| err.to_string())
}
}

pub fn format_date_time(&self, date_time: &OffsetDateTime) -> Result<String, Format> {
date_time.format(self.borrow_items())
}
}
use time::Month;

impl Clone for StrptimeParser {
fn clone(&self) -> Self {
// `self.format` is already known to be a valid format.
Self::from_str(self.borrow_strptime_format().as_str()).unwrap()
}
}

impl PartialEq for StrptimeParser {
fn eq(&self, other: &Self) -> bool {
self.borrow_strptime_format() == other.borrow_strptime_format()
}
}

impl Eq for StrptimeParser {}

impl std::fmt::Debug for StrptimeParser {
fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
formatter
.debug_struct("StrptimeParser")
.field("format", &self.borrow_strptime_format())
.finish()
}
}

impl std::hash::Hash for StrptimeParser {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.borrow_strptime_format().hash(state);
}
}

// `Strftime` format special characters.
// These characters are taken from the parsing crate we use for compatibility.
const STRFTIME_FORMAT_MARKERS: [&str; 36] = [
"%a", "%A", "%b", "%B", "%c", "%C", "%d", "%D", "%e", "%f", "%F", "%h", "%H", "%I", "%j", "%k",
"%l", "%m", "%M", "%n", "%p", "%P", "%r", "%R", "%S", "%t", "%T", "%U", "%w", "%W", "%x", "%X",
"%y", "%Y", "%z", "%Z",
];

// Checks if a format contains `strftime` special characters.
fn is_strftime_formatting(format_str: &str) -> bool {
STRFTIME_FORMAT_MARKERS
.iter()
.any(|marker| format_str.contains(marker))
}
use crate::java_date_time_format::is_strftime_formatting;
use crate::{StrptimeParser, TantivyDateTime};

/// Specifies the datetime and unix timestamp formats to use when parsing date strings.
#[derive(Clone, Debug, Eq, PartialEq, Hash, Default)]
Expand All @@ -170,7 +46,7 @@ impl DateTimeInputFormat {
DateTimeInputFormat::Iso8601 => "iso8601",
DateTimeInputFormat::Rfc2822 => "rfc2822",
DateTimeInputFormat::Rfc3339 => "rfc3339",
DateTimeInputFormat::Strptime(parser) => parser.borrow_strptime_format(),
DateTimeInputFormat::Strptime(parser) => parser.strptime_format.as_str(),
DateTimeInputFormat::Timestamp => "unix_timestamp",
}
}
Expand Down Expand Up @@ -198,7 +74,7 @@ impl FromStr for DateTimeInputFormat {
format must contain at least one `strftime` special characters"
));
}
DateTimeInputFormat::Strptime(StrptimeParser::from_str(date_time_format_str)?)
DateTimeInputFormat::Strptime(StrptimeParser::from_strptime(date_time_format_str)?)
}
};
Ok(date_time_format)
Expand Down Expand Up @@ -241,7 +117,7 @@ impl DateTimeOutputFormat {
DateTimeOutputFormat::Iso8601 => "iso8601",
DateTimeOutputFormat::Rfc2822 => "rfc2822",
DateTimeOutputFormat::Rfc3339 => "rfc3339",
DateTimeOutputFormat::Strptime(parser) => parser.borrow_strptime_format(),
DateTimeOutputFormat::Strptime(parser) => parser.strptime_format.as_str(),
DateTimeOutputFormat::TimestampSecs => "unix_timestamp_secs",
DateTimeOutputFormat::TimestampMillis => "unix_timestamp_millis",
DateTimeOutputFormat::TimestampMicros => "unix_timestamp_micros",
Expand Down Expand Up @@ -300,7 +176,7 @@ impl FromStr for DateTimeOutputFormat {
format must contain at least one `strftime` special characters"
));
}
DateTimeOutputFormat::Strptime(StrptimeParser::from_str(date_time_format_str)?)
DateTimeOutputFormat::Strptime(StrptimeParser::from_strptime(date_time_format_str)?)
}
};
Ok(date_time_format)
Expand Down Expand Up @@ -341,7 +217,6 @@ pub(super) fn infer_year(

#[cfg(test)]
mod tests {
use time::macros::datetime;
use time::Month;

use super::*;
Expand Down Expand Up @@ -462,20 +337,6 @@ mod tests {
}
}

#[test]
fn test_strictly_parse_datetime_format() {
let parser = StrptimeParser::from_str("%Y-%m-%d").unwrap();
assert_eq!(
parser.parse_date_time("2021-01-01").unwrap(),
datetime!(2021-01-01 00:00:00 UTC)
);
let error = parser.parse_date_time("2021-01-01TABC").unwrap_err();
assert_eq!(
error,
"datetime string `2021-01-01TABC` does not match strptime format `%Y-%m-%d`"
);
}

#[test]
fn test_infer_year() {
let inferred_year = infer_year(None, Month::January, 2024);
Expand Down
16 changes: 7 additions & 9 deletions quickwit/quickwit-datetime/src/date_time_parsing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,6 @@ pub fn parse_timestamp(timestamp: i64) -> Result<TantivyDateTime, String> {

#[cfg(test)]
mod tests {
use std::str::FromStr;

use time::macros::datetime;
use time::Month;

Expand Down Expand Up @@ -262,7 +260,7 @@ mod tests {
),
];
for (fmt, date_time_str, expected) in test_data {
let parser = StrptimeParser::from_str(fmt).unwrap();
let parser = StrptimeParser::from_strptime(fmt).unwrap();
let result = parser.parse_date_time(date_time_str);
if let Err(error) = &result {
panic!(
Expand All @@ -276,14 +274,14 @@ mod tests {

#[test]
fn test_parse_date_without_time() {
let strptime_parser = StrptimeParser::from_str("%Y-%m-%d").unwrap();
let strptime_parser = StrptimeParser::from_strptime("%Y-%m-%d").unwrap();
let date = strptime_parser.parse_date_time("2012-05-21").unwrap();
assert_eq!(date, datetime!(2012-05-21 00:00:00 UTC));
}

#[test]
fn test_parse_date_am_pm_hour_not_zeroed() {
let strptime_parser = StrptimeParser::from_str("%Y-%m-%d %I:%M:%S %p").unwrap();
let strptime_parser = StrptimeParser::from_strptime("%Y-%m-%d %I:%M:%S %p").unwrap();
let date = strptime_parser
.parse_date_time("2012-05-21 10:05:12 pm")
.unwrap();
Expand All @@ -309,13 +307,13 @@ mod tests {
DateTimeInputFormat::Rfc2822,
DateTimeInputFormat::Rfc3339,
DateTimeInputFormat::Strptime(
StrptimeParser::from_str("%Y-%m-%d %H:%M:%S").unwrap(),
StrptimeParser::from_strptime("%Y-%m-%d %H:%M:%S").unwrap(),
),
DateTimeInputFormat::Strptime(
StrptimeParser::from_str("%Y/%m/%d %H:%M:%S").unwrap(),
StrptimeParser::from_strptime("%Y/%m/%d %H:%M:%S").unwrap(),
),
DateTimeInputFormat::Strptime(
StrptimeParser::from_str("%Y/%m/%d %H:%M:%S %z").unwrap(),
StrptimeParser::from_strptime("%Y/%m/%d %H:%M:%S %z").unwrap(),
),
DateTimeInputFormat::Timestamp,
],
Expand Down Expand Up @@ -452,7 +450,7 @@ mod tests {
DateTimeInputFormat::Iso8601,
DateTimeInputFormat::Rfc3339,
DateTimeInputFormat::Strptime(
StrptimeParser::from_str("%Y-%m-%d %H:%M:%S.%f").unwrap(),
StrptimeParser::from_strptime("%Y-%m-%d %H:%M:%S.%f").unwrap(),
),
],
)
Expand Down
Loading

0 comments on commit 6373edb

Please sign in to comment.