Skip to content

Commit

Permalink
Allowing custom date
Browse files Browse the repository at this point in the history
  • Loading branch information
fulmicoton committed Jul 19, 2024
1 parent d7c5d28 commit 8910cd7
Show file tree
Hide file tree
Showing 8 changed files with 446 additions and 363 deletions.
447 changes: 224 additions & 223 deletions quickwit/Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion quickwit/quickwit-datetime/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ license.workspace = true
[dependencies]
anyhow = { workspace = true }
itertools = { workspace = true }
ouroboros = "0.18.0"
regex = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tantivy = { workspace = true }
Expand Down
153 changes: 113 additions & 40 deletions quickwit/quickwit-datetime/src/date_time_format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,60 +19,123 @@

use std::fmt::Display;
use std::str::FromStr;
use std::sync::OnceLock;

use ouroboros::self_referencing;
use serde::de::Error;
use serde::{Deserialize, Deserializer, Serialize};
use serde_json::Value as JsonValue;
use time::error::Format;
use time::format_description::modifier::{Day, Month as MonthModifier, Padding, Year, YearRepr};
use time::format_description::well_known::{Iso8601, Rfc2822, Rfc3339};
use time::format_description::FormatItem;
use time::format_description::{Component, OwnedFormatItem};
use time::parsing::Parsed;
use time::{Month, OffsetDateTime, PrimitiveDateTime};
use time_fmt::parse::time_format_item::parse_to_format_item;

use crate::TantivyDateTime;
use crate::{RegexTokenizer, TantivyDateTime};

/// Wraps raw bytes into an owned literal format item.
///
/// The bytes are copied into a freshly allocated boxed slice, so the returned item does not
/// borrow from `s`.
fn literal(s: &[u8]) -> OwnedFormatItem {
    OwnedFormatItem::Literal(Box::from(s))
}

fn build_month_item(ptn: &str) -> Option<OwnedFormatItem> {
let mut month: MonthModifier = Default::default();
if ptn.len() == 2 {
month.padding = Padding::Zero;
} else {
month.padding = Padding::None;
}
Some(OwnedFormatItem::Component(Component::Month(month)))
}

/// Builds a year component from a year pattern.
///
/// A four-character pattern selects the full year representation; any other length selects the
/// last-two-digits representation. Always returns `Some`.
fn build_year_item(ptn: &str) -> Option<OwnedFormatItem> {
    let mut modifier = Year::default();
    modifier.repr = match ptn.len() {
        4 => YearRepr::Full,
        _ => YearRepr::LastTwo,
    };
    Some(OwnedFormatItem::Component(Component::Year(modifier)))
}

/// Builds a day component from a day pattern.
///
/// A two-character pattern yields a zero-padded day; any other length yields an unpadded one.
/// Always returns `Some`.
fn build_day_item(ptn: &str) -> Option<OwnedFormatItem> {
    let mut modifier = Day::default();
    modifier.padding = match ptn.len() {
        2 => Padding::Zero,
        _ => Padding::None,
    };
    Some(OwnedFormatItem::Component(Component::Day(modifier)))
}

// Elasticsearch/OpenSearch uses a set of preconfigured formats, more information could be found
// here https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html
fn java_date_format_tokenizer() -> &'static RegexTokenizer<OwnedFormatItem> {
static JAVA_DATE_FORMAT_TOKENIZER: OnceLock<RegexTokenizer<OwnedFormatItem>> = OnceLock::new();
&*JAVA_DATE_FORMAT_TOKENIZER.get_or_init(|| {
super::RegexTokenizer::new(vec![
(r#"yy(yy)?"#, build_year_item),
(r#"MM?"#, build_month_item),
(r#"dd?"#, build_day_item),
(r#"''"#, |_| { Some(literal(b"'")) }),
(r#"'[^']+'"#, |s| { Some(literal(s[1..s.len() - 1].as_bytes())) }),
(r#"[^\w\[\]{}]"#, |s| { Some(literal(s.as_bytes())) }),
]).unwrap()
})
}

/// A date time parser that holds the format specification `Vec<OwnedFormatItem>`.
#[self_referencing]
#[derive(Clone)]
pub struct StrptimeParser {
strptime_format: String,
with_timezone: bool,
#[borrows(strptime_format)]
#[covariant]
items: Vec<FormatItem<'this>>,
items: Vec<OwnedFormatItem>,
}

impl FromStr for StrptimeParser {
type Err = String;
impl StrptimeParser {

fn from_str(strptime_format: &str) -> Result<Self, Self::Err> {
StrptimeParser::try_new(
strptime_format.to_string(),
strptime_format.to_lowercase().contains("%z"),
|strptime_format: &String| {
parse_to_format_item(strptime_format).map_err(|error| {
format!("invalid strptime format `{strptime_format}`: {error}")
})
},
)
/// Builds a parser from a `strptime`-style format string.
///
/// The format is parsed with `parse_to_format_item` and every resulting item is converted
/// into an `OwnedFormatItem`. The parser is flagged as timezone-aware when the lowercased
/// format contains `%z`.
///
/// # Errors
///
/// Returns an error message when `parse_to_format_item` rejects the format.
pub fn from_strptime(strptime_format: &str) -> Result<StrptimeParser, String> {
    let borrowed_items = parse_to_format_item(strptime_format)
        .map_err(|error| format!("invalid strptime format `{strptime_format}`: {error}"))?;
    let items: Vec<OwnedFormatItem> = borrowed_items.into_iter().map(Into::into).collect();
    let with_timezone = strptime_format.to_lowercase().contains("%z");
    Ok(StrptimeParser {
        strptime_format: strptime_format.to_string(),
        with_timezone,
        items,
    })
}

/// Builds a parser from a Java-style date format (e.g. `yyyy MM dd`).
///
/// The format is tokenized with `java_date_format_tokenizer`. The original format string is
/// stored as-is in `strptime_format`, and the parser is never flagged as timezone-aware.
///
/// # Errors
///
/// Returns an error message carrying the position reported by the tokenizer when part of
/// the format is not recognized.
pub fn from_java_datetime_format(java_datetime_format: &str) -> Result<StrptimeParser, String> {
    match java_date_format_tokenizer().tokenize(java_datetime_format) {
        Ok(items) => Ok(StrptimeParser {
            strptime_format: java_datetime_format.to_string(),
            with_timezone: false,
            items,
        }),
        Err(pos) => Err(format!(
            "failed to parse date format `{java_datetime_format}`. Pattern at pos {pos} is not recognized."
        )),
    }
}
}

impl StrptimeParser {
/// Parse a given date according to the datetime format specified during the StrptimeParser
/// creation. If the date format does not provide a specific time, the time will be set to
/// 00:00:00.
fn parse_primitive_date_time(&self, date_time_str: &str) -> anyhow::Result<PrimitiveDateTime> {
let mut parsed = Parsed::new();
if !parsed
.parse_items(date_time_str.as_bytes(), self.borrow_items())?
.parse_items(date_time_str.as_bytes(), &self.items)?
.is_empty()
{
anyhow::bail!(
"datetime string `{}` does not match strptime format `{}`",
date_time_str,
self.borrow_strptime_format()
&self.strptime_format
);
}
// The parsed datetime contains a date but seems to be missing "time".
Expand All @@ -94,8 +157,8 @@ impl StrptimeParser {
}

pub fn parse_date_time(&self, date_time_str: &str) -> Result<OffsetDateTime, String> {
if *self.borrow_with_timezone() {
OffsetDateTime::parse(date_time_str, self.borrow_items()).map_err(|err| err.to_string())
if self.with_timezone {
OffsetDateTime::parse(date_time_str, &self.items).map_err(|err| err.to_string())
} else {
self.parse_primitive_date_time(date_time_str)
.map(|date_time| date_time.assume_utc())
Expand All @@ -104,20 +167,14 @@ impl StrptimeParser {
}

pub fn format_date_time(&self, date_time: &OffsetDateTime) -> Result<String, Format> {
date_time.format(self.borrow_items())
date_time.format(&self.items)
}
}

impl Clone for StrptimeParser {
fn clone(&self) -> Self {
// `self.format` is already known to be a valid format.
Self::from_str(self.borrow_strptime_format().as_str()).unwrap()
}
}

impl PartialEq for StrptimeParser {
fn eq(&self, other: &Self) -> bool {
self.borrow_strptime_format() == other.borrow_strptime_format()
self.strptime_format == other.strptime_format
}
}

Expand All @@ -127,14 +184,14 @@ impl std::fmt::Debug for StrptimeParser {
fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
formatter
.debug_struct("StrptimeParser")
.field("format", &self.borrow_strptime_format())
.field("format", &self.strptime_format)
.finish()
}
}

impl std::hash::Hash for StrptimeParser {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.borrow_strptime_format().hash(state);
self.strptime_format.hash(state);
}
}

Expand Down Expand Up @@ -170,7 +227,7 @@ impl DateTimeInputFormat {
DateTimeInputFormat::Iso8601 => "iso8601",
DateTimeInputFormat::Rfc2822 => "rfc2822",
DateTimeInputFormat::Rfc3339 => "rfc3339",
DateTimeInputFormat::Strptime(parser) => parser.borrow_strptime_format(),
DateTimeInputFormat::Strptime(parser) => parser.strptime_format.as_str(),
DateTimeInputFormat::Timestamp => "unix_timestamp",
}
}
Expand Down Expand Up @@ -198,7 +255,7 @@ impl FromStr for DateTimeInputFormat {
format must contain at least one `strftime` special characters"
));
}
DateTimeInputFormat::Strptime(StrptimeParser::from_str(date_time_format_str)?)
DateTimeInputFormat::Strptime(StrptimeParser::from_strptime(date_time_format_str)?)
}
};
Ok(date_time_format)
Expand Down Expand Up @@ -241,7 +298,7 @@ impl DateTimeOutputFormat {
DateTimeOutputFormat::Iso8601 => "iso8601",
DateTimeOutputFormat::Rfc2822 => "rfc2822",
DateTimeOutputFormat::Rfc3339 => "rfc3339",
DateTimeOutputFormat::Strptime(parser) => parser.borrow_strptime_format(),
DateTimeOutputFormat::Strptime(parser) => parser.strptime_format.as_str(),
DateTimeOutputFormat::TimestampSecs => "unix_timestamp_secs",
DateTimeOutputFormat::TimestampMillis => "unix_timestamp_millis",
DateTimeOutputFormat::TimestampMicros => "unix_timestamp_micros",
Expand Down Expand Up @@ -300,7 +357,7 @@ impl FromStr for DateTimeOutputFormat {
format must contain at least one `strftime` special characters"
));
}
DateTimeOutputFormat::Strptime(StrptimeParser::from_str(date_time_format_str)?)
DateTimeOutputFormat::Strptime(StrptimeParser::from_strptime(date_time_format_str)?)
}
};
Ok(date_time_format)
Expand Down Expand Up @@ -464,7 +521,7 @@ mod tests {

#[test]
fn test_strictly_parse_datetime_format() {
let parser = StrptimeParser::from_str("%Y-%m-%d").unwrap();
let parser = StrptimeParser::from_strptime("%Y-%m-%d").unwrap();
assert_eq!(
parser.parse_date_time("2021-01-01").unwrap(),
datetime!(2021-01-01 00:00:00 UTC)
Expand All @@ -476,6 +533,22 @@ mod tests {
);
}


#[test]
fn test_parse_java_datetime_format() {
    // Each case is (java format, input string); all of them must parse to the same instant.
    let expected = datetime!(2021-01-01 00:00:00 UTC);
    let cases = [("yyyy MM dd", "2021 01 01"), ("yyyy!MM?dd", "2021!01?01")];
    for (java_format, input) in cases {
        let parser = StrptimeParser::from_java_datetime_format(java_format).unwrap();
        assert_eq!(parser.parse_date_time(input).unwrap(), expected);
    }
}

#[test]
fn test_infer_year() {
let inferred_year = infer_year(None, Month::January, 2024);
Expand Down
16 changes: 7 additions & 9 deletions quickwit/quickwit-datetime/src/date_time_parsing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,6 @@ pub fn parse_timestamp(timestamp: i64) -> Result<TantivyDateTime, String> {

#[cfg(test)]
mod tests {
use std::str::FromStr;

use time::macros::datetime;
use time::Month;

Expand Down Expand Up @@ -262,7 +260,7 @@ mod tests {
),
];
for (fmt, date_time_str, expected) in test_data {
let parser = StrptimeParser::from_str(fmt).unwrap();
let parser = StrptimeParser::from_strptime(fmt).unwrap();
let result = parser.parse_date_time(date_time_str);
if let Err(error) = &result {
panic!(
Expand All @@ -276,14 +274,14 @@ mod tests {

#[test]
fn test_parse_date_without_time() {
let strptime_parser = StrptimeParser::from_str("%Y-%m-%d").unwrap();
let strptime_parser = StrptimeParser::from_strptime("%Y-%m-%d").unwrap();
let date = strptime_parser.parse_date_time("2012-05-21").unwrap();
assert_eq!(date, datetime!(2012-05-21 00:00:00 UTC));
}

#[test]
fn test_parse_date_am_pm_hour_not_zeroed() {
let strptime_parser = StrptimeParser::from_str("%Y-%m-%d %I:%M:%S %p").unwrap();
let strptime_parser = StrptimeParser::from_strptime("%Y-%m-%d %I:%M:%S %p").unwrap();
let date = strptime_parser
.parse_date_time("2012-05-21 10:05:12 pm")
.unwrap();
Expand All @@ -309,13 +307,13 @@ mod tests {
DateTimeInputFormat::Rfc2822,
DateTimeInputFormat::Rfc3339,
DateTimeInputFormat::Strptime(
StrptimeParser::from_str("%Y-%m-%d %H:%M:%S").unwrap(),
StrptimeParser::from_strptime("%Y-%m-%d %H:%M:%S").unwrap(),
),
DateTimeInputFormat::Strptime(
StrptimeParser::from_str("%Y/%m/%d %H:%M:%S").unwrap(),
StrptimeParser::from_strptime("%Y/%m/%d %H:%M:%S").unwrap(),
),
DateTimeInputFormat::Strptime(
StrptimeParser::from_str("%Y/%m/%d %H:%M:%S %z").unwrap(),
StrptimeParser::from_strptime("%Y/%m/%d %H:%M:%S %z").unwrap(),
),
DateTimeInputFormat::Timestamp,
],
Expand Down Expand Up @@ -452,7 +450,7 @@ mod tests {
DateTimeInputFormat::Iso8601,
DateTimeInputFormat::Rfc3339,
DateTimeInputFormat::Strptime(
StrptimeParser::from_str("%Y-%m-%d %H:%M:%S.%f").unwrap(),
StrptimeParser::from_strptime("%Y-%m-%d %H:%M:%S.%f").unwrap(),
),
],
)
Expand Down
2 changes: 2 additions & 0 deletions quickwit/quickwit-datetime/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@

mod date_time_format;
mod date_time_parsing;
mod regex_tokenizer;

pub(crate) use regex_tokenizer::RegexTokenizer;
pub use date_time_format::{DateTimeInputFormat, DateTimeOutputFormat, StrptimeParser};
pub use date_time_parsing::{
parse_date_time_str, parse_timestamp, parse_timestamp_float, parse_timestamp_int,
Expand Down
Loading

0 comments on commit 8910cd7

Please sign in to comment.