From c98f8affaa57bc5c03993e5727bc9083236f33d6 Mon Sep 17 00:00:00 2001 From: Adrien Guillo Date: Wed, 19 Jul 2023 18:02:13 +0900 Subject: [PATCH] Parse timestamp strings (#3639) --- docs/configuration/index-config.md | 30 ++++-- .../src/date_time_parsing.rs | 100 +++++++++++++----- quickwit/quickwit-datetime/src/lib.rs | 2 +- .../src/default_doc_mapper/date_time_type.rs | 4 +- 4 files changed, 99 insertions(+), 37 deletions(-) diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md index fbcdce812a2..b692eeb160b 100644 --- a/docs/configuration/index-config.md +++ b/docs/configuration/index-config.md @@ -187,7 +187,19 @@ fast: true #### `datetime` type -The `datetime` type handles dates and datetimes. Each `datetime` field can be configured to support multiple input formats. +The `datetime` type handles dates and datetimes. Since JSON doesn’t have a date type, the `datetime` field supports multiple input types and formats. The supported input types are: +- floating-point or integer numbers representing a Unix timestamp +- strings containing a formatted date, datetime, or Unix timestamp + +The `input_formats` field parameter specifies the accepted date formats. The following input formats are natively supported: +- `iso8601` +- `rfc2822` +- `rfc3339` +- `strptime` +- `unix_timestamp` + +**Input formats** + When specifying multiple input formats, the corresponding parsers are attempted in the order they are declared. The following formats are natively supported: - `iso8601`, `rfc2822`, `rfc3339`: parse dates using standard ISO and RFC formats. - `strptime`: parse dates using the Unix [strptime](https://man7.org/linux/man-pages/man3/strptime.3.html) format with some variations: @@ -195,23 +207,23 @@ When specifying multiple input formats, the corresponding parsers are attempted - `%f` for milliseconds precision support. - `%z` timezone offsets can be specified as `(+|-)hhmm` or `(+|-)hh:mm`. 
-- `unix_timestamp`: parse float and integer numbers to Unix timestamps. Floating-point values are converted to timestamps expressed in seconds. Integer values are converted to Unix timestamps whose precision determined in `seconds`, `milliseconds`, `microseconds`, or `nanoseconds` is inferred from the number of input digits. Internally, datetimes are stored as `i64`, and Quickwit only supports timestamp values ranging from `Apr 13, 1972 23:59:55` to `Mar 16, 2242 12:56:31` as a result. +:::warning +The timezone name format specifier (`%Z`) is not supported currently. +::: + +- `unix_timestamp`: parse float and integer numbers to Unix timestamps. Floating-point values are converted to timestamps expressed in seconds. Integer values are converted to Unix timestamps whose precision, determined in `seconds`, `milliseconds`, `microseconds`, or `nanoseconds`, is inferred from the number of input digits. Internally, datetimes are converted to UTC (if the time zone is specified) and stored as *i64* integers. As a result, Quickwit only supports timestamp values ranging from `Apr 13, 1972 23:59:55` to `Mar 16, 2242 12:56:31`. :::warning -We discourage ingesting decimal timestamps because the conversion occurs with a loss of precision in some cases. Prefer integer values instead. +Converting timestamps from float to integer values may occur with a loss of precision. ::: -When a `datetime` field is stored as a fast field, the `precision` parameter indicates the precision used to truncate the values before encoding, which improves compression (truncation here means zeroing). The `precision` parameter can take the following values: `seconds`, `milliseconds`, `microseconds`, or `nanoseconds`. It only affects what is stored in fast fields when a `datetime` field is marked as fast field. Finally, operations on `datetime` fastfields, e.g. via aggregations, need to be done at the nanosecond level. 
+When a `datetime` field is stored as a fast field, the `precision` parameter indicates the precision used to truncate the values before encoding, which improves compression (truncation here means zeroing). The `precision` parameter can take the following values: `seconds`, `milliseconds`, `microseconds`, or `nanoseconds`. It only affects what is stored in fast fields when a `datetime` field is marked as "fast". Finally, operations on `datetime` fast fields, e.g. via aggregations, need to be done at the nanosecond level. :::info Internally `datetime` is stored in `nanoseconds` in fast fields and in the docstore, and in `seconds` in the term dictionary. ::: -:::warning -The timezone name format specifier (`%Z`) is not currently supported in `strptime` format. -::: - -In addition, Quickwit supports the `output_format` field option to specify with which precision datetimes are deserialized. This options supports the same value as input formats except for `unix_timestamp` which is replaced by the following formats: +In addition, Quickwit supports the `output_format` field parameter to specify with which precision datetimes are deserialized. This parameter supports the same value as input formats except for `unix_timestamp` which is replaced by the following formats: - `unix_timestamp_secs`: displays timestamps in seconds. - `unix_timestamp_millis`: displays timestamps in milliseconds. - `unix_timestamp_micros`: displays timestamps in microseconds. 
diff --git a/quickwit/quickwit-datetime/src/date_time_parsing.rs b/quickwit/quickwit-datetime/src/date_time_parsing.rs index 58b623b2b63..464280a202b 100644 --- a/quickwit/quickwit-datetime/src/date_time_parsing.rs +++ b/quickwit/quickwit-datetime/src/date_time_parsing.rs @@ -37,23 +37,24 @@ pub fn parse_date_time_str( date_time_formats: &[DateTimeInputFormat], ) -> Result<TantivyDateTime, String> { for date_time_format in date_time_formats { - let date_time_res = match date_time_format { - DateTimeInputFormat::Iso8601 => { - parse_iso8601(date_time_str).map(TantivyDateTime::from_utc) - } - DateTimeInputFormat::Rfc2822 => { - parse_rfc2822(date_time_str).map(TantivyDateTime::from_utc) - } - DateTimeInputFormat::Rfc3339 => { - parse_rfc3339(date_time_str).map(TantivyDateTime::from_utc) - } + let date_time_opt = match date_time_format { + DateTimeInputFormat::Iso8601 => parse_iso8601(date_time_str) + .map(TantivyDateTime::from_utc) + .ok(), + DateTimeInputFormat::Rfc2822 => parse_rfc2822(date_time_str) + .map(TantivyDateTime::from_utc) + .ok(), + DateTimeInputFormat::Rfc3339 => parse_rfc3339(date_time_str) + .map(TantivyDateTime::from_utc) + .ok(), DateTimeInputFormat::Strptime(parser) => parser .parse_date_time(date_time_str) - .map(TantivyDateTime::from_utc), - _ => continue, + .map(TantivyDateTime::from_utc) + .ok(), + DateTimeInputFormat::Timestamp => parse_timestamp_str(date_time_str), }; - if date_time_res.is_ok() { - return date_time_res; + if let Some(date_time) = date_time_opt { + return Ok(date_time); } } Err(format!( @@ -65,7 +66,7 @@ pub fn parse_date_time_str( )) } -pub fn parse_date_time_float( +pub fn parse_timestamp_float( timestamp: f64, date_time_formats: &[DateTimeInputFormat], ) -> Result<TantivyDateTime, String> { @@ -84,7 +85,7 @@ pub fn parse_date_time_float( Ok(TantivyDateTime::from_timestamp_nanos(timestamp_nanos)) } -pub fn parse_date_time_int( +pub fn parse_timestamp_int( timestamp: i64, date_time_formats: &[DateTimeInputFormat], ) -> Result<TantivyDateTime, String> { @@ -100,6 +101,31 @@ pub fn 
parse_date_time_int( parse_timestamp(timestamp) } +pub fn parse_timestamp_str(timestamp_str: &str) -> Option<TantivyDateTime> { + if let Ok(timestamp) = timestamp_str.parse::<i64>() { + return parse_timestamp(timestamp).ok(); + } + if let Some((timestamp_secs_str, subsecond_digits_str)) = timestamp_str.split_once('.') { + if subsecond_digits_str.is_empty() { + return parse_timestamp_str(timestamp_secs_str); + } + if let Ok(timestamp_secs) = timestamp_secs_str.parse::<i64>() { + if (MIN_TIMESTAMP_SECONDS..=MAX_TIMESTAMP_SECONDS).contains(&timestamp_secs) { + let num_subsecond_digits = subsecond_digits_str.len().min(9); + + if let Ok(subsecond_digits) = + subsecond_digits_str[..num_subsecond_digits].parse::<i64>() + { + let nanos = subsecond_digits * 10i64.pow(9 - num_subsecond_digits as u32); + let timestamp_nanos = timestamp_secs * 1_000_000_000 + nanos; + return Some(TantivyDateTime::from_timestamp_nanos(timestamp_nanos)); + } + } + } + } + None +} + /// Parses a ISO8601 date. fn parse_iso8601(value: &str) -> Result<OffsetDateTime, String> { OffsetDateTime::parse(value, &Iso8601::DEFAULT).map_err(|error| error.to_string()) @@ -242,6 +268,8 @@ mod tests { "2012-05-21 12:09:14", "2012/05/21 12:09:14", "2012/05/21 12:09:14 +00:00", + "1337602154", + "1337602154.0", ] { let date_time = parse_date_time_str( date_time_str, @@ -258,6 +286,7 @@ mod tests { DateTimeInputFormat::Strptime( StrptimeParser::from_str("%Y/%m/%d %H:%M:%S %z").unwrap(), ), + DateTimeInputFormat::Timestamp, ], ) .unwrap(); @@ -278,10 +307,10 @@ mod tests { } #[test] - fn test_parse_date_time_float() { + fn test_parse_timestamp_float() { let unix_ts_secs = OffsetDateTime::now_utc().unix_timestamp(); { - let date_time = parse_date_time_float( + let date_time = parse_timestamp_float( unix_ts_secs as f64, &[DateTimeInputFormat::Iso8601, DateTimeInputFormat::Timestamp], ) @@ -289,7 +318,7 @@ mod tests { assert_eq!(date_time.into_timestamp_millis(), unix_ts_secs * 1_000); } { - let date_time = parse_date_time_float( + let date_time = parse_timestamp_float( 
unix_ts_secs as f64 + 0.1230, &[DateTimeInputFormat::Iso8601, DateTimeInputFormat::Timestamp], ) @@ -297,7 +326,7 @@ mod tests { assert!((date_time.into_timestamp_millis() - (unix_ts_secs * 1_000 + 123)).abs() <= 1); } { - let date_time = parse_date_time_float( + let date_time = parse_timestamp_float( unix_ts_secs as f64 + 0.1234560, &[DateTimeInputFormat::Iso8601, DateTimeInputFormat::Timestamp], ) @@ -308,7 +337,7 @@ mod tests { ); } { - let date_time = parse_date_time_float( + let date_time = parse_timestamp_float( unix_ts_secs as f64 + 0.123456789, &[DateTimeInputFormat::Iso8601, DateTimeInputFormat::Timestamp], ) @@ -320,7 +349,7 @@ mod tests { ); } { - let error = parse_date_time_float( + let error = parse_timestamp_float( 1668730394917.01, &[DateTimeInputFormat::Iso8601, DateTimeInputFormat::Rfc2822], ) @@ -334,10 +363,10 @@ mod tests { } #[test] - fn test_parse_date_time_int() { + fn test_parse_timestamp_int() { { let unix_ts_secs = OffsetDateTime::now_utc().unix_timestamp(); - let date_time = parse_date_time_int( + let date_time = parse_timestamp_int( unix_ts_secs, &[DateTimeInputFormat::Iso8601, DateTimeInputFormat::Timestamp], ) @@ -345,7 +374,7 @@ mod tests { assert_eq!(date_time.into_timestamp_secs(), unix_ts_secs); } { - let error = parse_date_time_int( + let error = parse_timestamp_int( 1668730394917, &[DateTimeInputFormat::Iso8601, DateTimeInputFormat::Rfc2822], ) @@ -358,6 +387,27 @@ mod tests { } } + #[test] + fn test_parse_timestamp_str() { + let date_time = parse_timestamp_str("123456789").unwrap(); + assert_eq!(date_time.into_timestamp_secs(), 123456789); + + let date_time = parse_timestamp_str("123456789.").unwrap(); + assert_eq!(date_time.into_timestamp_secs(), 123456789); + + let date_time = parse_timestamp_str("123456789.0").unwrap(); + assert_eq!(date_time.into_timestamp_secs(), 123456789); + + let date_time = parse_timestamp_str("123456789.1").unwrap(); + assert_eq!(date_time.into_timestamp_millis(), 123456789100); + + let date_time = 
parse_timestamp_str("123456789.100000001").unwrap(); + assert_eq!(date_time.into_timestamp_nanos(), 123456789100000001); + + let date_time = parse_timestamp_str("123456789.1000000011").unwrap(); + assert_eq!(date_time.into_timestamp_nanos(), 123456789100000001); + } + #[test] fn test_parse_date_time_millis() { for date_time_str in [ diff --git a/quickwit/quickwit-datetime/src/lib.rs b/quickwit/quickwit-datetime/src/lib.rs index 66bda263431..ffb28781741 100644 --- a/quickwit/quickwit-datetime/src/lib.rs +++ b/quickwit/quickwit-datetime/src/lib.rs @@ -22,6 +22,6 @@ mod date_time_parsing; pub use date_time_format::{DateTimeInputFormat, DateTimeOutputFormat, StrptimeParser}; pub use date_time_parsing::{ - parse_date_time_float, parse_date_time_int, parse_date_time_str, parse_timestamp, + parse_date_time_str, parse_timestamp, parse_timestamp_float, parse_timestamp_int, }; pub use tantivy::DateTime as TantivyDateTime; diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs index 3740987847a..52eb64bafd5 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs @@ -77,9 +77,9 @@ impl QuickwitDateTimeOptions { // `.as_f64()` actually converts floats to integers, so we must check for integers // first. if let Some(timestamp_i64) = timestamp.as_i64() { - quickwit_datetime::parse_date_time_int(timestamp_i64, &self.input_formats.0)? + quickwit_datetime::parse_timestamp_int(timestamp_i64, &self.input_formats.0)? } else if let Some(timestamp_f64) = timestamp.as_f64() { - quickwit_datetime::parse_date_time_float(timestamp_f64, &self.input_formats.0)? + quickwit_datetime::parse_timestamp_float(timestamp_f64, &self.input_formats.0)? } else { return Err(format!( "Failed to parse datetime `{timestamp:?}`: value is larger than i64::MAX.",