From 3dd6d5dcc5eaefdb0855bb19f4982b787747722a Mon Sep 17 00:00:00 2001 From: Mahdi Dibaiee Date: Thu, 5 Oct 2023 17:17:55 +0100 Subject: [PATCH] parser: improve performance by first parsing a naive date --- Cargo.toml | 1 - crates/parser/src/format/sanitize/datetime.rs | 36 +++++++++---------- crates/parser/tests/sanitize_test.rs | 2 -- go/parser/config.go | 4 +-- 4 files changed, 20 insertions(+), 23 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 65f0b0e749..57b200f52e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,7 +40,6 @@ byteorder = "1.4" caseless = "0.2" chardetng = "0.1" chrono = { version = "0.4", features = ["serde"] } -chrono-tz = { version = "0.8" } clap = { version = "3.2", features = ["derive", "env"] } colored_json = "3" comfy-table = "6.1" diff --git a/crates/parser/src/format/sanitize/datetime.rs b/crates/parser/src/format/sanitize/datetime.rs index 8f616df962..b02bcd7be5 100644 --- a/crates/parser/src/format/sanitize/datetime.rs +++ b/crates/parser/src/format/sanitize/datetime.rs @@ -7,31 +7,33 @@ struct DatetimeSanitizer { default_offset: time::UtcOffset, } -// Here we are trying to parse non-ambiguous, non-RFC3339 dates and formatting them as RFC3339 -// So we skip any valid RFC3339 in our processing and pass it as-is +// Here we are trying to parse non-RFC3339 dates fn datetime_to_rfc3339(val: &mut Value, default_offset: time::UtcOffset) { match val { Value::String(s) => { - let offset_format = format_description!( - version = 2, - "[first - [[year]-[month]-[day] [hour]:[minute]:[second][optional [.[subsecond]]]Z] - [[year]-[month]-[day] [hour]:[minute]:[second][optional [.[subsecond]]]z] - [[year]-[month]-[day] [hour]:[minute]:[second][optional [.[subsecond]]][offset_hour]:[offset_minute]] - ]" - ); - let primitive_format = format_description!( version = 2, - "[year]-[month]-[day][optional [T]][optional [ ]][hour]:[minute]:[second][optional [.[subsecond]]]" + "[year]-[month]-[day][optional [T]][optional [ ]][hour]:[minute]:[second][optional [.[subsecond]]][optional [Z]][optional [z]][optional [[offset_hour]:[offset_minute]]]" ); - let parsed_with_tz = time::OffsetDateTime::parse(&s, offset_format); - let parsed_no_tz = time::PrimitiveDateTime::parse(&s, primitive_format); + let parsed_no_tz = time::PrimitiveDateTime::parse(&s, primitive_format).ok(); + + let parsed_with_tz = if parsed_no_tz.is_some() { + let offset_format = format_description!( + version = 2, + "[first + [[year]-[month]-[day] [hour]:[minute]:[second][optional [.[subsecond]]]Z] + [[year]-[month]-[day] [hour]:[minute]:[second][optional [.[subsecond]]]z] + [[year]-[month]-[day] [hour]:[minute]:[second][optional [.[subsecond]]][offset_hour]:[offset_minute]] + ]" + ); + + time::OffsetDateTime::parse(&s, offset_format).ok() + } else { None }; - if let Ok(parsed) = parsed_with_tz { + if let Some(parsed) = parsed_with_tz { *s = parsed.format(&time::format_description::well_known::Rfc3339).unwrap(); - } else if let Ok(parsed) = parsed_no_tz { + } else if let Some(parsed) = parsed_no_tz { *s = parsed.assume_offset(default_offset).format(&time::format_description::well_known::Rfc3339).unwrap(); } } @@ -76,9 +78,7 @@ pub enum DatetimeSanitizeError { } pub fn sanitize_datetime(config: &ParseConfig, output: Output) -> Result { - eprintln!("sanitize_datetime"); let offset = time::UtcOffset::parse(&config.default_offset, format_description!("[offset_hour]:[offset_minute]")).map_err(DatetimeSanitizeError::OffsetParseError)?; - eprintln!("offset: {:?}", offset); let sanitizer = DatetimeSanitizer { from: output, default_offset: offset, diff --git a/crates/parser/tests/sanitize_test.rs b/crates/parser/tests/sanitize_test.rs index 8dacd082d9..bf6659ef20 100644 --- a/crates/parser/tests/sanitize_test.rs +++ b/crates/parser/tests/sanitize_test.rs @@ -1,7 +1,5 @@ mod testutil; -use chrono::DateTime; - use parser::ParseConfig; use testutil::{input_for_file, run_test}; diff --git a/go/parser/config.go b/go/parser/config.go index 5395eba9b4..e0f2ff53e5 100644 --- a/go/parser/config.go +++ b/go/parser/config.go @@ -18,7 +18,7 @@ type Config struct { Compression string `json:"compression,omitempty"` ContentType string `json:"contentType,omitempty"` ContentEncoding string `json:"contentEncoding,omitempty"` - DefaultTimezone string `json:"defaultTimezone,omitempty"` + DefaultOffset string `json:"defaultOffset,omitempty"` } func (c *Config) Copy() Config { @@ -40,7 +40,7 @@ func (c *Config) Copy() Config { Compression: c.Compression, ContentType: c.ContentType, ContentEncoding: c.ContentEncoding, - DefaultTimezone: c.DefaultTimezone, + DefaultOffset: c.DefaultOffset, } }