diff --git a/crates/parser/src/format/sanitize/datetime.rs b/crates/parser/src/format/sanitize/datetime.rs index b02bcd7be5..747a6d7ff9 100644 --- a/crates/parser/src/format/sanitize/datetime.rs +++ b/crates/parser/src/format/sanitize/datetime.rs @@ -1,4 +1,4 @@ -use crate::{ParseConfig, Output, format::ParseResult, ParseError}; +use crate::{ParseConfig, Output, format::ParseResult}; use time::macros::format_description; use serde_json::Value; @@ -11,6 +11,11 @@ struct DatetimeSanitizer { fn datetime_to_rfc3339(val: &mut Value, default_offset: time::UtcOffset) { match val { Value::String(s) => { + // We first try to parse a more relaxed format that allows all the different formats we + // support. At this stage we are trying to see if the value we see is a timestamp that + // we can parse at all. If we are successful at parsing this value, then we try to + // parse a more specific format for timestamps *with timezone*. If we are successful, + // we use the parsed timezone, otherwise we use the default offset provided. let primitive_format = format_description!( version = 2, "[year]-[month]-[day][optional [T]][optional [ ]][hour]:[minute]:[second][optional [.[subsecond]]][optional [Z]][optional [z]][optional [[offset_hour]:[offset_minute]]]" @@ -22,9 +27,9 @@ fn datetime_to_rfc3339(val: &mut Value, default_offset: time::UtcOffset) { let offset_format = format_description!( version = 2, "[first - [[year]-[month]-[day] [hour]:[minute]:[second][optional [.[subsecond]]]Z] - [[year]-[month]-[day] [hour]:[minute]:[second][optional [.[subsecond]]]z] - [[year]-[month]-[day] [hour]:[minute]:[second][optional [.[subsecond]]][offset_hour]:[offset_minute]] + [[year]-[month]-[day][optional [T]][optional [ ]][hour]:[minute]:[second][optional [.[subsecond]]]Z] + [[year]-[month]-[day][optional [T]][optional [ ]][hour]:[minute]:[second][optional [.[subsecond]]]z] + [[year]-[month]-[day][optional [T]][optional [ ]][hour]:[minute]:[second][optional [.[subsecond]]][offset_hour]:[offset_minute]] ]" ); @@ -64,9 +69,7 @@ impl Iterator for DatetimeSanitizer { datetime_to_rfc3339(&mut val, self.default_offset); Ok(val) } - Err(e) => { - Err(ParseError::Parse(Box::new(e))) - } + e => e }) } } diff --git a/crates/parser/tests/examples/datetimes-naive.csv b/crates/parser/tests/examples/datetimes-naive.csv deleted file mode 100644 index 3ecbf245be..0000000000 --- a/crates/parser/tests/examples/datetimes-naive.csv +++ /dev/null @@ -1,2 +0,0 @@ -"no_timezone", "no_timezone_fractional", "no_t", "no_t_fractional", "no_t_large_fractional" -"2020-01-01T00:00:00","2020-01-01T00:00:00.000","2020-01-01 00:00:00","2020-01-01 00:00:00.000","2020-01-01 00:00:00.000000000" diff --git a/crates/parser/tests/examples/datetimes.csv b/crates/parser/tests/examples/datetimes.csv deleted file mode 100644 index dca3e0b3cd..0000000000 --- a/crates/parser/tests/examples/datetimes.csv +++ /dev/null @@ -1,2 +0,0 @@ -"no_timezone", "no_timezone_fractional", "rfc3339", "timezone_offset", "no_t", "no_t_fractional", "no_t_fractional_large" -"2020-01-01T00:00:00","2020-01-01T00:00:00.000","2020-01-01T00:00:00Z","2020-01-01 00:00:00+00:00","2020-01-01 00:00:00","2020-01-01 00:00:00.000","2020-01-01 00:00:00.000000000" diff --git a/crates/parser/tests/sanitize_test.rs b/crates/parser/tests/sanitize_test.rs index bf6659ef20..393cf8dd34 100644 --- a/crates/parser/tests/sanitize_test.rs +++ b/crates/parser/tests/sanitize_test.rs @@ -1,13 +1,22 @@ mod testutil; +use std::fs::File; +use std::io::Write; + use parser::ParseConfig; use testutil::{input_for_file, run_test}; +use tempfile::tempdir; + +fn test_sanitize(description: &str, input: &str, expected: &str, default_offset: &str) { + let dir = tempdir().unwrap(); + let path = dir.path().join("sanitize-test.csv"); + let mut f = File::create(path.clone()).unwrap(); + writeln!(f, "header").unwrap(); + writeln!(f, "\"{}\"", input).unwrap(); -#[test] -fn sanitize_datetime_to_rfc3339() { - let path = "tests/examples/datetimes.csv"; let cfg = ParseConfig { - filename: Some(path.to_string()), + filename: Some(path.to_string_lossy().to_string()), + default_offset: default_offset.to_string(), ..Default::default() }; @@ -15,29 +24,30 @@ fn sanitize_datetime_to_rfc3339() { let output = run_test(&cfg, input); output.assert_success(1); - let expected_first_row = "2020-01-01T00:00:00Z"; for value in output.parsed[0].as_object().unwrap().values() { - assert_eq!(expected_first_row, value.as_str().unwrap()) + assert_eq!(expected, value.as_str().unwrap(), "{}", description) } } #[test] -fn sanitize_datetime_to_rfc3339_offset() { - let path = "tests/examples/datetimes-naive.csv"; - let cfg = ParseConfig { - default_offset: "-05:00".to_string(), - filename: Some(path.to_string()), - ..Default::default() - }; - - let input = input_for_file(path); - let output = run_test(&cfg, input); - output.assert_success(1); +fn sanitize_datetime_to_rfc3339() { + // With Timezone + test_sanitize("tz rfc3339 utc" , "2020-01-01T12:34:56Z" , "2020-01-01T12:34:56Z" , "+00:00"); + test_sanitize("tz rfc3339 offset" , "2020-01-01T12:34:56-04:00" , "2020-01-01T12:34:56-04:00" , "+00:00"); + test_sanitize("tz rfc3339 fractional" , "2020-01-01T12:34:56.999999999Z" , "2020-01-01T12:34:56.999999999Z" , "+00:00"); + test_sanitize("tz rfc3339 fractional + offset" , "2020-01-01T12:34:56.999999999-04:00" , "2020-01-01T12:34:56.999999999-04:00" , "+00:00"); + test_sanitize("tz spaced fractional + offset" , "2020-01-01 12:34:56.999999999-04:00" , "2020-01-01T12:34:56.999999999-04:00" , "+00:00"); + test_sanitize("tz spaced fractional + utc" , "2020-01-01 12:34:56.999999999Z" , "2020-01-01T12:34:56.999999999Z" , "+00:00"); + test_sanitize("tz spaced offset" , "2020-01-01 12:34:56-04:00" , "2020-01-01T12:34:56-04:00" , "+00:00"); + test_sanitize("tz spaced utc" , "2020-01-01 12:34:56Z" , "2020-01-01T12:34:56Z" , "+00:00"); - let expected_first_row = "2020-01-01T00:00:00-05:00"; - for value in output.parsed[0].as_object().unwrap().values() { - assert_eq!(expected_first_row, value.as_str().unwrap()) - } + // Without Timezone + test_sanitize("naive t" , "2020-01-01T12:34:56" , "2020-01-01T12:34:56Z" , "+00:00"); + test_sanitize("naive t + fractional" , "2020-01-01T12:34:56.999999999" , "2020-01-01T12:34:56.999999999Z" , "+00:00"); + test_sanitize("naive t + fractional 2" , "2020-01-01T12:34:56.999999999" , "2020-01-01T12:34:56.999999999+04:00" , "+04:00"); + test_sanitize("naive space" , "2020-01-01 12:34:56" , "2020-01-01T12:34:56Z" , "+00:00"); + test_sanitize("naive space + fractional" , "2020-01-01 12:34:56.999999999" , "2020-01-01T12:34:56.999999999Z" , "+00:00"); + test_sanitize("naive space + fractional 2" , "2020-01-01 12:34:56.999999999" , "2020-01-01T12:34:56.999999999+04:00" , "+04:00"); } #[test]