-
Notifications
You must be signed in to change notification settings - Fork 60
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
parser: sanitize timestamps to RFC3339
- Loading branch information
Showing
12 changed files
with
277 additions
and
3 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
use crate::{ParseConfig, Output, format::ParseResult, ParseError}; | ||
use chrono::{DateTime, FixedOffset, SecondsFormat}; | ||
use chrono_tz::Tz; | ||
use serde_json::Value; | ||
|
||
struct DatetimeSanitizer { | ||
from: Output, | ||
default_timezone: Tz, | ||
} | ||
|
||
/// `strftime`-style layouts for timestamps that carry no UTC offset.
///
/// Strings matching one of these are interpreted in the configured default
/// timezone. Both the `T`-separated and the space-separated date/time forms
/// are covered, each with and without a fractional-seconds component.
const NAIVE_FORMATS: [&str; 4] = [
    "%Y-%m-%dT%H:%M:%S",
    "%Y-%m-%dT%H:%M:%S%.3f",
    "%Y-%m-%d %H:%M:%S%.3f",
    "%Y-%m-%d %H:%M:%S",
];
|
||
/// `strftime`-style layouts for space-separated timestamps that include an
/// explicit `+HH:MM` style UTC offset, with and without fractional seconds.
const FORMATS: [&str; 2] = [
    "%Y-%m-%d %H:%M:%S%.3f%:z",
    "%Y-%m-%d %H:%M:%S%:z",
];
|
||
fn datetime_to_rfc3339(val: &mut Value, default_timezone: Tz) { | ||
match val { | ||
Value::String(s) => { | ||
let mut parsed: Option<DateTime<FixedOffset>> = None; | ||
|
||
for f in FORMATS { | ||
parsed = parsed.or_else(|| | ||
chrono::DateTime::parse_from_str(&s, f).ok() | ||
) | ||
} | ||
|
||
if let Some(ts) = parsed { | ||
*s = ts.to_rfc3339_opts(SecondsFormat::AutoSi, true); | ||
} else { | ||
let mut naive_parsed: Option<DateTime<Tz>> = None; | ||
|
||
for f in NAIVE_FORMATS { | ||
naive_parsed = naive_parsed.or_else(|| | ||
chrono::NaiveDateTime::parse_from_str(&s, f).map(|d| d.and_local_timezone(default_timezone).unwrap()).ok() | ||
) | ||
} | ||
|
||
if let Some(ts) = naive_parsed { | ||
*s = ts.to_rfc3339_opts(SecondsFormat::AutoSi, true); | ||
} | ||
} | ||
} | ||
|
||
Value::Array(vec) => { | ||
vec.iter_mut().for_each(|item| { | ||
datetime_to_rfc3339(item, default_timezone) | ||
}) | ||
} | ||
|
||
Value::Object(map) => { | ||
map.iter_mut().for_each(|(_k, v)| { | ||
datetime_to_rfc3339(v, default_timezone) | ||
}) | ||
} | ||
|
||
_ => {} | ||
} | ||
} | ||
|
||
impl Iterator for DatetimeSanitizer { | ||
type Item = ParseResult; | ||
|
||
fn next(&mut self) -> Option<Self::Item> { | ||
let next = self.from.next()?; | ||
Some(match next { | ||
Ok(mut val) => { | ||
datetime_to_rfc3339(&mut val, self.default_timezone); | ||
Ok(val) | ||
} | ||
Err(e) => { | ||
Err(ParseError::Parse(Box::new(e))) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
#[derive(Debug, thiserror::Error)] | ||
pub enum DatetimeSanitizeError { | ||
#[error("could not parse timezone as a valid IANA timezone")] | ||
TimezoneParseError(String), | ||
} | ||
|
||
pub fn sanitize_datetime(config: &ParseConfig, output: Output) -> Result<Output, DatetimeSanitizeError> { | ||
let tz: Tz = config.default_timezone.parse().map_err(DatetimeSanitizeError::TimezoneParseError)?; | ||
let sanitizer = DatetimeSanitizer { | ||
from: output, | ||
default_timezone: tz, | ||
}; | ||
|
||
return Ok(Box::new(sanitizer)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
use crate::{ParseConfig, Output}; | ||
|
||
pub mod datetime; | ||
|
||
#[derive(Debug, thiserror::Error)] | ||
pub enum SanitizeError { | ||
#[error("sanitizing datetimes: {0}")] | ||
DatetimeSanitizeError(#[from] datetime::DatetimeSanitizeError), | ||
} | ||
|
||
pub fn sanitize_output(config: &ParseConfig, output: Output) -> Result<Output, SanitizeError> { | ||
datetime::sanitize_datetime(config, output).map_err(SanitizeError::DatetimeSanitizeError) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
"no_timezone", "no_timezone_fractional", "no_t", "no_t_fractional" | ||
"2020-01-01T00:00:00","2020-01-01T00:00:00.000","2020-01-01 00:00:00","2020-01-01 00:00:00.000" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
"no_timezone", "no_timezone_fractional", "rfc3339", "timezone_offset", "no_t", "no_t_fractional" | ||
"2020-01-01T00:00:00","2020-01-01T00:00:00.000","2020-01-01T00:00:00Z","2020-01-01 00:00:00+00:00","2020-01-01 00:00:00","2020-01-01 00:00:00.000" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
mod testutil; | ||
|
||
use chrono::DateTime; | ||
|
||
use parser::ParseConfig; | ||
use testutil::{input_for_file, run_test}; | ||
|
||
/// With a `UTC` default timezone, every column in the first parsed row of
/// `tests/examples/datetimes.csv` must be an RFC3339 string denoting
/// `2020-01-01T00:00:00Z`.
#[test]
fn sanitize_datetime_to_rfc3339() {
    let path = "tests/examples/datetimes.csv";
    let cfg = ParseConfig {
        filename: Some(path.to_string()),
        default_timezone: "UTC".to_string(),
        ..Default::default()
    };

    let output = run_test(&cfg, input_for_file(path));
    // Presumably checks that exactly one document was parsed — confirm in testutil.
    output.assert_success(1);

    let expected_first_row = DateTime::parse_from_rfc3339("2020-01-01T00:00:00Z").unwrap();
    let first_row = output.parsed[0].as_object().unwrap();
    first_row.values().for_each(|value| {
        let actual = DateTime::parse_from_rfc3339(value.as_str().unwrap()).unwrap();
        assert_eq!(expected_first_row, actual)
    });
}
|
||
/// With an `America/New_York` default timezone, every offset-less column in
/// the first parsed row of `tests/examples/datetimes-naive.csv` must be an
/// RFC3339 string denoting `2020-01-01T00:00:00-05:00`.
#[test]
fn sanitize_datetime_to_rfc3339_iana_timezone() {
    let path = "tests/examples/datetimes-naive.csv";
    let cfg = ParseConfig {
        filename: Some(path.to_string()),
        default_timezone: "America/New_York".to_string(),
        ..Default::default()
    };

    let output = run_test(&cfg, input_for_file(path));
    // Presumably checks that exactly one document was parsed — confirm in testutil.
    output.assert_success(1);

    let expected_first_row = DateTime::parse_from_rfc3339("2020-01-01T00:00:00-05:00").unwrap();
    let first_row = output.parsed[0].as_object().unwrap();
    first_row.values().for_each(|value| {
        let actual = DateTime::parse_from_rfc3339(value.as_str().unwrap()).unwrap();
        assert_eq!(expected_first_row, actual)
    });
}
Oops, something went wrong.