Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Convert to given time zone in .str.to_datetime when values are offset-aware #16742

Merged
merged 2 commits into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions crates/polars-core/src/chunked_array/temporal/datetime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,17 +206,31 @@ impl DatetimeChunked {
}

/// Change the underlying [`TimeUnit`]. This does not modify the data.
pub fn set_time_unit(&mut self, tu: TimeUnit) {
self.2 = Some(Datetime(tu, self.time_zone().clone()))
pub fn set_time_unit(&mut self, time_unit: TimeUnit) {
self.2 = Some(Datetime(time_unit, self.time_zone().clone()))
}

/// Change the underlying [`TimeZone`]. This does not modify the data.
/// This does not validate the time zone - it's up to the caller to verify that it's
/// already been validated.
#[cfg(feature = "timezones")]
pub fn set_time_zone(&mut self, time_zone: TimeZone) -> PolarsResult<()> {
validate_time_zone(&time_zone)?;
self.2 = Some(Datetime(self.time_unit(), Some(time_zone)));
Ok(())
}

/// Overwrite both the [`TimeUnit`] and the [`TimeZone`] stored in the dtype.
///
/// Only the dtype slot is reassigned; the underlying data is not modified.
/// The time zone is *not* validated here - it's up to the caller to make sure
/// it has already been validated.
///
/// The body has no failing path: it always returns `Ok(())`.
#[cfg(feature = "timezones")]
pub fn set_time_unit_and_time_zone(
    &mut self,
    time_unit: TimeUnit,
    time_zone: TimeZone,
) -> PolarsResult<()> {
    // Build the replacement dtype first, then swap it into the dtype slot.
    let dtype = Datetime(time_unit, Some(time_zone));
    self.2 = Some(dtype);
    Ok(())
}
}

#[cfg(test)]
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/temporal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ static FIXED_OFFSET_PATTERN: &str = r#"(?x)
static FIXED_OFFSET_RE: Lazy<Regex> = Lazy::new(|| Regex::new(FIXED_OFFSET_PATTERN).unwrap());

#[cfg(feature = "timezones")]
pub(crate) fn validate_time_zone(tz: &str) -> PolarsResult<()> {
pub fn validate_time_zone(tz: &str) -> PolarsResult<()> {
match tz.parse::<Tz>() {
Ok(_) => Ok(()),
Err(_) => {
Expand Down
3 changes: 3 additions & 0 deletions crates/polars-plan/src/dsl/function_expr/datetime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ use arrow::temporal_conversions::{MICROSECONDS, MILLISECONDS, NANOSECONDS, SECON
#[cfg(feature = "timezones")]
use chrono_tz::Tz;
#[cfg(feature = "timezones")]
use polars_core::chunked_array::temporal::validate_time_zone;
#[cfg(feature = "timezones")]
use polars_time::base_utc_offset as base_utc_offset_fn;
#[cfg(feature = "timezones")]
use polars_time::dst_offset as dst_offset_fn;
Expand Down Expand Up @@ -343,6 +345,7 @@ pub(super) fn convert_time_zone(s: &Series, time_zone: &TimeZone) -> PolarsResul
match s.dtype() {
DataType::Datetime(_, _) => {
let mut ca = s.datetime()?.clone();
validate_time_zone(time_zone)?;
ca.set_time_zone(time_zone.clone())?;
Ok(ca.into_series())
},
Expand Down
30 changes: 12 additions & 18 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,22 @@ use std::borrow::Cow;
use arrow::legacy::utils::CustomIterTools;
#[cfg(feature = "timezones")]
use once_cell::sync::Lazy;
#[cfg(feature = "regex")]
use regex::{escape, Regex};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "timezones")]
static TZ_AWARE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(%z)|(%:z)|(%::z)|(%:::z)|(%#z)|(^%\+$)").unwrap());

use polars_core::chunked_array::temporal::validate_time_zone;
use polars_core::utils::handle_casting_failures;
#[cfg(feature = "dtype-struct")]
use polars_utils::format_smartstring;
use regex::{escape, Regex};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

use super::*;
use crate::{map, map_as_slice};

#[cfg(feature = "timezones")]
static TZ_AWARE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(%z)|(%:z)|(%::z)|(%:::z)|(%#z)|(^%\+$)").unwrap());

#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub enum StringFunction {
Expand Down Expand Up @@ -652,16 +652,10 @@ fn to_datetime(
Some(format) => TZ_AWARE_RE.is_match(format),
_ => false,
};
if let (Some(tz), true) = (time_zone, tz_aware) {
if tz != "UTC" {
polars_bail!(
ComputeError:
"if using strftime/to_datetime with a time-zone-aware format, the output will be in UTC. Please either drop the time zone from the function call, or set it to UTC. \
If you are trying to convert the output to a different time zone, please use `convert_time_zone`."
)
}
};

#[cfg(feature = "timezones")]
if let Some(time_zone) = time_zone {
validate_time_zone(time_zone)?;
}
let out = if options.exact {
datetime_strings
.as_datetime(
Expand Down
21 changes: 6 additions & 15 deletions crates/polars-time/src/chunkedarray/string/infer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -452,25 +452,16 @@ pub(crate) fn to_datetime(
.find_map(|opt_val| opt_val.and_then(infer_pattern_datetime_single))
.ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;
let mut infer = DatetimeInfer::<Int64Type>::try_from_with_unit(pattern, Some(tu))?;
if pattern == Pattern::DatetimeYMDZ
&& tz.is_some()
&& tz.map(|x| x.as_str()) != Some("UTC")
{
polars_bail!(ComputeError: "offset-aware datetimes are converted to UTC. \
Please either drop the time zone from the function call, or set it to UTC. \
To convert to a different time zone, please use `convert_time_zone`.")
}
match pattern {
#[cfg(feature = "timezones")]
Pattern::DatetimeYMDZ => infer.coerce_string(ca).datetime().map(|ca| {
let mut ca = ca.clone();
ca.set_time_unit(tu);
polars_ops::prelude::replace_time_zone(
&ca,
Some("UTC"),
_ambiguous,
NonExistent::Raise,
)
// `tz` has already been validated.
ca.set_time_unit_and_time_zone(
tu,
tz.cloned().unwrap_or_else(|| "UTC".to_string()),
)?;
Ok(ca)
})?,
_ => infer.coerce_string(ca).datetime().map(|ca| {
let mut ca = ca.clone();
Expand Down
7 changes: 5 additions & 2 deletions crates/polars-time/src/chunkedarray/string/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ pub trait StringMethods: AsString {
NonExistent::Raise,
),
#[cfg(feature = "timezones")]
(true, _) => Ok(ca.into_datetime(tu, Some("UTC".to_string()))),
(true, tz) => Ok(ca.into_datetime(tu, tz.cloned().or_else(|| Some("UTC".to_string())))),
_ => Ok(ca.into_datetime(tu, None)),
}
}
Expand Down Expand Up @@ -305,7 +305,10 @@ pub trait StringMethods: AsString {
Ok(string_ca
.apply_generic(|opt_s| convert.eval(opt_s?, use_cache))
.with_name(string_ca.name())
.into_datetime(tu, Some("UTC".to_string())))
.into_datetime(
tu,
Some(tz.map(|x| x.to_string()).unwrap_or("UTC".to_string())),
))
}
#[cfg(not(feature = "timezones"))]
{
Expand Down
12 changes: 11 additions & 1 deletion py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,17 @@ def to_datetime(
`"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
found, the default is `"us"`.
time_zone
Time zone for the resulting Datetime column.
Time zone for the resulting Datetime column. Rules are:

- If inputs are tz-naive and `time_zone` is None, the result time zone is
`None`.
- If inputs are offset-aware and `time_zone` is None, inputs are converted
to `'UTC'` and the result time zone is `'UTC'`.
- If inputs are offset-aware and `time_zone` is given, inputs are converted
to `time_zone` and the result time zone is `time_zone`.
- If inputs are tz-naive and `time_zone` is given, input time zones are
replaced with (not converted to!) `time_zone`, and the result time zone
is `time_zone`.
strict
Raise an error if any conversion fails.
exact
Expand Down
12 changes: 11 additions & 1 deletion py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,17 @@ def to_datetime(
`"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
found, the default is `"us"`.
time_zone
Time zone for the resulting Datetime column.
Time zone for the resulting Datetime column. Rules are:

- If inputs are tz-naive and `time_zone` is None, the result time zone is
`None`.
- If inputs are offset-aware and `time_zone` is None, inputs are converted
to `'UTC'` and the result time zone is `'UTC'`.
- If inputs are offset-aware and `time_zone` is given, inputs are converted
to `time_zone` and the result time zone is `time_zone`.
- If inputs are tz-naive and `time_zone` is given, input time zones are
replaced with (not converted to!) `time_zone`, and the result time zone
is `time_zone`.
strict
Raise an error if any conversion fails.
exact
Expand Down
12 changes: 7 additions & 5 deletions py-polars/tests/unit/dataframe/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import polars.selectors as cs
from polars._utils.construction import iterable_to_pydf
from polars.datatypes import DTYPE_TEMPORAL_UNITS, INTEGER_DTYPES
from polars.exceptions import ComputeError, TimeZoneAwareConstructorWarning
from polars.exceptions import TimeZoneAwareConstructorWarning
from polars.testing import (
assert_frame_equal,
assert_frame_not_equal,
Expand Down Expand Up @@ -2502,10 +2502,12 @@ def test_init_vs_strptime_consistency_raises() -> None:
[datetime(2020, 1, 1, tzinfo=timezone(timedelta(hours=-8)))],
dtype=pl.Datetime("us", "US/Pacific"),
)
with pytest.raises(ComputeError, match=msg):
pl.Series(["2020-01-01 00:00-08:00"]).str.strptime(
pl.Datetime("us", "US/Pacific")
)
result = (
pl.Series(["2020-01-01 00:00-08:00"])
.str.strptime(pl.Datetime("us", "US/Pacific"))
.item()
)
assert result == datetime(2020, 1, 1, 0, 0, tzinfo=ZoneInfo(key="US/Pacific"))


def test_init_physical_with_timezone() -> None:
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/datatypes/test_temporal.py
Original file line number Diff line number Diff line change
Expand Up @@ -1205,7 +1205,7 @@ def test_strptime_with_invalid_tz() -> None:
pl.Series(["2020-01-01 03:00:00"]).str.strptime(pl.Datetime("us", "foo"))
with pytest.raises(
ComputeError,
match="Please either drop the time zone from the function call, or set it to UTC",
match="unable to parse time zone: 'foo'",
):
pl.Series(["2020-01-01 03:00:00+01:00"]).str.strptime(
pl.Datetime("us", "foo"), "%Y-%m-%d %H:%M:%S%z"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import sys
from datetime import datetime
from typing import TYPE_CHECKING

Expand All @@ -8,7 +9,16 @@
from hypothesis import given

import polars as pl
from polars.dependencies import _ZONEINFO_AVAILABLE
from polars.exceptions import ComputeError
from polars.testing import assert_series_equal

if sys.version_info >= (3, 9):
from zoneinfo import ZoneInfo
elif _ZONEINFO_AVAILABLE:
# Import from submodule due to typing issue with backports.zoneinfo package:
# https://github.com/pganssle/zoneinfo/issues/125
from backports.zoneinfo._zoneinfo import ZoneInfo

if TYPE_CHECKING:
from hypothesis.strategies import DrawFn
Expand Down Expand Up @@ -152,3 +162,34 @@ def test_cast_to_time_and_combine(d: datetime, tu: TimeUnit) -> None:
assert [d.date() for d in datetimes] == res["dt"].to_list()
assert [d.time() for d in datetimes] == res["tm"].to_list()
assert datetimes == res["dtm"].to_list()


def test_to_datetime_aware_values_aware_dtype() -> None:
    """Offset-aware strings parsed with an explicit `time_zone` are converted to it."""
    time_zone = "Asia/Kathmandu"
    fmt = "%Y-%m-%dT%H:%M:%S%z"

    s = pl.Series(["2020-01-01T01:12:34+01:00"])
    # 01:12:34+01:00 is 00:12:34 UTC; the expected wall-clock time in the target
    # zone is 05:57:34.
    naive_expected = pl.Series([datetime(2020, 1, 1, 5, 57, 34)])
    expected = naive_expected.dt.replace_time_zone(time_zone)

    # When Polars infers the format
    inferred = s.str.to_datetime(time_zone=time_zone)
    assert_series_equal(inferred, expected)

    # When the format is provided
    with_format = s.str.to_datetime(format=fmt, time_zone=time_zone)
    assert_series_equal(with_format, expected)

    # With `exact=False`
    inexact = s.str.to_datetime(format=fmt, time_zone=time_zone, exact=False)
    assert_series_equal(inexact, expected)

    # Check consistency with Series constructor
    # TODO: remove `raises`, after https://github.com/pola-rs/polars/pull/16828.
    aware_value = datetime(2020, 1, 1, 5, 57, 34, tzinfo=ZoneInfo("Asia/Kathmandu"))
    with pytest.raises(ValueError, match="Please either drop"):
        result = pl.Series([aware_value], dtype=pl.Datetime("us", "Asia/Kathmandu"))
    # TODO: uncomment, after https://github.com/pola-rs/polars/pull/16828.
    # assert_series_equal(result, expected)
11 changes: 6 additions & 5 deletions py-polars/tests/unit/operations/namespaces/test_strptime.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,11 +300,12 @@ def test_infer_tz_aware_with_utc(time_unit: TimeUnit) -> None:


def test_infer_tz_aware_raises() -> None:
stinodego marked this conversation as resolved.
Show resolved Hide resolved
msg = "Please either drop the time zone from the function call, or set it to UTC"
with pytest.raises(ComputeError, match=msg):
pl.Series(["2020-01-02T04:00:00+02:00"]).str.to_datetime(
time_unit="us", time_zone="Europe/Vienna"
)
result = (
pl.Series(["2020-01-02T04:00:00+02:00"])
.str.to_datetime(time_unit="us", time_zone="Europe/Vienna")
.item()
)
assert result == datetime(2020, 1, 2, 3, tzinfo=ZoneInfo("Europe/Vienna"))


@pytest.mark.parametrize(
Expand Down