Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix reading CSV in commands #470

Merged
merged 8 commits into from
Sep 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fix `IForestOutlierTransform` failed with ignored `target` column ([#460](https://github.com/etna-team/etna/pull/460))
- Add lower limit for `typing_extension` versions ([#458](https://github.com/etna-team/etna/pull/458))
- Fix `ModelDecomposeTransform` import without `prophet` module ([#459](https://github.com/etna-team/etna/pull/459))
-
- Convert `segment` to string during reading csv in `backtest` and `forecast` commands ([#470](https://github.com/etna-team/etna/pull/470))
-
-
- Fix holidays during loading datasets `traffic_2008_10T` and `traffic_2008_hourly` ([#462](https://github.com/etna-team/etna/pull/462))
Expand Down
4 changes: 2 additions & 2 deletions etna/commands/backtest_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,13 @@ def backtest(
freq_init = freq
parse_dates = ["timestamp"]

df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates)
df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates, dtype={"segment": str})
d-a-bunin marked this conversation as resolved.
Show resolved Hide resolved
df_timeseries = TSDataset.to_dataset(df_timeseries)

df_exog = None
k_f: Union[Literal["all"], Sequence[Any]] = ()
if exog_path:
df_exog = pd.read_csv(exog_path, parse_dates=parse_dates)
df_exog = pd.read_csv(exog_path, parse_dates=parse_dates, dtype={"segment": str})
df_exog = TSDataset.to_dataset(df_exog)
k_f = "all" if not known_future else known_future

Expand Down
5 changes: 2 additions & 3 deletions etna/commands/forecast_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,14 +129,13 @@ def forecast(
freq_init = freq
parse_dates = ["timestamp"]

df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates)

df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates, dtype={"segment": str})
d-a-bunin marked this conversation as resolved.
Show resolved Hide resolved
df_timeseries = TSDataset.to_dataset(df_timeseries)

df_exog = None
k_f: Union[Literal["all"], Sequence[Any]] = ()
if exog_path:
df_exog = pd.read_csv(exog_path, parse_dates=parse_dates)
df_exog = pd.read_csv(exog_path, parse_dates=parse_dates, dtype={"segment": str})
df_exog = TSDataset.to_dataset(df_exog)
k_f = "all" if not known_future else known_future

Expand Down
35 changes: 35 additions & 0 deletions tests/test_commands/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,3 +232,38 @@ def small_ts():
df = pd.DataFrame({"segment": ["segment_0"], "timestamp": [pd.Timestamp("2020-01-01")], "target": [1]})
df = TSDataset.to_dataset(df=df)
return TSDataset(df=df, freq="D")


@pytest.fixture
def base_timeseries_numeric_segments_path():
    """Yield the path to a temporary CSV holding a target series for two digit-only segments.

    The segment names are strings made only of digits, one of them starting
    with a zero (``"01234"``), so a reader that infers a numeric dtype for the
    ``segment`` column would not reproduce the names as written.

    Yields
    ------
    :
        Path to the temporary CSV file; the file is closed on fixture teardown.
    """
    df = pd.DataFrame(
        {
            "timestamp": list(pd.date_range("2021-06-01", periods=100)) * 2,
            "target": np.arange(200),
            # segments with numeric names and leading zeros
            "segment": ["01234"] * 100 + ["12345"] * 100,
        }
    )
    # The context manager closes the temporary file even if writing the CSV raises,
    # instead of relying on a trailing ``close()`` that is skipped on errors.
    with NamedTemporaryFile("w") as tmp:
        df.to_csv(tmp, index=False)
        # Flush buffered data so the content is visible when the file is re-opened by path.
        tmp.flush()
        yield Path(tmp.name)


@pytest.fixture
def base_timeseries_numeric_segments_exog_path():
    """Yield the path to a temporary CSV holding two regressors for two digit-only segments.

    Each segment gets 120 timestamps starting at 2021-06-01. The segment names
    are strings made only of digits, one of them starting with a zero
    (``"01234"``).

    Yields
    ------
    :
        Path to the temporary CSV file; the file is closed on fixture teardown.
    """
    df_regressors = pd.DataFrame(
        {
            "timestamp": list(pd.date_range("2021-06-01", periods=120)) * 2,
            "regressor_1": np.arange(240),
            "regressor_2": np.arange(240) + 5,
            # segments with numeric names and leading zeros
            "segment": ["01234"] * 120 + ["12345"] * 120,
        }
    )
    # The context manager closes the temporary file even if writing the CSV raises,
    # instead of relying on a trailing ``close()`` that is skipped on errors.
    with NamedTemporaryFile("w") as tmp:
        df_regressors.to_csv(tmp, index=False)
        # Flush buffered data so the content is visible when the file is re-opened by path.
        tmp.flush()
        yield Path(tmp.name)
54 changes: 54 additions & 0 deletions tests/test_commands/test_backtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,57 @@ def test_backtest_estimate_n_folds(
)
forecast_df = pd.read_csv(tmp_output_path / "forecast.csv")
assert forecast_df["fold_number"].nunique() == expected


def test_backtest_with_numeric_segments(
    base_pipeline_yaml_path,
    base_backtest_yaml_path,
    base_timeseries_numeric_segments_path,
):
    """Check that the ``backtest`` command writes digit-only segment names unchanged to ``forecast.csv``."""
    # Read ``segment`` as a string so the expected names keep their leading zeros.
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    # The context manager removes the output directory deterministically,
    # including when the command or the read below fails.
    with TemporaryDirectory() as tmp_output:
        tmp_output_path = Path(tmp_output)
        run(
            [
                "etna",
                "backtest",
                str(base_pipeline_yaml_path),
                str(base_backtest_yaml_path),
                str(base_timeseries_numeric_segments_path),
                "D",
                str(tmp_output_path),
            ]
        )
        # Load the result before the directory is removed.
        df_forecast = pd.read_csv(tmp_output_path / "forecast.csv", dtype={"segment": str})

    output_segments = df_forecast["segment"].unique()
    assert set(segments) == set(output_segments)


def test_backtest_with_numeric_segments_with_exog(
    base_pipeline_yaml_path,
    base_backtest_yaml_path,
    base_timeseries_numeric_segments_path,
    base_timeseries_numeric_segments_exog_path,
):
    """Check that ``backtest`` with an exogenous CSV writes digit-only segment names unchanged."""
    # Read ``segment`` as a string so the expected names keep their leading zeros.
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    # The context manager removes the output directory deterministically,
    # including when the command or the read below fails.
    with TemporaryDirectory() as tmp_output:
        tmp_output_path = Path(tmp_output)
        run(
            [
                "etna",
                "backtest",
                str(base_pipeline_yaml_path),
                str(base_backtest_yaml_path),
                str(base_timeseries_numeric_segments_path),
                "D",
                str(tmp_output_path),
                str(base_timeseries_numeric_segments_exog_path),
            ]
        )
        # Load the result before the directory is removed.
        df_forecast = pd.read_csv(tmp_output_path / "forecast.csv", dtype={"segment": str})

    output_segments = df_forecast["segment"].unique()
    assert set(segments) == set(output_segments)
56 changes: 56 additions & 0 deletions tests/test_commands/test_forecast.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,3 +409,59 @@ def test_forecast_with_estimate_n_folds(

assert all(x in df_output.columns for x in ["target_0.025", "target_0.975"])
assert len(df_output) == 4 * 2 # 4 predictions for 2 segments


def test_forecast_with_numeric_segments(
    base_pipeline_yaml_path,
    base_timeseries_numeric_segments_path,
):
    """Check that the ``forecast`` command writes digit-only segment names unchanged to its output CSV."""
    # Read ``segment`` as a string so the expected names keep their leading zeros.
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    # The context manager closes the temporary output file deterministically,
    # including when the command or the read below fails.
    with NamedTemporaryFile("w") as tmp_output:
        tmp_output_path = Path(tmp_output.name)
        run(
            [
                "etna",
                "forecast",
                str(base_pipeline_yaml_path),
                str(base_timeseries_numeric_segments_path),
                "D",
                str(tmp_output_path),
            ],
        )
        # Load the result while the temporary file still exists.
        df_output = pd.read_csv(tmp_output_path, dtype={"segment": str})

    output_segments = df_output["segment"].unique()
    assert set(segments) == set(output_segments)


@pytest.mark.parametrize(
    "pipeline_path_name",
    ("base_pipeline_yaml_path", "base_ensemble_yaml_path"),
)
def test_forecast_with_numeric_segments_with_exog(
    pipeline_path_name,
    base_timeseries_numeric_segments_path,
    base_timeseries_numeric_segments_exog_path,
    request,
):
    """Check that ``forecast`` with an exogenous CSV writes digit-only segment names unchanged.

    The test runs once per pipeline config fixture named in the parametrization.
    """
    # Read ``segment`` as a string so the expected names keep their leading zeros.
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    # The fixture is resolved by name because it is selected through parametrization.
    pipeline_path = request.getfixturevalue(pipeline_path_name)

    # The context manager closes the temporary output file deterministically,
    # including when the command or the read below fails.
    with NamedTemporaryFile("w") as tmp_output:
        tmp_output_path = Path(tmp_output.name)
        run(
            [
                "etna",
                "forecast",
                str(pipeline_path),
                str(base_timeseries_numeric_segments_path),
                "D",
                str(tmp_output_path),
                str(base_timeseries_numeric_segments_exog_path),
            ],
        )
        # Load the result while the temporary file still exists.
        df_output = pd.read_csv(tmp_output_path, dtype={"segment": str})

    output_segments = df_output["segment"].unique()
    assert set(segments) == set(output_segments)
Loading