diff --git a/CHANGELOG.md b/CHANGELOG.md index 532165868..44f7324ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,7 +54,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fix `IForestOutlierTransform` failed with ignored `target` column ([#460](https://github.com/etna-team/etna/pull/460)) - Add lower limit for `typing_extension` versions ([#458](https://github.com/etna-team/etna/pull/458)) - Fix `ModelDecomposeTransform` import without `prophet` module ([#459](https://github.com/etna-team/etna/pull/459)) -- +- Convert `segment` to string during reading csv in `backtest` and `forecast` commands ([#470](https://github.com/etna-team/etna/pull/470)) - - - Fix holidays during loading datasets `traffic_2008_10T` and `traffic_2008_hourly` ([#462](https://github.com/etna-team/etna/pull/462)) diff --git a/etna/commands/backtest_command.py b/etna/commands/backtest_command.py index 4b8c3ba73..70107b94d 100644 --- a/etna/commands/backtest_command.py +++ b/etna/commands/backtest_command.py @@ -82,13 +82,13 @@ def backtest( freq_init = freq parse_dates = ["timestamp"] - df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates) + df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates, dtype={"segment": str}) df_timeseries = TSDataset.to_dataset(df_timeseries) df_exog = None k_f: Union[Literal["all"], Sequence[Any]] = () if exog_path: - df_exog = pd.read_csv(exog_path, parse_dates=parse_dates) + df_exog = pd.read_csv(exog_path, parse_dates=parse_dates, dtype={"segment": str}) df_exog = TSDataset.to_dataset(df_exog) k_f = "all" if not known_future else known_future diff --git a/etna/commands/forecast_command.py b/etna/commands/forecast_command.py index 6b03fbb25..bad93dc95 100644 --- a/etna/commands/forecast_command.py +++ b/etna/commands/forecast_command.py @@ -129,14 +129,13 @@ def forecast( freq_init = freq parse_dates = ["timestamp"] - df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates) - + df_timeseries 
@pytest.fixture
def base_timeseries_numeric_segments_path():
    """Yield the path to a temporary CSV with target data for numeric-looking segments.

    The file contains two segments, ``"01234"`` and ``"12345"``, each with
    100 daily timestamps starting at 2021-06-01 and an integer ``target``.
    The segment ``"01234"`` has a leading zero, so it only round-trips
    through CSV when the ``segment`` column is read as a string.

    Yields
    ------
    :
        Path to the temporary CSV file; the file is removed on teardown.
    """
    df = pd.DataFrame(
        {
            "timestamp": list(pd.date_range("2021-06-01", periods=100)) * 2,
            "target": np.arange(200),
            # segments with numeric names and leading zeros
            "segment": ["01234"] * 100 + ["12345"] * 100,
        }
    )
    # The context manager closes (and therefore deletes) the file even if
    # writing the CSV fails before the fixture reaches ``yield``.
    with NamedTemporaryFile("w") as tmp:
        df.to_csv(tmp, index=False)
        # Make sure the data is on disk before another reader opens the path.
        tmp.flush()
        yield Path(tmp.name)


@pytest.fixture
def base_timeseries_numeric_segments_exog_path():
    """Yield the path to a temporary CSV with exogenous data for numeric-looking segments.

    The file contains the segments ``"01234"`` and ``"12345"``, each with
    120 daily timestamps starting at 2021-06-01 and two integer columns,
    ``regressor_1`` and ``regressor_2``.

    Yields
    ------
    :
        Path to the temporary CSV file; the file is removed on teardown.
    """
    df_regressors = pd.DataFrame(
        {
            "timestamp": list(pd.date_range("2021-06-01", periods=120)) * 2,
            "regressor_1": np.arange(240),
            "regressor_2": np.arange(240) + 5,
            # segments with numeric names and leading zeros
            "segment": ["01234"] * 120 + ["12345"] * 120,
        }
    )
    # The context manager closes (and therefore deletes) the file even if
    # writing the CSV fails before the fixture reaches ``yield``.
    with NamedTemporaryFile("w") as tmp:
        df_regressors.to_csv(tmp, index=False)
        # Make sure the data is on disk before another reader opens the path.
        tmp.flush()
        yield Path(tmp.name)
def test_backtest_with_numeric_segments(
    base_pipeline_yaml_path,
    base_backtest_yaml_path,
    base_timeseries_numeric_segments_path,
):
    """Check that the ``backtest`` command keeps numeric-looking segment names intact."""
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    # Use the directory as a context manager so it is removed when the test
    # finishes, whether it passes or fails.
    with TemporaryDirectory() as tmp_output:
        tmp_output_path = Path(tmp_output)
        run(
            [
                "etna",
                "backtest",
                str(base_pipeline_yaml_path),
                str(base_backtest_yaml_path),
                str(base_timeseries_numeric_segments_path),
                "D",
                str(tmp_output_path),
            ]
        )
        # Read segments as strings; otherwise pandas would strip leading zeros.
        df_forecast = pd.read_csv(tmp_output_path / "forecast.csv", dtype={"segment": str})

    output_segments = df_forecast["segment"].unique()
    assert set(segments) == set(output_segments)


def test_backtest_with_numeric_segments_with_exog(
    base_pipeline_yaml_path,
    base_backtest_yaml_path,
    base_timeseries_numeric_segments_path,
    base_timeseries_numeric_segments_exog_path,
):
    """Check that ``backtest`` with exogenous data keeps numeric-looking segment names intact."""
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    # Use the directory as a context manager so it is removed when the test
    # finishes, whether it passes or fails.
    with TemporaryDirectory() as tmp_output:
        tmp_output_path = Path(tmp_output)
        run(
            [
                "etna",
                "backtest",
                str(base_pipeline_yaml_path),
                str(base_backtest_yaml_path),
                str(base_timeseries_numeric_segments_path),
                "D",
                str(tmp_output_path),
                str(base_timeseries_numeric_segments_exog_path),
            ]
        )
        # Read segments as strings; otherwise pandas would strip leading zeros.
        df_forecast = pd.read_csv(tmp_output_path / "forecast.csv", dtype={"segment": str})

    output_segments = df_forecast["segment"].unique()
    assert set(segments) == set(output_segments)
def test_forecast_with_numeric_segments(
    base_pipeline_yaml_path,
    base_timeseries_numeric_segments_path,
):
    """Check that the ``forecast`` command keeps numeric-looking segment names intact."""
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    # The temporary file only reserves an output path for the command; the
    # context manager closes and deletes it once the result has been read.
    with NamedTemporaryFile("w") as tmp_output:
        tmp_output_path = Path(tmp_output.name)
        run(
            [
                "etna",
                "forecast",
                str(base_pipeline_yaml_path),
                str(base_timeseries_numeric_segments_path),
                "D",
                str(tmp_output_path),
            ],
        )
        # Read segments as strings; otherwise pandas would strip leading zeros.
        df_output = pd.read_csv(tmp_output_path, dtype={"segment": str})

    output_segments = df_output["segment"].unique()
    assert set(segments) == set(output_segments)


@pytest.mark.parametrize(
    "pipeline_path_name",
    ("base_pipeline_yaml_path", "base_ensemble_yaml_path"),
)
def test_forecast_with_numeric_segments_with_exog(
    pipeline_path_name,
    base_timeseries_numeric_segments_path,
    base_timeseries_numeric_segments_exog_path,
    request,
):
    """Check that ``forecast`` with exogenous data keeps numeric-looking segment names intact.

    The pipeline config is chosen by fixture name, so the same check runs
    for each parametrized pipeline fixture.
    """
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()
    pipeline_path = request.getfixturevalue(pipeline_path_name)

    # The temporary file only reserves an output path for the command; the
    # context manager closes and deletes it once the result has been read.
    with NamedTemporaryFile("w") as tmp_output:
        tmp_output_path = Path(tmp_output.name)
        run(
            [
                "etna",
                "forecast",
                str(pipeline_path),
                str(base_timeseries_numeric_segments_path),
                "D",
                str(tmp_output_path),
                str(base_timeseries_numeric_segments_exog_path),
            ],
        )
        # Read segments as strings; otherwise pandas would strip leading zeros.
        df_output = pd.read_csv(tmp_output_path, dtype={"segment": str})

    output_segments = df_output["segment"].unique()
    assert set(segments) == set(output_segments)