diff --git a/etna/datasets/internal_datasets.py b/etna/datasets/internal_datasets.py index 6fcd12a33..fdf367efb 100644 --- a/etna/datasets/internal_datasets.py +++ b/etna/datasets/internal_datasets.py @@ -347,6 +347,7 @@ def read_data(path: Path, part: str) -> np.ndarray: targets = np.concatenate([targets_train, targets_test], axis=0) targets = targets[np.argsort(ts_indecies)].reshape(-1, 963) + # federal holidays and days with anomalies drop_days = [ date(2008, 1, 1), date(2008, 1, 21), diff --git a/tests/test_datasets/test_internal_datasets.py b/tests/test_datasets/test_internal_datasets.py index a9761a343..de96d9992 100644 --- a/tests/test_datasets/test_internal_datasets.py +++ b/tests/test_datasets/test_internal_datasets.py @@ -81,189 +81,181 @@ def test_not_present_part(): @pytest.mark.parametrize( "dataset_name, expected_shape, expected_min_timestamp, expected_max_timestamp, dataset_parts", [ - # pytest.param( - # "electricity_15T", - # (139896 + 360, 370), - # pd.to_datetime("2011-01-01 00:15:00"), - # pd.to_datetime("2015-01-01 00:00:00"), - # ("train", "test"), - # marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), - # ), - # ( - # "m4_hourly", - # (960 + 48, 414), - # 0, - # 1007, - # ("train", "test"), - # ), - # pytest.param( - # "m4_daily", - # (9919 + 14, 4227), - # 0, - # 9932, - # ("train", "test"), - # marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), - # ), - # ( - # "m4_weekly", - # (2597 + 13, 359), - # 0, - # 2609, - # ("train", "test"), - # ), - # pytest.param( - # "m4_monthly", - # (2794 + 18, 48000), - # 0, - # 2811, - # ("train", "test"), - # marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), - # ), - # ( - # "m4_quarterly", - # (866 + 8, 24000), - # 0, - # 873, - # ("train", "test"), - # ), - # ( - # "m4_yearly", - # (835 + 6, 23000), - # 0, - # 840, - # ("train", "test"), - # ), - # pytest.param( - # "traffic_2008_10T", - # (65376 + 144, 963), - # pd.to_datetime("2008-01-01 00:00:00"), - # pd.to_datetime("2009-03-30 23:50:00"), - # ("train", "test"), - # marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), - # ), + pytest.param( + "electricity_15T", + (139896 + 360, 370), + pd.to_datetime("2011-01-01 00:15:00"), + pd.to_datetime("2015-01-01 00:00:00"), + ("train", "test"), + marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), + ), + ( + "m4_hourly", + (960 + 48, 414), + 0, + 1007, + ("train", "test"), + ), + pytest.param( + "m4_daily", + (9919 + 14, 4227), + 0, + 9932, + ("train", "test"), + marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), + ), + ( + "m4_weekly", + (2597 + 13, 359), + 0, + 2609, + ("train", "test"), + ), + pytest.param( + "m4_monthly", + (2794 + 18, 48000), + 0, + 2811, + ("train", "test"), + marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), + ), + ( + "m4_quarterly", + (866 + 8, 24000), + 0, + 873, + ("train", "test"), + ), ( + "m4_yearly", + (835 + 6, 23000), + 0, + 840, + ("train", "test"), + ), + pytest.param( + "traffic_2008_10T", + (65376 + 144, 963), + pd.to_datetime("2008-01-01 00:00:00"), + pd.to_datetime("2009-03-30 23:50:00"), + ("train", "test"), + marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), + ), + pytest.param( "traffic_2008_hourly", (10896 + 24, 963), pd.to_datetime("2008-01-01 00:00:00"), pd.to_datetime("2009-03-30 23:00:00"), ("train", "test"), + marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), + ), + pytest.param( + "traffic_2015_hourly", + (17520 + 24, 862), + pd.to_datetime("2015-01-01 00:00:00"), + pd.to_datetime("2016-12-31 23:00:00"), + ("train", "test"), + marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), + ), + ( + "m3_monthly", + (126 + 18, 2856), + 0, + 143, + ("train", "test"), + ), + ( + "m3_quarterly", + (64 + 8, 1512), + 0, + 71, + ("train", "test"), + ), + ( + "m3_other", + (96 + 8, 348), + 0, + 103, + ("train", "test"), + ), + ( + "m3_yearly", + (41 + 6, 1290), + 0, + 46, + ("train", "test"), + ), + ( + "tourism_monthly", + (309 + 24, 732), + 0, + 332, + ("train", "test"), + ), + ( + "tourism_quarterly", + (122 + 8, 854), + 0, + 129, + ("train", "test"), + ), + ( + "tourism_yearly", + (43 + 4, 1036), + 0, + 46, + ("train", "test"), + ), + ( + "weather_10T", + (52560 + 144, 21), + pd.to_datetime("2020-01-01 00:10:00"), + pd.to_datetime("2021-01-01 00:00:00"), + ("train", "test"), + ), + ( + "ETTm1", + (66800 + 2880, 7), + pd.to_datetime("2016-07-01 00:00:00"), + pd.to_datetime("2018-06-26 19:45:00"), + ("train", "test"), + ), + ( + "ETTm2", + (66800 + 2880, 7), + pd.to_datetime("2016-07-01 00:00:00"), + pd.to_datetime("2018-06-26 19:45:00"), + ("train", "test"), + ), + ( + "ETTh1", + (16700 + 720, 7), + pd.to_datetime("2016-07-01 00:00:00"), + pd.to_datetime("2018-06-26 19:00:00"), + ("train", "test"), + ), + ( + "ETTh2", + (16700 + 720, 7), + pd.to_datetime("2016-07-01 00:00:00"), + pd.to_datetime("2018-06-26 19:00:00"), + ("train", "test"), + ), + pytest.param( + "IHEPC_T", + (2075259, 7), + pd.to_datetime("2006-12-16 17:24:00"), + pd.to_datetime("2010-11-26 21:02:00"), + tuple(), + marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), + ), + ( + "australian_wine_sales_monthly", + (176, 1), + pd.to_datetime("1980-01-01 00:00:00"), + pd.to_datetime("1994-08-01 00:00:00"), + tuple(), ), - # TODO: revert - # pytest.param( - # "traffic_2008_hourly", - # (10896 + 24, 963), - # pd.to_datetime("2008-01-01 00:00:00"), - # pd.to_datetime("2009-03-30 23:00:00"), - # ("train", "test"), - # marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), - # ), - # pytest.param( - # "traffic_2015_hourly", - # (17520 + 24, 862), - # pd.to_datetime("2015-01-01 00:00:00"), - # pd.to_datetime("2016-12-31 23:00:00"), - # ("train", "test"), - # marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), - # ), - # ( - # "m3_monthly", - # (126 + 18, 2856), - # 0, - # 143, - # ("train", "test"), - # ), - # ( - # "m3_quarterly", - # (64 + 8, 1512), - # 0, - # 71, - # ("train", "test"), - # ), - # ( - # "m3_other", - # (96 + 8, 348), - # 0, - # 103, - # ("train", "test"), - # ), - # ( - # "m3_yearly", - # (41 + 6, 1290), - # 0, - # 46, - # ("train", "test"), - # ), - # ( - # "tourism_monthly", - # (309 + 24, 732), - # 0, - # 332, - # ("train", "test"), - # ), - # ( - # "tourism_quarterly", - # (122 + 8, 854), - # 0, - # 129, - # ("train", "test"), - # ), - # ( - # "tourism_yearly", - # (43 + 4, 1036), - # 0, - # 46, - # ("train", "test"), - # ), - # ( - # "weather_10T", - # (52560 + 144, 21), - # pd.to_datetime("2020-01-01 00:10:00"), - # pd.to_datetime("2021-01-01 00:00:00"), - # ("train", "test"), - # ), - # ( - # "ETTm1", - # (66800 + 2880, 7), - # pd.to_datetime("2016-07-01 00:00:00"), - # pd.to_datetime("2018-06-26 19:45:00"), - # ("train", "test"), - # ), - # ( - # "ETTm2", - # (66800 + 2880, 7), - # pd.to_datetime("2016-07-01 00:00:00"), - # pd.to_datetime("2018-06-26 19:45:00"), - # ("train", "test"), - # ), - # ( - # "ETTh1", - # (16700 + 720, 7), - # pd.to_datetime("2016-07-01 00:00:00"), - # pd.to_datetime("2018-06-26 19:00:00"), - # ("train", "test"), - # ), - # ( - # "ETTh2", - # (16700 + 720, 7), - # pd.to_datetime("2016-07-01 00:00:00"), - # pd.to_datetime("2018-06-26 19:00:00"), - # ("train", "test"), - # ), - # pytest.param( - # "IHEPC_T", - # (2075259, 7), - # pd.to_datetime("2006-12-16 17:24:00"), - # pd.to_datetime("2010-11-26 21:02:00"), - # tuple(), - # marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), - # ), - # ( - # "australian_wine_sales_monthly", - # (176, 1), - # pd.to_datetime("1980-01-01 00:00:00"), - # pd.to_datetime("1994-08-01 00:00:00"), - # tuple(), - # ), ], ) def test_dataset_statistics(