From e05e45545af3e53dfd8caf548697b5ccf281b2dc Mon Sep 17 00:00:00 2001 From: Guanjie Shen <75445106+guanjieshen@users.noreply.github.com> Date: Mon, 7 Feb 2022 09:11:56 -0700 Subject: [PATCH] fix: typo in interpolation code (#139) * new changes * updated upsample * updated upsample * updated upsample * committing read_yaml * adding class1 with stacking * adding class1 with stacking * removing streams * removing streams * adding anomaly detection yaml support * making database configurable * making database configurable * making database configurable * added option for empty string prefix * added option for empty string prefix * added option for empty string prefix * removing anomaly detection in branch * remove anomaly detection code test file * merging resample * removing dbl tempo egg files * removing dbl tempo egg files * removing dbl tempo egg files * removing dbl tempo egg files * removing dbl tempo egg files * feat: add interpolation * feat(interpolation): add support for multiple partitions, and target columns * test: add interpolation zero fill test * test: add additional interpolation tests * chore: convert linear interpolation to use spark native functions * chore: allow for interpolation to be called directly from the TSDF object * Fourier transform functionality release Q42021 (#111) * fourier transformation functionality in tempo * fourier transform method docstrings added * fourier transform unit test added * updating readme with the fourier transform usage and the fourier function's variable naming pattern standard * Updating requirements * minor logic correction of naming the data column as 'val' * adding the corrected buildTestDF and also adding pyarrow in requirements.txt * Fourier unit test fixed and contributions information updated * data column in tests and logic is corrected with the name changed to tdval * original contribution restoration * bringing the pandas_udf method inside the calling method to ensure the reference is not lost in the 
executors * committing the correct timestep variable position * adding self to timestep * inherit timestep directly from parameter * tidying up the codebase * removed the set_timestep method as it is no longer required * removing the unnecessary orderby * adding order by inside the pandas function * removed the redundant imports * chore: update series fill logic * chore: change default behaviour for target_cols * chore: rename to be more consistent with pandas and the tsdf class * chore(interpolation): make show if interpolated column optional * chore(interpolation): remove caching * Update README.md * Troubleshooting (#2) * Refactor interpolation code to remove joins, and double `resample` * Added additional test coverage to interpolation code * Updated `test` folder structure Co-authored-by: Guanjie Shen * chore: add additional comments * chore: update branches in test.yml * fix: update interpolate_column params * chore: add interpolation details in readme.md * chore: update main readme.md * chore: update main readme.md * Merge branch 'master' of github.com:guanjieshen/tempo * chore: make readme more consistent * chore: add build and downloads badge to readme * fixing workflows * changes * fix: fourier test java error * fix: try to configure netty changes so tests for fourier will work * change * housekeeping: organize imports on tsdf.py * chore(interpolation): change back to bfill, change forward to ffill * interpolation: add the ability to call interpolate after resample * housekeeping: add missing type hint * chore(interpolate): update readme * chore: update interpolation documentation to be more clear * adding one unit test * rebase * Bump numpy from 1.19.1 to 1.21.0 (#123) Bumps [numpy](https://github.com/numpy/numpy) from 1.19.1 to 1.21.0. 
- [Release notes](https://github.com/numpy/numpy/releases) - [Changelog](https://github.com/numpy/numpy/blob/main/doc/HOWTO_RELEASE.rst.txt) - [Commits](https://github.com/numpy/numpy/compare/v1.19.1...v1.21.0) --- updated-dependencies: - dependency-name: numpy dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Bump scipy from 1.7.2 to 1.7.3 (#126) Bumps [scipy](https://github.com/scipy/scipy) from 1.7.2 to 1.7.3. - [Release notes](https://github.com/scipy/scipy/releases) - [Commits](https://github.com/scipy/scipy/compare/v1.7.2...v1.7.3) --- updated-dependencies: - dependency-name: scipy dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Bump python-dateutil from 2.8.1 to 2.8.2 (#127) Bumps [python-dateutil](https://github.com/dateutil/dateutil) from 2.8.1 to 2.8.2. - [Release notes](https://github.com/dateutil/dateutil/releases) - [Changelog](https://github.com/dateutil/dateutil/blob/master/NEWS) - [Commits](https://github.com/dateutil/dateutil/compare/2.8.1...2.8.2) --- updated-dependencies: - dependency-name: python-dateutil dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * fix license classifier (#130) * fix images for PyPI (#131) * Bump ipython from 7.28.0 to 8.0.1 (#128) Bumps [ipython](https://github.com/ipython/ipython) from 7.28.0 to 8.0.1. - [Release notes](https://github.com/ipython/ipython/releases) - [Commits](https://github.com/ipython/ipython/compare/7.28.0...8.0.1) --- updated-dependencies: - dependency-name: ipython dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Bump pyspark from 3.0.0 to 3.2.1 (#132) Bumps [pyspark](https://github.com/apache/spark) from 3.0.0 to 3.2.1. - [Release notes](https://github.com/apache/spark/releases) - [Commits](https://github.com/apache/spark/compare/v3.0.0...v3.2.1) --- updated-dependencies: - dependency-name: pyspark dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * updated requirements file (#125) * updated requirements file * adding working directory * update the packages_dir location for pypi-publish * downgrade python for pypi packaging * update look in python dist folder * removing release creation (github) * Bump pandas from 1.1.0 to 1.4.0 (#133) Bumps [pandas](https://github.com/pandas-dev/pandas) from 1.1.0 to 1.4.0. - [Release notes](https://github.com/pandas-dev/pandas/releases) - [Changelog](https://github.com/pandas-dev/pandas/blob/main/RELEASE.md) - [Commits](https://github.com/pandas-dev/pandas/compare/v1.1.0...v1.4.0) --- updated-dependencies: - dependency-name: pandas dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Bump wheel from 0.34.2 to 0.37.1 (#110) Bumps [wheel](https://github.com/pypa/wheel) from 0.34.2 to 0.37.1. - [Release notes](https://github.com/pypa/wheel/releases) - [Changelog](https://github.com/pypa/wheel/blob/main/docs/news.rst) - [Commits](https://github.com/pypa/wheel/compare/0.34.2...0.37.1) --- updated-dependencies: - dependency-name: wheel dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Bump py4j from 0.10.9 to 0.10.9.3 (#102) Bumps [py4j](https://github.com/bartdag/py4j) from 0.10.9 to 0.10.9.3. - [Release notes](https://github.com/bartdag/py4j/releases) - [Commits](https://github.com/bartdag/py4j/compare/0.10.9...0.10.9.3) --- updated-dependencies: - dependency-name: py4j dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Bump six from 1.15.0 to 1.16.0 (#88) Bumps [six](https://github.com/benjaminp/six) from 1.15.0 to 1.16.0. - [Release notes](https://github.com/benjaminp/six/releases) - [Changelog](https://github.com/benjaminp/six/blob/master/CHANGES) - [Commits](https://github.com/benjaminp/six/compare/1.15.0...1.16.0) --- updated-dependencies: - dependency-name: six dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * bump python to 3.9 (#134) * bump python to 3.9 * update text config * testing * Update requirements.txt * Update requirements.txt * Update requirements.txt * Update requirements.txt * Update requirements.txt * Update requirements.txt * set new release for license update (#135) * fix(interpolate): fix typo preventing custom ts column name; also add test to verify * chore: re-add sampled_input for interpol.py * linting: run linter on interpol.py * chore: bump up version to 0.1.5 Co-authored-by: Ricardo Portilla Co-authored-by: Guanjie Shen Co-authored-by: Souvik Pratiher <70095944+Spratiher9@users.noreply.github.com> Co-authored-by: rportilla-databricks <38080604+rportilla-databricks@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Serge Smertin <259697+nfx@users.noreply.github.com> --- python/setup.py | 2 +- python/tempo/interpol.py | 4 ++-- python/tests/interpol_tests.py | 16 ++++++++++++---- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/python/setup.py b/python/setup.py index cb1c4927..abd31735 100644 --- a/python/setup.py +++ b/python/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name='dbl-tempo', - version='0.1.4', + version='0.1.5', author='Ricardo Portilla, Tristan Nixon, Max Thone, Sonali Guleria', author_email='labs@databricks.com', description='Spark Time Series Utility Package', diff --git a/python/tempo/interpol.py b/python/tempo/interpol.py index a980ee9b..793d065b 100644 --- a/python/tempo/interpol.py +++ b/python/tempo/interpol.py @@ -294,7 +294,7 @@ def interpolate( sampled_input: DataFrame = tsdf.resample( freq=freq, func=func, metricCols=target_cols ).df - + # Fill timeseries for nearest values time_series_filled = self.__generate_time_series_fill( sampled_input, partition_cols, ts_col @@ -305,7 +305,7 @@ def interpolate( add_column_time: 
DataFrame = time_series_filled for column in target_cols: add_column_time = add_column_time.withColumn( - f"event_ts_{column}", + f"{ts_col}_{column}", when(col(column).isNull(), None).otherwise(col(ts_col)), ) add_column_time = self.__generate_column_time_fill( diff --git a/python/tests/interpol_tests.py b/python/tests/interpol_tests.py index db262f6e..96eed603 100644 --- a/python/tests/interpol_tests.py +++ b/python/tests/interpol_tests.py @@ -257,6 +257,7 @@ def test_back_fill_interpolation(self): expected_df: DataFrame = self.buildTestDF(self.expected_schema, expected_data) + actual_df: DataFrame = self.interpolate_helper.interpolate( tsdf=self.simple_input_tsdf, partition_cols=["partition_a", "partition_b"], @@ -465,17 +466,24 @@ def test_interpolation_using_custom_params(self): [ StructField("partition_a", StringType()), StructField("partition_b", StringType()), - StructField("event_ts", StringType(), False), + StructField("other_ts_col", StringType(), False), StructField("value_a", DoubleType()), StructField("is_ts_interpolated", BooleanType(), False), StructField("is_interpolated_value_a", BooleanType(), False), ] ) - expected_df: DataFrame = self.buildTestDF(expected_schema, expected_data) + # Modify input DataFrame using different ts_col + expected_df: DataFrame = self.buildTestDF(expected_schema, expected_data, ts_cols = ["other_ts_col"]) - actual_df: DataFrame = self.simple_input_tsdf.interpolate( - ts_col="event_ts", + input_tsdf = TSDF( + self.simple_input_tsdf.df.withColumnRenamed("event_ts","other_ts_col"), + partition_cols=["partition_a", "partition_b"], + ts_col="other_ts_col", + ) + + actual_df: DataFrame = input_tsdf.interpolate( + ts_col="other_ts_col", show_interpolated=True, partition_cols=["partition_a", "partition_b"], target_cols=["value_a"],