Skip to content

Commit

Permalink
fixing unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
manu-sj committed Jul 3, 2024
1 parent c020210 commit e87331e
Show file tree
Hide file tree
Showing 17 changed files with 221 additions and 256 deletions.
19 changes: 1 addition & 18 deletions python/hsfs/core/training_dataset_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
from hsfs.core import (
tags_api,
training_dataset_api,
transformation_function_engine,
)


Expand All @@ -38,11 +37,6 @@ def __init__(self, feature_store_id):
feature_store_id
)
self._tags_api = tags_api.TagsApi(feature_store_id, self.ENTITY_TYPE)
self._transformation_function_engine = (
transformation_function_engine.TransformationFunctionEngine(
feature_store_id
)
)

def save(self, training_dataset, features, user_write_options):
if isinstance(features, query.Query):
Expand All @@ -53,9 +47,6 @@ def save(self, training_dataset, features, user_write_options):
)
for label_name in training_dataset.label
]
self._transformation_function_engine.attach_transformation_fn(
training_dataset
)
else:
features = engine.get_instance().convert_to_default_dataframe(features)
training_dataset._features = (
Expand All @@ -66,19 +57,11 @@ def save(self, training_dataset, features, user_write_options):
if feature.name == label_name:
feature.label = True

# check if user provided transformation functions and throw error as transformation functions work only
# with query objects
if training_dataset.transformation_functions:
raise ValueError(
"Transformation functions can only be applied to training datasets generated from Query object"
)

if len(training_dataset.splits) > 0 and training_dataset.train_split is None:
training_dataset.train_split = "train"
warnings.warn(
"Training dataset splits were defined but no `train_split` (the name of the split that is going to be "
"used for training) was provided. Setting this property to `train`. The statistics of this "
"split will be used for transformation functions.",
"used for training) was provided. Setting this property to `train`. ",
stacklevel=1,
)

Expand Down
2 changes: 1 addition & 1 deletion python/hsfs/engine/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -966,7 +966,7 @@ def get_training_data(
# training_dataset_obj, feature_view_obj, training_dataset_version
# )
return self._apply_transformation_function(
training_dataset_obj.transformation_functions, df
feature_view_obj.transformation_functions, df
)

def split_labels(
Expand Down
24 changes: 19 additions & 5 deletions python/hsfs/hopsworks_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ def _get_output_column_names(self) -> str:
"""
if self._udf_type == UDFType.MODEL_DEPENDENT:
_BASE_COLUMN_NAME = (
f'{self.function_name}_{"-".join(self.transformation_features)}_'
f'{self.function_name}_{"_".join(self.transformation_features)}_'
)
if len(self.return_types) > 1:
return [
Expand Down Expand Up @@ -655,10 +655,14 @@ def from_response_json(
transformation_features = [
feature.strip() for feature in json_decamelized["transformation_features"]
]
dropped_features = [
dropped_feature.strip()
for dropped_feature in json_decamelized["dropped_features"]
]
dropped_features = (
[
dropped_feature.strip()
for dropped_feature in json_decamelized["dropped_features"]
]
if "dropped_features" in json_decamelized
else None
)
statistics_features = (
[
feature.strip()
Expand All @@ -671,6 +675,16 @@ def from_response_json(
# Reconstructing statistics arguments.
arg_list, _, _, _ = HopsworksUdf._parse_function_signature(function_source_code)

transformation_features = (
arg_list if not transformation_features else transformation_features
)

if dropped_features:
dropped_features = [
transformation_features[arg_list.index(dropped_feature)]
for dropped_feature in dropped_features
]

if statistics_features:
transformation_features = [
TransformationFeature(
Expand Down
19 changes: 0 additions & 19 deletions python/hsfs/training_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
statistics_engine,
training_dataset_api,
training_dataset_engine,
transformation_function_engine,
vector_server,
)
from hsfs.statistics_config import StatisticsConfig
Expand Down Expand Up @@ -538,7 +537,6 @@ def __init__(
from_query=None,
querydto=None,
label=None,
transformation_functions=None,
train_split=None,
time_split_size=None,
extra_filter=None,
Expand Down Expand Up @@ -580,7 +578,6 @@ def __init__(
self._querydto = querydto
self._feature_store_id = featurestore_id
self._feature_store_name = featurestore_name
self._transformation_functions = transformation_functions

self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
featurestore_id
Expand All @@ -592,9 +589,6 @@ def __init__(
featurestore_id, self.ENTITY_TYPE
)
self._code_engine = code_engine.CodeEngine(featurestore_id, self.ENTITY_TYPE)
self._transformation_function_engine = (
transformation_function_engine.TransformationFunctionEngine(featurestore_id)
)
self._vector_server = vector_server.VectorServer(
featurestore_id, features=self._features
)
Expand Down Expand Up @@ -1084,19 +1078,6 @@ def feature_store_name(self) -> str:
"""Name of the feature store in which the feature group is located."""
return self._feature_store_name

@property
def transformation_functions(self):
"""Set transformation functions."""
if self._id is not None and self._transformation_functions is None:
self._transformation_functions = (
self._transformation_function_engine.get_td_transformation_fn(self)
)
return self._transformation_functions

@transformation_functions.setter
def transformation_functions(self, transformation_functions):
self._transformation_functions = transformation_functions

@property
def serving_keys(self) -> Set[str]:
"""Set of primary key names that is used as keys in input dict object for `get_serving_vector` method."""
Expand Down
76 changes: 0 additions & 76 deletions python/tests/core/test_feature_view_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,7 @@
from hsfs.constructor.query import Query
from hsfs.core import arrow_flight_client, feature_view_engine
from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics
from hsfs.hopsworks_udf import udf
from hsfs.storage_connector import BigQueryConnector, StorageConnector
from hsfs.transformation_function import TransformationFunction


engine.init("python")
Expand Down Expand Up @@ -349,9 +347,6 @@ def test_get_name(self, mocker):
feature_store_id = 99

mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi")
mocker.patch(
"hsfs.core.feature_view_engine.FeatureViewEngine.get_attached_transformation_fn"
)

fv_engine = feature_view_engine.FeatureViewEngine(
feature_store_id=feature_store_id
Expand Down Expand Up @@ -387,9 +382,6 @@ def test_get_name_version(self, mocker):
feature_store_id = 99

mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi")
mocker.patch(
"hsfs.core.feature_view_engine.FeatureViewEngine.get_attached_transformation_fn"
)

fv_engine = feature_view_engine.FeatureViewEngine(
feature_store_id=feature_store_id
Expand Down Expand Up @@ -555,74 +547,6 @@ def test_get_batch_query_string_pit_query(self, mocker):
assert mock_fv_api.return_value.get_batch_query.call_count == 1
assert mock_qc_api.return_value.construct_query.call_count == 1

def test_get_attached_transformation_fn(self, mocker):
    """A single transformation function attached to a feature view is
    fetched through the FeatureViewApi and returned unchanged."""
    # Arrange: stub out the API layer before constructing the engine.
    fs_id = 99
    api_mock = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi")

    engine_under_test = feature_view_engine.FeatureViewEngine(
        feature_store_id=fs_id
    )

    @udf(int)
    def test2(col1):
        return col1 + 1

    attached_fn = TransformationFunction(
        featurestore_id=10,
        hopsworks_udf=test2,
    )

    # The backend reports exactly one attached function.
    api_mock.return_value.get_attached_transformation_fn.return_value = [attached_fn]

    # Act
    result = engine_under_test.get_attached_transformation_fn(name="fv_name", version=1)

    # Assert: the engine forwards the API result and calls the API once.
    assert result == [attached_fn]
    assert api_mock.return_value.get_attached_transformation_fn.call_count == 1

def test_get_attached_transformation_fn_multiple(self, mocker):
    """When several transformation functions are attached, the engine
    returns all of them in the order reported by the API."""
    # Arrange: stub out the API layer before constructing the engine.
    fs_id = 99
    api_mock = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi")

    engine_under_test = feature_view_engine.FeatureViewEngine(
        feature_store_id=fs_id
    )

    @udf(int)
    def test1(col1):
        return col1 + 1

    @udf(int)
    def test2(col1):
        return col1 + 2

    first_fn = TransformationFunction(
        featurestore_id=10,
        hopsworks_udf=test1,
    )
    second_fn = TransformationFunction(
        featurestore_id=10,
        hopsworks_udf=test2,
    )

    # The backend reports two attached functions.
    api_mock.return_value.get_attached_transformation_fn.return_value = [
        first_fn,
        second_fn,
    ]

    # Act
    result = engine_under_test.get_attached_transformation_fn(name="fv_name", version=1)

    # Assert: both functions come back, ordering preserved, single API call.
    assert result == [first_fn, second_fn]
    assert api_mock.return_value.get_attached_transformation_fn.call_count == 1

def test_create_training_dataset(self, mocker):
# Arrange
feature_store_id = 99
Expand Down
72 changes: 2 additions & 70 deletions python/tests/core/test_training_dataset_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,13 @@
# limitations under the License.
#

import pytest
from hsfs import (
feature_group,
training_dataset,
training_dataset_feature,
transformation_function,
)
from hsfs.constructor import query
from hsfs.core import training_dataset_engine
from hsfs.hopsworks_udf import udf


class TestTrainingDatasetEngine:
Expand All @@ -32,9 +29,6 @@ def test_save(self, mocker):
feature_store_id = 99

mocker.patch("hsfs.client.get_instance")
mocker.patch(
"hsfs.core.transformation_function_engine.TransformationFunctionEngine"
)
mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance")
mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi")

Expand Down Expand Up @@ -76,9 +70,6 @@ def test_save_query(self, mocker, backend_fixtures):
mocker.patch("hsfs.client.get_instance")
mocker.patch("hsfs.engine.get_type")

mocker.patch(
"hsfs.core.transformation_function_engine.TransformationFunctionEngine"
)
mocker.patch("hsfs.engine.get_instance")
mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi")

Expand Down Expand Up @@ -107,70 +98,12 @@ def test_save_query(self, mocker, backend_fixtures):
assert td._features[0].label is True
assert td._features[1].label is True

def test_save_transformation_functions(self, mocker):
    """Saving a dataframe-based training dataset that carries
    transformation functions must raise a ValueError (transformations are
    only supported for Query-based training datasets) and must not hit the
    backend API."""
    # Arrange
    fs_id = 99

    mocker.patch("hsfs.client.get_instance")
    mocker.patch(
        "hsfs.core.transformation_function_engine.TransformationFunctionEngine"
    )
    engine_mock = mocker.patch("hsfs.engine.get_instance")
    api_mock = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi")

    @udf(int)
    def plus_one(a):
        return a + 1

    tf = transformation_function.TransformationFunction(
        hopsworks_udf=plus_one, featurestore_id=99
    )

    # Only "f" exists in the schema; "f_wrong" should simply not match.
    td = training_dataset.TrainingDataset(
        name="test",
        version=1,
        data_format="CSV",
        featurestore_id=fs_id,
        splits={},
        label=["f", "f_wrong"],
        transformation_functions=tf,
    )

    engine_under_test = training_dataset_engine.TrainingDatasetEngine(fs_id)

    schema = [
        training_dataset_feature.TrainingDatasetFeature(
            name="f", type="str", label=False
        ),
        training_dataset_feature.TrainingDatasetFeature(
            name="f1", type="int", label=False
        ),
    ]
    engine_mock.return_value.parse_schema_training_dataset.return_value = schema

    # Act
    with pytest.raises(ValueError) as e_info:
        engine_under_test.save(training_dataset=td, features=None, user_write_options=None)

    # Assert: no backend call, labels flagged only where names matched,
    # and the expected error surfaced.
    assert api_mock.return_value.post.call_count == 0
    assert len(td._features) == 2
    assert td._features[0].label is True
    assert td._features[1].label is False
    assert (
        str(e_info.value)
        == "Transformation functions can only be applied to training datasets generated from Query object"
    )

def test_save_splits(self, mocker):
# Arrange
feature_store_id = 99

mocker.patch("hsfs.client.get_instance")
mocker.patch(
"hsfs.core.transformation_function_engine.TransformationFunctionEngine"
)

mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance")
mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi")
mock_warning = mocker.patch("warnings.warn")
Expand Down Expand Up @@ -209,8 +142,7 @@ def test_save_splits(self, mocker):
assert (
mock_warning.call_args[0][0]
== "Training dataset splits were defined but no `train_split` (the name of the split that is going to be "
"used for training) was provided. Setting this property to `train`. The statistics of this "
"split will be used for transformation functions."
"used for training) was provided. Setting this property to `train`. "
)

def test_insert(self, mocker):
Expand Down
Loading

0 comments on commit e87331e

Please sign in to comment.