Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

field filter array dereference #302

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions exetera/core/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def get_spans(self):
"""
raise NotImplementedError("Please use get_spans() on specific fields, not the field base class.")

def apply_filter(self, filter_to_apply, dstfld=None):
def apply_filter(self, filter_to_apply, target=None, in_place=False):
"""
Apply filter on the field.
"""
Expand All @@ -143,6 +143,27 @@ def _ensure_valid(self):
if not self._valid_reference:
raise ValueError("This field no longer refers to a valid underlying field object")

def __getitem__(self, item: Union[slice, int, list, tuple, np.ndarray]):
    """
    Dereference the field with ``field[item]``.

    A list, tuple or numpy array made up solely of booleans is treated as a
    filter and forwarded to ``apply_filter``. Any other list, tuple or numpy
    array is treated as a collection of indices and forwarded to
    ``apply_index``. Both calls are made with ``target=None, in_place=False``.

    :param item: the filter (booleans) or the indices to apply to this field.
    :return: the value returned by ``apply_filter`` or ``apply_index``.
    :raises NotImplementedError: if ``item`` is a slice or an int; these
        forms of dereferencing are not implemented yet.
    :raises TypeError: if ``item`` is of any other unsupported type.
    """
    if isinstance(item, slice):
        # TODO: review suggestion (atbenmurray, 2022-07-25): forward to
        # self.data[item] in the general case and wrap the result in a
        # memory field.
        raise NotImplementedError("Dereferencing a field with a slice is not implemented yet.")

    if isinstance(item, int):
        # TODO: review suggestion: fetch a one-element array with
        # self.data[[item]] and wrap it in a memory field.
        raise NotImplementedError("Dereferencing a field with an int is not implemented yet.")

    if not isinstance(item, (list, tuple, np.ndarray)):
        raise TypeError(
            f"Fields can only be dereferenced with a list, tuple or numpy array, not {type(item)}")

    if isinstance(item, np.ndarray) and item.dtype != object:
        # The dtype already tells us what the elements are. Checking elements
        # with isinstance(x, bool) would fail here because iterating a boolean
        # array yields np.bool_, which is not a subclass of bool.
        is_filter = item.dtype == np.bool_
    else:
        # An empty list/tuple counts as a filter, as all() of nothing is True.
        is_filter = all(isinstance(x, (bool, np.bool_)) for x in item)

    if is_filter:
        filter_to_apply = np.asarray(item, dtype=bool)
        return self.apply_filter(filter_to_apply, target=None, in_place=False)

    # numpy arrays are passed through untouched; lists and tuples become int64.
    index_to_apply = item if isinstance(item, np.ndarray) else np.array(item, dtype=np.int64)
    return self.apply_index(index_to_apply, target=None, in_place=False)


class MemoryField(Field):

Expand Down Expand Up @@ -210,7 +231,7 @@ def __bool__(self):
# if f is not None:
return True

def apply_filter(self, filter_to_apply, dstfld=None):
def apply_filter(self, filter_to_apply, target=None, in_place=False):
"""
Apply filter on the field.
"""
Expand Down
54 changes: 54 additions & 0 deletions tests/test_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -2288,3 +2288,57 @@ def test_argsort(self, creator, name, kwargs, data):
else:
with self.assertRaises(ValueError):
fields.argsort(f)


# Each case is (boolean filter, creator name, creator kwargs, field data);
# the same three-element filter is used for every field type.
ARRAY_DEREFERENCE_FILTER_TESTS = [
    ([True, False, True], creator, kwargs, data)
    for creator, kwargs, data in (
        ("create_indexed_string", {}, ['a', 'bb', 'ccc']),
        ("create_fixed_string", {"length": 3}, ['a', 'b', 'c']),
        ("create_numeric", {"nformat": "int8"}, [20, 30, 40]),
        ("create_categorical", {"nformat": "int32", "key": {"a": 1, "b": 2, "c": 3}}, [1, 2, 3]),
    )
]

# Each case is (index list, creator name, creator kwargs, field data);
# the same two indices are used for every field type.
ARRAY_DEREFERENCE_INDEX_TESTS = [
    ([0, 2], creator, kwargs, data)
    for creator, kwargs, data in (
        ("create_indexed_string", {}, ['a', 'bb', 'ccc']),
        ("create_fixed_string", {"length": 3}, ['a', 'b', 'c']),
        ("create_numeric", {"nformat": "int8"}, [20, 30, 40]),
        ("create_categorical", {"nformat": "int32", "key": {"a": 1, "b": 2, "c": 3}}, [1, 2, 3]),
    )
]

class TestArrayDereferenceFunctions(SessionTestCase):
    """
    Check that ``field[...]`` with a boolean filter or a list of indices gives
    the same data as calling ``apply_filter`` / ``apply_index`` directly, and
    that the result is the memory field type matching the source field.
    """

    def assertIfMemFieldAndIfSameTypeAsField(self, memfield, field):
        """
        Assert that ``memfield`` is a ``MemoryField`` whose concrete type is
        the in-memory counterpart of ``field``'s type.

        :raises AssertionError: if ``memfield`` is not a memory field or the
            two types do not form one of the known field / memory field pairs.
        """
        self.assertIsInstance(memfield, fields.MemoryField)

        # (field type, matching memory field type)
        matching_types = (
            (fields.IndexedStringField, fields.IndexedStringMemField),
            (fields.FixedStringField, fields.FixedStringMemField),
            (fields.NumericField, fields.NumericMemField),
            (fields.CategoricalField, fields.CategoricalMemField),
        )
        if not any(isinstance(field, field_type) and isinstance(memfield, mem_type)
                   for field_type, mem_type in matching_types):
            raise AssertionError(f"{type(memfield)} is not the MemField for {type(field)}")

    @parameterized.expand(ARRAY_DEREFERENCE_FILTER_TESTS)
    def test_field_filter_dereference(self, filter_to_apply, creator, kwargs, data):
        """Dereferencing with booleans must match ``apply_filter`` with the same mask."""
        f = self.setup_field(self.df, creator, 'f', (), kwargs, data)
        result = f[filter_to_apply]

        # The expectation must use a boolean mask: numpy treats an integer
        # array as a list of positions, not as a filter.
        mask = np.asarray(filter_to_apply, dtype=bool)
        expected_result = f.apply_filter(mask, target=None, in_place=False)

        self.assertIfMemFieldAndIfSameTypeAsField(result, f)
        np.testing.assert_array_equal(result.data[:], expected_result.data[:])

    @parameterized.expand(ARRAY_DEREFERENCE_INDEX_TESTS)
    def test_field_index_dereference(self, index, creator, kwargs, data):
        """Dereferencing with indices must match ``apply_index`` with the same indices."""
        f = self.setup_field(self.df, creator, 'f', (), kwargs, data)
        result = f[index]

        # int64 is the dtype that field[...] uses when it converts a list of indices.
        index_to_apply = index if isinstance(index, np.ndarray) else np.array(index, dtype=np.int64)
        expected_result = f.apply_index(index_to_apply, target=None, in_place=False)

        self.assertIfMemFieldAndIfSameTypeAsField(result, f)
        np.testing.assert_array_equal(result.data[:], expected_result.data[:])