Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

20/isin array #34

Open
wants to merge 32 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
bfe5fad
Initial framework for isin_array.
chaburkland Mar 6, 2021
fc068df
Commit to save work. Not working.
chaburkland Mar 9, 2021
e98d0fd
Finishes initial iteration for non-object, 1D or 2D, unique arrays
chaburkland Mar 10, 2021
a0aa83e
Lints.
chaburkland Mar 10, 2021
01e0496
Lints.
chaburkland Mar 10, 2021
329258d
Adds a more explicit failure message.
chaburkland Mar 10, 2021
64286d9
Adds more info to debug msg.
chaburkland Mar 10, 2021
2cb35fd
Adds test for failing windows.
chaburkland Mar 10, 2021
996b47a
Adds initial support for object dtypes.
chaburkland Mar 10, 2021
b546e34
Cleans up ref counts and error handling in AK_isin_array_object.
chaburkland Mar 12, 2021
71d2fa2
Adds AK_GOTO_ON_NOT. Cleans up variable names. Handles refcounts & er…
chaburkland Mar 12, 2021
6c07d65
Partial commit to save work. Working on infrastructure for non-unique…
chaburkland Mar 13, 2021
376c49a
Fixes compiler warnings & rearranges order of function declarations
chaburkland Mar 13, 2021
d306991
Completes initial pass to have compiling code. Not working yet.
chaburkland Mar 13, 2021
38eaeb7
Fixes some reference count issues, type inconsistencies, and adds a b…
chaburkland Mar 13, 2021
ac476ad
Adds an alternative method to simply use numpy :(
chaburkland Mar 13, 2021
d3b818c
Adds a lot more performance testing infrastructure.
chaburkland Mar 14, 2021
e6b1c85
Finishes perf setup.
chaburkland Mar 14, 2021
52c029c
Adds work for missing trailing False for non-unique arrays.
chaburkland Mar 14, 2021
904cdf6
Finishes debug testing on 2D non-unique branch.
chaburkland Mar 14, 2021
886ec4a
Lints.
chaburkland Mar 14, 2021
db5bf34
Updates macro to support previous Python versions.
chaburkland Mar 14, 2021
4aafd21
Cleans up a lot of duplicate code.
chaburkland Mar 15, 2021
29a4ec7
Fixes some c-compiler warnings I think?
chaburkland Mar 15, 2021
d9687ae
Begins to clean up bad memory mgmt.
chaburkland Mar 18, 2021
baa5421
Rips out manual in1d impl. Removes GOTO macro. Other misc changes.
chaburkland Mar 18, 2021
b161f01
Update arraykit.c
chaburkland Mar 26, 2021
1ddc9ad
Merge branch 'master' into 20/isin_array
chaburkland Mar 26, 2021
da43f86
Modifies object reference code to be better.
chaburkland Mar 29, 2021
b242227
Merge branch 'master' into 20/isin_array
chaburkland Apr 9, 2021
a1c7ec9
Improves object array iteration. Renames a macro. Ignores *.diff files.
chaburkland Apr 9, 2021
ca9762a
Merge branch 'master' into 20/isin_array
chaburkland Apr 26, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
235 changes: 215 additions & 20 deletions arraykit.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
# define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION

# include "numpy/arrayobject.h"
# include "numpy/arrayscalars.h" // Needed for Datetime scalar expansions
# include "numpy/ufuncobject.h"

//------------------------------------------------------------------------------
// Macros
Expand Down Expand Up @@ -41,6 +43,18 @@
return NULL;\
} while (0)

// Return NULL from the enclosing function when `obj` evaluates to false.
// The argument is parenthesized so a compound expression is negated as a
// whole, and the body is wrapped in `do { } while (0)` so the macro expands to
// exactly one statement (safe inside an unbraced if/else). Call sites must
// terminate it with `;`.
# define AK_CHECK_NOT(obj)  \
    do {                    \
        if (!(obj)) {       \
            return NULL;    \
        }                   \
    } while (0)

// Debug helper: print the source text of `obj`, a colon, the object itself
// (via PyObject_Print), a newline, and then flush stdout.
// The source text is passed as a "%s" argument instead of being used as the
// format string, so an expression containing `%` cannot be misread by printf.
// The statements are grouped in `do { } while (0)` so the whole macro is
// governed by an enclosing unbraced `if`. The trailing `;` is kept on purpose
// so call sites written without their own semicolon continue to compile.
# define AK_PPRINT(obj)                     \
    do {                                    \
        printf("%s: ", #obj);               \
        PyObject_Print((obj), stdout, 0);   \
        printf("\n");                       \
        fflush(stdout);                     \
    } while (0);

// Debug helper: print the line `DEBUG` and flush stdout.
// Both statements are grouped in `do { } while (0)` so an enclosing unbraced
// `if` guards the flush as well as the print. The trailing `;` is kept on
// purpose so call sites written without their own semicolon still compile.
# define AK_DEBUG               \
    do {                        \
        printf("DEBUG\n");      \
        fflush(stdout);         \
    } while (0);

# if defined __GNUC__ || defined __clang__
# define AK_LIKELY(X) __builtin_expect(!!(X), 1)
Expand Down Expand Up @@ -98,9 +112,8 @@ PyArray_Descr*
AK_ResolveDTypeIter(PyObject *dtypes)
{
PyObject *iterator = PyObject_GetIter(dtypes);
if (iterator == NULL) {
return NULL;
}
AK_CHECK_NOT(iterator);

PyArray_Descr *resolved = NULL;
PyArray_Descr *dtype;
while ((dtype = (PyArray_Descr*) PyIter_Next(iterator))) {
Expand Down Expand Up @@ -237,11 +250,8 @@ static PyObject *
resolve_dtype(PyObject *Py_UNUSED(m), PyObject *args)
{
    // Python entry point: expects exactly two numpy dtype objects and returns
    // whatever AK_ResolveDTypes produces for them. Returns NULL (with the
    // exception set by PyArg_ParseTuple) when the arguments do not match.
    PyArray_Descr *d1, *d2;

    // Parse once; the "O!" converters type-check both arguments against
    // PyArrayDescr_Type and store borrowed references.
    AK_CHECK_NOT(PyArg_ParseTuple(args, "O!O!:resolve_dtype",
            &PyArrayDescr_Type, &d1, &PyArrayDescr_Type, &d2));

    return (PyObject *)AK_ResolveDTypes(d1, d2);
}

Expand All @@ -251,6 +261,195 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg)
return (PyObject *)AK_ResolveDTypeIter(arg);
}


//------------------------------------------------------------------------------
// isin

// Delegate membership testing to numpy: calls `numpy.in1d` when `array` is
// 1-dimensional and `numpy.isin` otherwise, as
// `func(array, other, assume_unique=<assume_unique>)`.
//
// Returns a new reference to whatever numpy returns, or NULL with a Python
// exception set on failure. `array` and `other` are borrowed.
//
// NOTE(review): recent NumPy releases appear to deprecate `in1d` in favour of
// `isin`; confirm against the supported NumPy range before changing the
// dispatch below.
static PyObject *
AK_isin_array_dtype_use_np(PyArrayObject *array, PyArrayObject *other, int assume_unique)
{
    PyObject *result = NULL;
    PyObject *args = NULL;
    PyObject *kwarg = NULL;
    PyObject *assume_unique_obj = NULL;
    PyObject *numpy = NULL;
    PyObject *func = NULL;

    args = PyTuple_Pack(2, (PyObject *)array, (PyObject *)other);
    if (!args) {
        goto cleanup;
    }

    kwarg = PyDict_New();
    if (!kwarg) {
        goto cleanup;
    }

    assume_unique_obj = PyLong_FromLong((long)assume_unique);
    if (!assume_unique_obj) {
        goto cleanup;
    }

    // The dict takes its own reference; ours is released in cleanup.
    if (PyDict_SetItemString(kwarg, "assume_unique", assume_unique_obj) == -1) {
        goto cleanup;
    }

    numpy = PyImport_ImportModule("numpy");
    if (!numpy) {
        goto cleanup;
    }

    func = PyObject_GetAttrString(numpy, PyArray_NDIM(array) == 1 ? "in1d" : "isin");
    if (!func) {
        goto cleanup;
    }

    // PyObject_Call does not steal `args` or `kwarg`; on failure `result`
    // stays NULL and the exception is propagated to the caller.
    result = PyObject_Call(func, args, kwarg);

cleanup:
    // Every temporary is released on success and on failure alike, so the
    // argument tuple and keyword dict are not leaked on the success path.
    Py_XDECREF(func);
    Py_XDECREF(numpy);
    Py_XDECREF(assume_unique_obj);
    Py_XDECREF(kwarg);
    Py_XDECREF(args);
    return result;
}

// Set-based membership test used when object dtypes are involved.
//
// Algorithm:
//
//     lookup = frozenset(other)
//     for loc, element in loc_iter(array):   # C (row-major) order
//         result[loc] = element in lookup
//
// Returns a new C-contiguous boolean array with the same shape as `array`, or
// NULL with a Python exception set (for example when an element of either
// array is unhashable). `array` and `other` are borrowed.
static PyObject *
AK_isin_array_object(PyArrayObject *array, PyArrayObject *other)
{
    PyObject *compare_elements = NULL;  // frozenset built from `other`
    PyArrayObject *source = NULL;       // owned; object-dtype data to iterate
    PyArrayObject *result = NULL;
    NpyIter *iter = NULL;
    int ok = 0;

    compare_elements = PyFrozenSet_New((PyObject *)other);
    if (!compare_elements) {
        goto done;
    }

    // The loop below reads each element as a `PyObject *`. When only `other`
    // has object dtype, `array` holds raw values rather than pointers, so it
    // is converted to an object array first.
    if (PyDataType_ISOBJECT(PyArray_DESCR(array))) {
        Py_INCREF(array);
        source = array;
    }
    else {
        // PyArray_CastToType steals the reference to the descriptor.
        source = (PyArrayObject *)PyArray_CastToType(
                array, PyArray_DescrFromType(NPY_OBJECT), 0);
        if (!source) {
            goto done;
        }
    }

    // PyArray_Empty steals the reference to the descriptor.
    result = (PyArrayObject *)PyArray_Empty(
            PyArray_NDIM(array),                // nd
            PyArray_DIMS(array),                // dims
            PyArray_DescrFromType(NPY_BOOL),    // dtype
            0);                                 // is_f_order
    if (!result) {
        goto done;
    }

    // NpyIter_New rejects zero-size operands unless explicitly enabled; an
    // empty input simply yields the (empty) result.
    if (PyArray_SIZE(source) == 0) {
        ok = 1;
        goto done;
    }

    // Iterate in C order regardless of the input's memory layout. `result` is
    // freshly allocated and C-contiguous, so the n-th element visited maps to
    // the n-th byte of the output buffer for any number of dimensions.
    // https://numpy.org/doc/stable/reference/c-api/iterator.html#simple-iteration-example
    iter = NpyIter_New(source,
            NPY_ITER_READONLY | NPY_ITER_REFS_OK | NPY_ITER_EXTERNAL_LOOP,
            NPY_CORDER,
            NPY_NO_CASTING,
            NULL);
    if (!iter) {
        goto done;
    }

    NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
    if (!iternext) {
        goto done;
    }

    char **dataptr = NpyIter_GetDataPtrArray(iter);
    npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter);
    npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
    npy_bool *out = (npy_bool *)PyArray_DATA(result);

    do {
        char *data = *dataptr;
        npy_intp stride = *strideptr;
        npy_intp count = *sizeptr;

        for (; count > 0; --count, data += stride) {
            // memcpy avoids an unaligned / aliasing pointer read.
            PyObject *obj;
            memcpy(&obj, data, sizeof(obj));
            if (!obj) {
                // An unset slot in an object array is read as None by NumPy.
                obj = Py_None;
            }

            // Hold a reference while arbitrary __hash__/__eq__ code runs.
            Py_INCREF(obj);
            int found = PySequence_Contains(compare_elements, obj);
            Py_DECREF(obj);

            if (found == -1) {
                goto done;
            }
            *out++ = (npy_bool)found;
        }
        // Advance the iterator to the next inner loop
    } while (iternext(iter));

    ok = 1;

done:
    if (iter != NULL) {
        NpyIter_Deallocate(iter);
    }
    Py_XDECREF(source);
    Py_XDECREF(compare_elements);
    if (!ok) {
        Py_XDECREF(result);
        result = NULL;
    }
    return (PyObject *)result;
}

// Python entry point `isin_array(array, array_is_unique, other, other_is_unique)`.
// Both arrays must be numpy arrays and `other` must be 1-dimensional,
// otherwise a TypeError is raised. Arrays involving an object dtype are
// handled by AK_isin_array_object; everything else is forwarded to
// AK_isin_array_dtype_use_np, with uniqueness assumed only when both flags are
// set.
static PyObject *
isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
{
    static char *kwlist[] = {"array", "array_is_unique", "other", "other_is_unique", NULL};

    PyArrayObject *array;
    PyArrayObject *other;
    int array_is_unique;
    int other_is_unique;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!iO!i:isin_array",
            kwlist,
            &PyArray_Type, &array, &array_is_unique,
            &PyArray_Type, &other, &other_is_unique))
    {
        return NULL;
    }

    if (PyArray_NDIM(other) != 1) {
        // PyErr_Format always returns NULL after setting the exception.
        PyErr_Format(PyExc_TypeError, "Expected other to be 1-dimensional");
        return NULL;
    }

    const int involves_object =
            PyDataType_ISOBJECT(PyArray_DTYPE(array))
            || PyDataType_ISOBJECT(PyArray_DTYPE(other));

    if (!involves_object) {
        // Use numpy in1d logic for dtype arrays
        const int both_unique = array_is_unique && other_is_unique;
        return AK_isin_array_dtype_use_np(array, other, both_unique);
    }

    // Use Python sets to handle object arrays
    return AK_isin_array_object(array, other);
}

//------------------------------------------------------------------------------
// ArrayGO
//------------------------------------------------------------------------------
Expand Down Expand Up @@ -316,13 +515,10 @@ ArrayGO_new(PyTypeObject *cls, PyObject *args, PyObject *kwargs)
int parsed = PyArg_ParseTupleAndKeywords(
args, kwargs, "O|$p:ArrayGO", argnames, &iterable, &own_iterable
);
if (!parsed) {
return NULL;
}
AK_CHECK_NOT(parsed);

ArrayGOObject *self = (ArrayGOObject *)cls->tp_alloc(cls, 0);
if (!self) {
return NULL;
}
AK_CHECK_NOT(self);

if (PyArray_Check(iterable)) {
if (!PyDataType_ISOBJECT(PyArray_DESCR((PyArrayObject *)iterable))) {
Expand Down Expand Up @@ -365,9 +561,8 @@ ArrayGO_append(ArrayGOObject *self, PyObject *value)
{
if (!self->list) {
self->list = PyList_New(1);
if (!self->list) {
return NULL;
}
AK_CHECK_NOT(self->list);

Py_INCREF(value);
PyList_SET_ITEM(self->list, 0, value);
}
Expand All @@ -383,9 +578,8 @@ ArrayGO_extend(ArrayGOObject *self, PyObject *values)
{
if (!self->list) {
self->list = PySequence_List(values);
if (!self->list) {
return NULL;
}
AK_CHECK_NOT(self->list);

Py_RETURN_NONE;
}
Py_ssize_t len = PyList_Size(self->list);
Expand Down Expand Up @@ -527,6 +721,7 @@ static PyMethodDef arraykit_methods[] = {
{"row_1d_filter", row_1d_filter, METH_O, NULL},
{"resolve_dtype", resolve_dtype, METH_VARARGS, NULL},
{"resolve_dtype_iter", resolve_dtype_iter, METH_O, NULL},
{"isin_array", (PyCFunction)isin_array, METH_VARARGS | METH_KEYWORDS, NULL},
{NULL},
};

Expand Down
6 changes: 6 additions & 0 deletions arraykit.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,9 @@ def column_1d_filter(__array: np.array) -> np.ndarray: ...
def row_1d_filter(__array: np.array) -> np.ndarray: ...
def resolve_dtype(__d1: np.dtype, __d2: np.dtype) -> np.dtype: ...
def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ...
# Report, for each element of `array`, whether it occurs in `other`.
# Per the C implementation in this change set: `other` must be 1-dimensional
# (TypeError otherwise); when either array has object dtype the elements are
# tested against a frozenset built from `other`; otherwise the call is
# delegated to numpy `in1d` (1-D `array`) or `isin`, with `assume_unique` set
# only when both `array_is_unique` and `other_is_unique` are truthy.
# NOTE(review): this stub declares every argument keyword-only, while the C
# argument parser also accepts them positionally — confirm which is intended.
def isin_array(*,
array: np.ndarray,
array_is_unique: bool,
other: np.ndarray,
other_is_unique: bool,
) -> np.ndarray: ...
Loading