Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add a mutual information CQM selector #16

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,6 @@ dmypy.json

# Pyre type checker
.pyre/

# cython
dwave/plugins/sklearn/*.cpp
117 changes: 117 additions & 0 deletions dwave/plugins/sklearn/nearest_neighbors.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# distutils: language = c++
import numpy as np
cimport numpy as np
from libcpp.vector cimport vector

cdef extern from "math.h":
double log(double x) nogil
Comment on lines +5 to +7
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
cdef extern from "math.h":
double log(double x) nogil
from libc.math cimport log

You can see it here. If you're wondering how to find what Cython actually has already implemented... the answer is that I literally search the source files every. single. time. 😄


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You'll get a pretty big performance boost (at the cost of safety) by using

Suggested change
@cython.boundscheck(False) # Deactivate bounds checking
@cython.wraparound(False) # Deactivate negative indexing.

see Cython for NumPy users which is an excellent tutorial.

def calculate_mi(np.ndarray[np.float_t,ndim = 2] X,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In modern Cython, they want us to use Typed Memoryviews. So for instance here it would be

Suggested change
def calculate_mi(np.ndarray[np.float_t,ndim = 2] X,
def calculate_mi(np.float_t[:] X,

np.ndarray[np.float_t,ndim = 1] y,
unsigned long refinement_factor = 5):

cdef unsigned long n_obs = X.shape[0]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general, I have found it far less headache when going back and forth between NumPy and Cython to use fixed-width types everywhere. So

Suggested change
cdef unsigned long n_obs = X.shape[0]
cdef np.uint64_t n_obs = X.shape[0]

Cython, because everything gets moderated through an intermediate generated file, tends to get confused about typing, and keeping everything fixed tends to help.

cdef float log_n_obs = log(n_obs)

cdef unsigned long n_variables = X.shape[1]

if(n_obs != y.shape[0]):
raise ValueError("y is the wrong shape")

cdef long sub_index_size = refinement_factor*int(np.round(log(n_obs))) + 2
cdef total_state_space_size = pow(sub_index_size,3)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you want pow?


cdef np.ndarray[np.float_t, ndim = 2] X_and_y = np.zeros((n_obs, n_variables + 1), dtype = float)

X_and_y[:,:-1] = X
X_and_y[:,-1] = y

cdef np.ndarray[unsigned long, ndim = 2] assignments = assign_nn(X_and_y, sub_index_size)
cdef np.ndarray[unsigned long, ndim = 4] assignment_counts_x = np.zeros((sub_index_size, sub_index_size, n_variables,n_variables), dtype=np.uint)
cdef np.ndarray[unsigned long, ndim = 2] assignment_counts_x_univariate = np.zeros((sub_index_size, n_variables), dtype=np.uint)
cdef np.ndarray[unsigned long, ndim = 1] assignment_counts_y = np.zeros((sub_index_size), dtype=np.uint)
cdef np.ndarray[unsigned long, ndim = 5] assignment_counts_joint = np.zeros((sub_index_size, sub_index_size, sub_index_size, n_variables, n_variables), dtype=np.uint)
cdef np.ndarray[unsigned long, ndim = 3] assignment_counts_joint_univariate = np.zeros((sub_index_size, sub_index_size, n_variables), dtype=np.uint)

cdef np.ndarray[double, ndim = 2] mi_matrix = np.zeros((n_variables, n_variables))

for variable_1 in range(n_variables):
for variable_2 in range(n_variables):
if variable_1 > variable_2:
for obs in range(n_obs):
assignment_counts_x[assignments[obs, variable_1],assignments[obs, variable_2],variable_1, variable_2] += 1
assignment_counts_joint[assignments[obs, variable_1],assignments[obs, variable_2], assignments[obs, -1],variable_1, variable_2] += 1

for obs in range(n_obs):
assignment_counts_x_univariate[assignments[obs, variable_1], variable_1] += 1
assignment_counts_joint_univariate[assignments[obs, variable_1], assignments[obs, -1], variable_1] += 1

for obs in range(n_obs):
assignment_counts_y[assignments[obs, -1]] += 1

for variable_1 in range(n_variables):
for variable_2 in range(n_variables):
if variable_1 > variable_2:
for i in range(sub_index_size):
for j in range(sub_index_size):
for k in range(sub_index_size):
if assignment_counts_joint[i,j,k,variable_1, variable_2] > 0:
ijk_term = (<float> assignment_counts_joint[i,j,k,variable_1, variable_2])
ijk_term *= log(ijk_term) - log(assignment_counts_x[i,j,variable_1, variable_2]) - log(assignment_counts_y[k]) + log_n_obs
mi_matrix[variable_1, variable_2] += ijk_term
mi_matrix[variable_2, variable_1] += ijk_term
for i in range(sub_index_size):
for k in range(sub_index_size):
if assignment_counts_joint_univariate[i,k,variable_1] > 0:
ik_term = (<float> assignment_counts_joint_univariate[i,k,variable_1])
ik_term *= log(ik_term) - log(assignment_counts_x_univariate[i,variable_1]) - log(assignment_counts_y[k]) + log_n_obs
mi_matrix[variable_1, variable_1] += ik_term

return mi_matrix/n_obs

cdef assign_nn(np.ndarray[np.float_t,ndim = 2] X,
long sub_index_size,
short seed = 12121):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general, it's preferred to make the default seed None.

Also, in this case you're better off not specifying a C type for seed. This is because it's only passed to a python function, np.random.default_rng(seed), below. So if it starts out as a C-type, it will have to be converted to a Python type first. Of course, in this case the performance hit is so minor as to be pointless... but the other advantage of using object here rather than short is it supports None or another rng.


cdef unsigned long n_obs = X.shape[0]
cdef unsigned long n_variables = X.shape[1]

rng = np.random.default_rng(seed)

cdef long[::1] full_index = np.zeros(n_obs, dtype=long)

for i in range(n_obs):
full_index[i] = i
Comment on lines +81 to +84
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
cdef long[::1] full_index = np.zeros(n_obs, dtype=long)
for i in range(n_obs):
full_index[i] = i
cdef np.int64_t[::1] full_index = np.ones(n_obs, dtype=np.int64)


cdef long[::1] sub_index = rng.choice(full_index, sub_index_size)

# numpy floats are c doubles
cdef np.ndarray[double, ndim=2] sort_values_x = X[sub_index,:].astype(float)

cdef np.ndarray[unsigned long,ndim = 2] X_out = np.zeros((n_obs, n_variables), dtype=np.uint)

cdef unsigned long start_index = (<unsigned long> sub_index_size) >> 1

#print("start index: ", start_index)
for i in range(sort_values_x.shape[1]):
sort_values_x[:,i] = np.sort(sort_values_x[:,i])

for obs in range(X.shape[0]):
X_out[obs, i] = query(X[obs, i], sort_values_x[:,i],start_index)

return X_out

cdef unsigned long query(double query_number,np.ndarray[double, ndim=1] query_list, unsigned long idx):
    # Recursive binary search that maps `query_number` onto a bin index within
    # the sorted reference sample `query_list`, walking the index outward from
    # the starting midpoint `idx`.
    # NOTE(review): assumes `query_list` is sorted ascending (the caller sorts
    # each column before querying) — confirm. Each recursive call slices the
    # array, which allocates a new view object; passing start/stop indices
    # instead would avoid the per-call allocation.

    if query_list.shape[0] == 1:
        # Only one candidate bin remains — the accumulated index is the answer.
        return idx
    # Halve the search window (integer division by 2 via right shift).
    cdef unsigned long new_len = (<unsigned long> query_list.shape[0]) >> 1
    cdef double pivot_value = query_list[new_len]
    # The index step shrinks with the window, quartering each level so the
    # walk converges on a distinct bin per leaf.
    cdef unsigned long new_idx_unit = (<unsigned long> new_len) >> 1
    if query_number < pivot_value:
        # Descend into the lower half; move the index left by the step.
        #print("query : ", query_number, " pivot: ", pivot_value, " new index: ", idx - new_idx_unit)
        return query(query_number, query_list[:new_len], idx - new_idx_unit)
    else:
        # Descend into the upper half; move the index right by the step.
        #print("query : ", query_number, " pivot: ", pivot_value, " new index: ", idx + new_idx_unit)
        return query(query_number, query_list[new_len:], idx + new_idx_unit)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you may be able to get tail recursion if instead of using query_list[new_len:] you pass the index. query_list[new_len:] makes a new object, a view, so it doesn't copy the underlying data, but it's still a new object being created.


33 changes: 31 additions & 2 deletions dwave/plugins/sklearn/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from sklearn.utils.validation import check_is_fitted

from dwave.plugins.sklearn.utilities import corrcoef
from dwave.plugins.sklearn.nearest_neighbors import calculate_mi

__all__ = ["SelectFromQuadraticModel"]

Expand Down Expand Up @@ -218,6 +219,34 @@ def correlation_cqm(
cqm.set_objective((*it.multi_index, x) for x in it if x)

return cqm

@staticmethod
def mutual_information_cqm(
    X: npt.ArrayLike,
    y: npt.ArrayLike,
    *,
    alpha: float = 0.0,
    num_features: int,
    strict: bool = True,
) -> dimod.ConstrainedQuadraticModel:
    """Construct a CQM that selects features by estimated mutual information.

    Args:
        X: Feature matrix; must expose ``.shape`` with ``X.shape[1]`` columns
            (one binary variable is created per column).
        y: Target values; length must match ``X.shape[0]``
            (validated inside ``calculate_mi``).
        alpha: Accepted for signature parity with :meth:`correlation_cqm`.
            Currently unused by this method.
            NOTE(review): confirm whether an alpha-weighted objective (as in
            the correlation variant) was intended here.
        num_features: Number of features to select.
        strict: If ``True``, exactly ``num_features`` features are selected;
            otherwise at most ``num_features``.

    Returns:
        A :class:`dimod.ConstrainedQuadraticModel` whose objective holds the
        nonzero mutual-information coefficients and whose single constraint
        enforces the feature-count budget.
    """
    # ``alpha`` defaults to 0.0 so callers that omit it — e.g. ``fit``, which
    # invokes this with only ``num_features`` — do not raise a TypeError.
    cqm = dimod.ConstrainedQuadraticModel()
    cqm.add_variables(dimod.BINARY, X.shape[1])

    # Add the k-hot constraint: select exactly (or at most) num_features.
    cqm.add_constraint(
        ((v, 1) for v in cqm.variables),
        '==' if strict else '<=',
        num_features,
        label=f"{num_features}-hot",
    )

    # Pairwise MI estimates between features (off-diagonal) and each feature
    # with y (diagonal) — presumably; verify against calculate_mi's contract.
    mi_matrix = calculate_mi(X, y)

    # Only nonzero MI entries contribute objective terms; multi_index yields
    # the (row, col) variable pair for each coefficient.
    it = np.nditer(mi_matrix, flags=['multi_index'], op_flags=[['readonly']])
    cqm.set_objective((*it.multi_index, x) for x in it if x)

    return cqm

def fit(
self,
Expand Down Expand Up @@ -277,8 +306,8 @@ def fit(

if self.method == "correlation":
cqm = self.correlation_cqm(X, y, num_features=num_features, alpha=alpha)
# elif self.method == "mutual information":
# cqm = self.mutual_information_cqm(X, y, num_features=num_features)
elif self.method == "mutual information":
cqm = self.mutual_information_cqm(X, y, num_features=num_features)
else:
raise ValueError(f"only methods {self.acceptable_methods} are implemented")

Expand Down
10 changes: 9 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,13 @@
# limitations under the License.

from setuptools import setup
from Cython.Build import cythonize
import numpy

setup()
setup(
install_requires=[
"numpy",
],
ext_modules = cythonize(["./dwave/plugins/sklearn/nearest_neighbors.pyx"]),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
ext_modules = cythonize(["./dwave/plugins/sklearn/nearest_neighbors.pyx"]),
ext_modules = cythonize(["./dwave/plugins/sklearn/nearest_neighbors.pyx"], annotate=True),

Cython annotations are super helpful. We do some stuff with environment variables in some of our other packages (e.g. dimod) but there's no harm in just turning it on always. At some point I'll get around to making a PR to dimod to just change that to always be True.

include_dirs=[numpy.get_include()]
)