forked from recommenders-team/recommenders
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sparse.py
196 lines (144 loc) · 6.92 KB
/
sparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""
Generate the user/item affinity matrix from a pandas dataframe and vice versa
"""
import pandas as pd
import numpy as np
import itertools
from scipy.sparse import coo_matrix
import logging
# import default parameters
from reco_utils.common.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
DEFAULT_TIMESTAMP_COL,
DEFAULT_PREDICTION_COL,
)
# Module-level logger named after this module (via __name__); all messages
# emitted by the code below go through it.
log = logging.getLogger(__name__)
class AffinityMatrix:
    """Convert a ratings dataframe to a dense user/item matrix and back.

    Typical usage is to call ``gen_affinity_matrix()`` once to build the
    matrix (and the index maps), and later ``map_back_sparse()`` to turn a
    matrix of the same layout back into a dataframe keyed by the original
    user/item ids.
    """

    def __init__(
        self,
        DF,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_pred=DEFAULT_PREDICTION_COL,
        save_path=None,
    ):
        """Store the input dataframe and the column names used to read it.

        Args:
            DF (pd.DataFrame): a dataframe containing the data
            col_user (str): name of the user column
            col_item (str): name of the item column
            col_rating (str): name of the rating column
            col_pred (str): name given to the value column when a matrix of
                predictions is mapped back to a dataframe
            save_path (str): directory in which to save the user/item maps;
                nothing is saved when None
        """
        self.df = DF

        # dataframe column names
        self.col_item = col_item
        self.col_user = col_user
        self.col_rating = col_rating
        self.col_pred = col_pred

        # optional location for persisting the index maps
        self.save_path = save_path

    def _gen_index(self):
        """Build the mappings between original user/item ids and matrix indices.

        Sets the following attributes:
            df_: copy of the input dataframe sorted by user, with two extra
                columns "hashedUsers" and "hashedItems" holding matrix indices
            Nusers, Nitems: number of distinct users / items
            map_users, map_items: original id -> matrix index
            map_back_users, map_back_items: matrix index -> original id

        Basic mechanics:
            The unique users and items present in the data are retrieved
            first and then mapped to contiguous integers starting at 0, in
            order of first appearance in the user-sorted dataframe. This
            yields a compact matrix with no empty gaps for ids that never
            occur. The inverse maps allow results to be expressed with the
            original ids, and all four maps can optionally be saved so they
            can be reused together with a pretrained model.
        """
        # sort_values returns a new frame, so the caller's dataframe is not
        # modified by the columns added below
        self.df_ = self.df.sort_values(by=[self.col_user])

        unique_users = self.df_[self.col_user].unique()
        unique_items = self.df_[self.col_item].unique()

        self.Nusers = len(unique_users)
        self.Nitems = len(unique_items)

        # original id -> matrix index
        self.map_users = {x: i for i, x in enumerate(unique_users)}
        self.map_items = {x: i for i, x in enumerate(unique_items)}

        # matrix index -> original id
        self.map_back_users = {i: x for i, x in enumerate(unique_users)}
        self.map_back_items = {i: x for i, x in enumerate(unique_items)}

        self.df_.loc[:, "hashedItems"] = self.df_[self.col_item].map(self.map_items)
        self.df_.loc[:, "hashedUsers"] = self.df_[self.col_user].map(self.map_users)

        # optionally persist the dictionaries to work with trained models
        if self.save_path is not None:
            np.save(self.save_path + "/user_dict", self.map_users)
            np.save(self.save_path + "/item_dict", self.map_items)
            np.save(self.save_path + "/user_back_dict", self.map_back_users)
            np.save(self.save_path + "/item_back_dict", self.map_back_items)

    def gen_affinity_matrix(self):
        """Generate the user/item affinity matrix.

        Returns:
            np.ndarray: dense matrix of shape (Nusers, Nitems), also stored as
            ``self.AM``. Entries with no rating are 0, so a rating whose value
            is exactly 0 is indistinguishable from a missing one.

        Basic mechanics:
            ``_gen_index()`` adds two columns to a sorted copy of the input
            dataframe containing the matrix row/column index of every rating.
            These indices, together with the ratings, are passed to scipy's
            coo_matrix and the result is converted to a dense array; see
            https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html
            The input format is:
            coo_matrix((data, (rows, columns)), shape=(rows, columns))
            Note that coo_matrix sums duplicate (row, column) entries when it
            is converted, so repeated user/item pairs end up added together.
        """
        log.info("Generating the user/item affinity matrix...")

        self._gen_index()

        ratings = self.df_[self.col_rating]
        itm_id = self.df_["hashedItems"]  # column index
        usr_id = self.df_["hashedUsers"]  # row index

        self.AM = coo_matrix(
            (ratings, (usr_id, itm_id)), shape=(self.Nusers, self.Nitems)
        ).toarray()

        # Report the percentage of zero entries. An empty matrix has no
        # elements, so guard the division: the NaN it would produce cannot be
        # formatted with %d and would raise instead of logging.
        total = self.AM.size
        zero = total - np.count_nonzero(self.AM)
        sparsity = zero / total * 100 if total else 0.0
        log.info("Matrix generated, sparseness percentage: %d", sparsity)

        return self.AM

    def map_back_sparse(self, X, kind):
        """Map a user/item affinity matrix back to a pandas dataframe.

        Only the non-zero entries of X produce a row. The index maps built by
        ``gen_affinity_matrix()`` are used to restore the original ids, so
        that method must have been called on this instance beforehand.

        Args:
            X (np.array, int32): user/item affinity matrix (2-D)
            kind (string): "ratings" to name the value column col_rating; any
                other value names it col_pred

        Returns:
            pd.DataFrame: one row per non-zero entry with the user column,
            the item column and the value column, ordered by matrix row and
            then by matrix column.
        """
        # np.nonzero walks the array in row-major order, i.e. row by row with
        # ascending columns inside each row.
        user_idx, item_idx = np.nonzero(X)
        values = X[user_idx, item_idx]

        col_out = self.col_rating if kind == "ratings" else self.col_pred

        # Plain lists are passed so that pandas infers the column dtypes
        # itself instead of inheriting the (possibly narrower) dtype of X.
        out_df = pd.DataFrame.from_dict(
            {
                self.col_user: user_idx.tolist(),
                self.col_item: item_idx.tolist(),
                col_out: values.tolist(),
            }
        )

        # translate matrix indices back to the original user/item ids
        out_df[self.col_user] = out_df[self.col_user].map(self.map_back_users)
        out_df[self.col_item] = out_df[self.col_item].map(self.map_back_items)

        return out_df