Skip to content

Commit

Permalink
Add PyTorch code to movielens.md
Browse files Browse the repository at this point in the history
  • Loading branch information
KuanHaoHuang committed Aug 7, 2023
1 parent f1c5af5 commit d761547
Showing 1 changed file with 92 additions and 3 deletions.
95 changes: 92 additions & 3 deletions chapter_recommender-systems/movielens.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,18 @@ import os
import pandas as pd
```

```{.python .input}
#@tab pytorch
from d2l import torch as d2l
import torch
import os
import pandas as pd
```

Then, we download the MovieLens 100k dataset and load the interactions as `DataFrame`.

```{.python .input n=2}
#@tab mxnet
#@tab all
#@save
d2l.DATA_HUB['ml-100k'] = (
'https://files.grouplens.org/datasets/movielens/ml-100k.zip',
Expand All @@ -43,7 +51,7 @@ def read_data_ml100k():
Let's load up the data and inspect the first five records manually. It is an effective way to learn the data structure and verify that they have been loaded properly.

```{.python .input n=3}
#@tab mxnet
#@tab all
data, num_users, num_items = read_data_ml100k()
sparsity = 1 - len(data) / (num_users * num_items)
print(f'number of users: {num_users}, number of items: {num_items}')
Expand All @@ -56,7 +64,7 @@ We can see that each line consists of four columns, including "user id" 1-943, "
We then plot the distribution of the count of different ratings. As expected, it appears to be a normal distribution, with most ratings centered at 3-4.

```{.python .input n=4}
#@tab mxnet
#@tab all
d2l.plt.hist(data['rating'], bins=5, ec='black')
d2l.plt.xlabel('Rating')
d2l.plt.ylabel('Count')
Expand Down Expand Up @@ -95,6 +103,33 @@ def split_data_ml100k(data, num_users, num_items,
return train_data, test_data
```

```{.python .input}
#@tab pytorch
#@save
def split_data_ml100k(data, num_users, num_items,
                      split_mode='random', test_ratio=0.1):
    """Split the dataset in random mode or seq-aware mode.

    In ``'seq-aware'`` mode, each user's most recent interaction (largest
    timestamp) is held out for testing and the remaining interactions,
    ordered by time, form the training set. In ``'random'`` mode, each row
    is independently assigned to the training set with probability
    ``1 - test_ratio``.

    Returns a ``(train_data, test_data)`` pair of ``DataFrame``s.
    """
    if split_mode == 'seq-aware':
        train_items, test_items, train_list = {}, {}, []
        for line in data.itertuples():
            u, i, rating, time = line[1], line[2], line[3], line[4]
            train_items.setdefault(u, []).append((u, i, rating, time))
            # Keep only the latest-timestamped interaction per user as test.
            if u not in test_items or test_items[u][-1] < time:
                test_items[u] = (i, rating, time)
        for u in range(1, num_users + 1):
            # .get guards against users with no interactions.
            train_list.extend(sorted(train_items.get(u, []),
                                     key=lambda k: k[3]))
        test_data = [(key, *value) for key, value in test_items.items()]
        # Set lookup is O(1) per row; scanning the test list was O(n * m).
        held_out = set(test_data)
        train_data = [item for item in train_list if item not in held_out]
        train_data = pd.DataFrame(train_data)
        test_data = pd.DataFrame(test_data)
    else:
        # Bernoulli(1 - test_ratio) mask over the rows.
        mask = (torch.rand(len(data)) < 1 - test_ratio).tolist()
        neg_mask = [not m for m in mask]
        train_data, test_data = data[mask], data[neg_mask]
    return train_data, test_data
```

Note that it is good practice to use a validation set in practice, apart from only a test set. However, we omit that for the sake of brevity. In this case, our test set can be regarded as our held-out validation set.

## Loading the data
Expand All @@ -120,6 +155,25 @@ def load_data_ml100k(data, num_users, num_items, feedback='explicit'):
return users, items, scores, inter
```

```{.python .input}
#@tab pytorch
#@save
def load_data_ml100k(data, num_users, num_items, feedback='explicit'):
    """Unpack interactions into parallel index lists and a record ``inter``.

    Returns zero-based user indices, zero-based item indices, scores, and
    ``inter``: an item-by-user rating matrix when ``feedback`` is
    ``'explicit'``, otherwise a dict mapping each user index to the list
    of item indices that user interacted with.
    """
    users, items, scores = [], [], []
    inter = {} if feedback != 'explicit' else torch.zeros(
        (num_items, num_users))
    for row in data.itertuples():
        u = int(row[1] - 1)
        i = int(row[2] - 1)
        s = 1 if feedback != 'explicit' else int(row[3])
        users.append(u)
        items.append(i)
        scores.append(s)
        if feedback == 'implicit':
            inter.setdefault(u, []).append(i)
        else:
            inter[i, u] = s
    return users, items, scores, inter
```

Afterwards, we put the above steps together and it will be used in the next section. The results are wrapped with `Dataset` and `DataLoader`. Note that the `last_batch` of `DataLoader` for training data is set to the `rollover` mode (the remaining samples are rolled over to the next epoch) and orders are shuffled. In PyTorch, this is approximated with `drop_last=True`, which instead drops the final incomplete batch.

```{.python .input n=7}
Expand All @@ -146,6 +200,41 @@ def split_and_load_ml100k(split_mode='seq-aware', feedback='explicit',
return num_users, num_items, train_iter, test_iter
```

```{.python .input}
#@tab pytorch
#@save
def split_and_load_ml100k(split_mode='seq-aware', feedback='explicit',
                          test_ratio=0.1, batch_size=256):
    """Download ML-100k, split it, and return DataLoaders of (u, i, r).

    Returns ``(num_users, num_items, train_iter, test_iter)``; the
    training loader shuffles and drops the final incomplete batch.
    """
    data, num_users, num_items = read_data_ml100k()
    train_data, test_data = split_data_ml100k(
        data, num_users, num_items, split_mode, test_ratio)
    train_u, train_i, train_r, _ = load_data_ml100k(
        train_data, num_users, num_items, feedback)
    test_u, test_i, test_r, _ = load_data_ml100k(
        test_data, num_users, num_items, feedback)

    class TripletDataset(torch.utils.data.Dataset):
        # Thin wrapper exposing three parallel lists as one indexed dataset.
        def __init__(self, users, items, ratings):
            assert len(users) == len(items) == len(ratings)
            self.users, self.items, self.ratings = users, items, ratings

        def __len__(self):
            return len(self.users)

        def __getitem__(self, idx):
            return self.users[idx], self.items[idx], self.ratings[idx]

    train_iter = torch.utils.data.DataLoader(
        TripletDataset(train_u, train_i, train_r), batch_size,
        shuffle=True, drop_last=True)
    test_iter = torch.utils.data.DataLoader(
        TripletDataset(test_u, test_i, test_r), batch_size)
    return num_users, num_items, train_iter, test_iter
```

## Summary

* MovieLens datasets are widely used for recommendation research. They are publicly available and free to use.
Expand Down

0 comments on commit d761547

Please sign in to comment.