From d761547225f6bd72bff0dbe9813770124686824c Mon Sep 17 00:00:00 2001
From: Kuan-Hao Huang
Date: Mon, 7 Aug 2023 00:47:03 +0800
Subject: [PATCH] Add PyTorch code to movielens.md

---
 chapter_recommender-systems/movielens.md | 95 +++++++++++++++++++++++-
 1 file changed, 92 insertions(+), 3 deletions(-)

diff --git a/chapter_recommender-systems/movielens.md b/chapter_recommender-systems/movielens.md
index 95d3e0fa8f..fb8e971242 100644
--- a/chapter_recommender-systems/movielens.md
+++ b/chapter_recommender-systems/movielens.md
@@ -18,10 +18,18 @@ import os
 import pandas as pd
 ```
 
+```{.python .input}
+#@tab pytorch
+from d2l import torch as d2l
+import torch
+import os
+import pandas as pd
+```
+
 Then, we download the MovieLens 100k dataset and load the interactions as `DataFrame`.
 
 ```{.python .input n=2}
-#@tab mxnet
+#@tab all
 #@save
 d2l.DATA_HUB['ml-100k'] = (
     'https://files.grouplens.org/datasets/movielens/ml-100k.zip',
@@ -43,7 +51,7 @@ def read_data_ml100k():
 Let's load up the data and inspect the first five records manually. It is an effective way to learn the data structure and verify that they have been loaded properly.
 
 ```{.python .input n=3}
-#@tab mxnet
+#@tab all
 data, num_users, num_items = read_data_ml100k()
 sparsity = 1 - len(data) / (num_users * num_items)
 print(f'number of users: {num_users}, number of items: {num_items}')
@@ -56,7 +64,7 @@ We can see that each line consists of four columns, including "user id" 1-943, "
 We then plot the distribution of the count of different ratings. As expected, it appears to be a normal distribution, with most ratings centered at 3-4.
 
```{.python .input n=4} -#@tab mxnet +#@tab all d2l.plt.hist(data['rating'], bins=5, ec='black') d2l.plt.xlabel('Rating') d2l.plt.ylabel('Count') @@ -95,6 +103,33 @@ def split_data_ml100k(data, num_users, num_items, return train_data, test_data ``` +```{.python .input} +#@tab pytorch +#@save +def split_data_ml100k(data, num_users, num_items, + split_mode='random', test_ratio=0.1): + """Split the dataset in random mode or seq-aware mode.""" + if split_mode == 'seq-aware': + train_items, test_items, train_list = {}, {}, [] + for line in data.itertuples(): + u, i, rating, time = line[1], line[2], line[3], line[4] + train_items.setdefault(u, []).append((u, i, rating, time)) + if u not in test_items or test_items[u][-1] < time: + test_items[u] = (i, rating, time) + for u in range(1, num_users + 1): + train_list.extend(sorted(train_items[u], key=lambda k: k[3])) + test_data = [(key, *value) for key, value in test_items.items()] + train_data = [item for item in train_list if item not in test_data] + train_data = pd.DataFrame(train_data) + test_data = pd.DataFrame(test_data) + else: + mask = [True if x == 1 else False for x in torch.rand( + (len(data))) < 1 - test_ratio] + neg_mask = [not x for x in mask] + train_data, test_data = data[mask], data[neg_mask] + return train_data, test_data +``` + Note that it is good practice to use a validation set in practice, apart from only a test set. However, we omit that for the sake of brevity. In this case, our test set can be regarded as our held-out validation set. 
## Loading the data @@ -120,6 +155,25 @@ def load_data_ml100k(data, num_users, num_items, feedback='explicit'): return users, items, scores, inter ``` +```{.python .input} +#@tab pytorch +#@save +def load_data_ml100k(data, num_users, num_items, feedback='explicit'): + users, items, scores = [], [], [] + inter = torch.zeros((num_items, num_users)) if feedback == 'explicit' else {} + for line in data.itertuples(): + user_index, item_index = int(line[1] - 1), int(line[2] - 1) + score = int(line[3]) if feedback == 'explicit' else 1 + users.append(user_index) + items.append(item_index) + scores.append(score) + if feedback == 'implicit': + inter.setdefault(user_index, []).append(item_index) + else: + inter[item_index, user_index] = score + return users, items, scores, inter +``` + Afterwards, we put the above steps together and it will be used in the next section. The results are wrapped with `Dataset` and `DataLoader`. Note that the `last_batch` of `DataLoader` for training data is set to the `rollover` mode (The remaining samples are rolled over to the next epoch.) and orders are shuffled. 
```{.python .input n=7} @@ -146,6 +200,41 @@ def split_and_load_ml100k(split_mode='seq-aware', feedback='explicit', return num_users, num_items, train_iter, test_iter ``` +```{.python .input} +#@tab pytorch +#@save +def split_and_load_ml100k(split_mode='seq-aware', feedback='explicit', + test_ratio=0.1, batch_size=256): + data, num_users, num_items = read_data_ml100k() + train_data, test_data = split_data_ml100k( + data, num_users, num_items, split_mode, test_ratio) + train_u, train_i, train_r, _ = load_data_ml100k( + train_data, num_users, num_items, feedback) + test_u, test_i, test_r, _ = load_data_ml100k( + test_data, num_users, num_items, feedback) + + class ML100KDataset(torch.utils.data.Dataset): + def __init__(self, users, items, ratings): + assert len(users) == len(items) == len(ratings) + self.users = users + self.items = items + self.ratings = ratings + + def __getitem__(self, index): + return (self.users[index], self.items[index], + self.ratings[index]) + + def __len__(self): + return len(self.users) + + train_set = ML100KDataset(train_u, train_i, train_r) + test_set = ML100KDataset(test_u, test_i, test_r) + train_iter = torch.utils.data.DataLoader(train_set, batch_size, shuffle=True, + drop_last=True) + test_iter = torch.utils.data.DataLoader(test_set, batch_size) + return num_users, num_items, train_iter, test_iter +``` + ## Summary * MovieLens datasets are widely used for recommendation research. It is public available and free to use.