From 8a8ab92d1b4ea99e4e10129aeae7bed1467c4e64 Mon Sep 17 00:00:00 2001 From: Erfan Date: Fri, 18 Aug 2023 16:55:51 -0700 Subject: [PATCH] Add Myket Android Application Install Dataset --- README.md | 4 +++ conversion_tools/src/extended_dataset.py | 37 ++++++++++++++++++++++ conversion_tools/src/utils.py | 3 +- conversion_tools/usage/Myket.md | 40 ++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 conversion_tools/usage/Myket.md diff --git a/README.md b/README.md index 7aeec52..4d0866d 100644 --- a/README.md +++ b/README.md @@ -174,6 +174,9 @@ These datasets contain measurements of clothing fit from ModCloth. * [RentTheRunway](https://cseweb.ucsd.edu/~jmcauley/datasets.html#clothing_fit): These datasets contain measurements of clothing fit from [RentTheRunway](https://www.renttherunway.com). +### Android Applications +- [Myket](https://github.com/erfanloghmani/myket-android-application-market-dataset): +This dataset contains information on application install interactions of users in the [Myket](https://myket.ir/) Android application market. The dataset contains 694,121 install interactions for 10,000 anonymized users and 7,988 applications. It also has application features like an approximate number of installs, average ratings, and category. ## Datasets information statistics @@ -215,6 +218,7 @@ These datasets contain measurements of clothing fit from [RentTheRunway](https:/ | 32 | RateBeer | 29,265 | 110,369 | 2,924,163 | 99\.9095% | Overall Rating
\[0,20\] | √ | | √ | √ | | 33 | RentTheRunway | 105,571 | 5,850 | 192,544 | 99\.9688% | Rating
\[0,10\] | √ | √ | √ | √ | | 34 | [Twitch](https://github.com/RUCAIBox/RecommenderSystems-Datasets/tree/master/dataset_info/Twitch) | 15,524,309 | 6,161,666 | 474,676,929 | 99\.9995% | Click | | | | √ | +| 35 | [Myket](https://github.com/erfanloghmani/myket-android-application-market-dataset) | 10,000 | 7,988 | 694,121 | 99\.1312% | Install | √ | | √ | | ### CTR Datasets diff --git a/conversion_tools/src/extended_dataset.py b/conversion_tools/src/extended_dataset.py index 2fdabd5..a8f9a9c 100644 --- a/conversion_tools/src/extended_dataset.py +++ b/conversion_tools/src/extended_dataset.py @@ -1857,6 +1857,43 @@ def load_inter_data(self): return pd.read_csv(self.inter_file, delimiter=self.sep, header=None, engine='python') +class MyketDataset(BaseDataset): + def __init__(self, input_path, output_path): + super(MyketDataset, self).__init__(input_path, output_path) + self.dataset_name = 'myket' + + # input path + self.inter_file = os.path.join(self.input_path, 'myket.csv') + self.item_file = os.path.join(self.input_path, 'app_info_sample.csv') + + self.sep = ',' + + # output path + self.output_inter_file, self.output_item_file, self.output_user_file = self.get_output_files() + + # selected feature fields + self.inter_fields = { + 0: 'user_id:token', + 1: 'item_id:token', + 2: 'timestamp:float', + } + + self.item_fields = { + 0: 'item_id:token', + 1: 'installs:float', + 2: 'rating:float', + 3: 'rating_count:float', + 5: 'category:token_seq', + } + + def load_inter_data(self): + return pd.read_csv(self.inter_file, delimiter=self.sep, engine='python', index_col=False) + + def load_item_data(self): + return pd.read_csv(self.item_file, delimiter=self.sep, engine='python', index_col=False) + + + class JESTERDataset(BaseDataset): def __init__(self, input_path, output_path): super(JESTERDataset, self).__init__(input_path, output_path) diff --git a/conversion_tools/src/utils.py b/conversion_tools/src/utils.py index ea4882c..8729aa4 100644 --- a/conversion_tools/src/utils.py +++ 
b/conversion_tools/src/utils.py @@ -63,7 +63,8 @@ 'mind_large_dev': 'MINDLargeDevDataset', 'mind_small_train': 'MINDSmallTrainDataset', 'mind_small_dev': 'MINDSmallDevDataset', - 'cosmetics': 'CosmeticsDataset' + 'cosmetics': 'CosmeticsDataset', + 'myket': 'MyketDataset', } click_dataset = { diff --git a/conversion_tools/usage/Myket.md b/conversion_tools/usage/Myket.md new file mode 100644 index 0000000..e6d1f80 --- /dev/null +++ b/conversion_tools/usage/Myket.md @@ -0,0 +1,40 @@ +# Myket + +1.Clone the repository and install requirements. +(If you have already done this, please move to step 2.) + +``` +git clone https://github.com/RUCAIBox/RecDatasets + +cd RecDatasets/conversion_tools + +pip install -r requirements.txt +``` + +2.Download the Myket Dataset and move the dataset files. +(If you have already done this, please move to step 3.) + +``` +wget https://raw.githubusercontent.com/erfanloghmani/myket-android-application-market-dataset/main/myket.csv +wget https://raw.githubusercontent.com/erfanloghmani/myket-android-application-market-dataset/main/app_info_sample.csv + +mkdir myket-data + +mv myket.csv ./myket-data/ +mv app_info_sample.csv ./myket-data/ +``` + +3.Go to the ``conversion_tools/`` directory +and run the following command to get the atomic files of the Myket dataset. + +``` +python run.py --dataset myket \ +--input_path myket-data --output_path output_data/myket-data \ +--convert_inter --convert_item +``` + +`input_path` is the path of the directory containing the downloaded Myket files + +`output_path` is the path to store converted atomic files + + `convert_inter`, `convert_item`: Myket can be converted to 'myket.inter' and 'myket.item' atomic files.