From 54c97fd300c2615eb53fcfd4fdc7879b98f19703 Mon Sep 17 00:00:00 2001 From: iggy Date: Thu, 19 Jan 2023 14:45:27 +0900 Subject: [PATCH] . --- ... for Buffalo's input data-checkpoint.ipynb | 313 +++++++++ .../1. Matrix Factorization-checkpoint.ipynb | 647 ++++++++++++++++++ .../2. Cofactor-checkpoint.ipynb | 401 +++++++++++ .../3. skip-gram-checkpoint.ipynb | 341 +++++++++ ...ce comparison over models-checkpoint.ipynb | 236 +++++++ .../5. KakaoBrunch12M-checkpoint.ipynb} | 12 +- ...a Transform for Buffalo's input data.ipynb | 8 +- .../1. Matrix Factorization.ipynb | 163 +++-- examples/jupyter-examples/2. Cofactor.ipynb | 145 ++-- examples/jupyter-examples/3. skip-gram.ipynb | 60 +- .... Performance comparison over models.ipynb | 55 +- 11 files changed, 2164 insertions(+), 217 deletions(-) create mode 100644 examples/jupyter-examples/.ipynb_checkpoints/0. Data Transform for Buffalo's input data-checkpoint.ipynb create mode 100644 examples/jupyter-examples/.ipynb_checkpoints/1. Matrix Factorization-checkpoint.ipynb create mode 100644 examples/jupyter-examples/.ipynb_checkpoints/2. Cofactor-checkpoint.ipynb create mode 100644 examples/jupyter-examples/.ipynb_checkpoints/3. skip-gram-checkpoint.ipynb create mode 100644 examples/jupyter-examples/.ipynb_checkpoints/4. Performance comparison over models-checkpoint.ipynb rename examples/jupyter-examples/{5. KakaoBrunch12M.ipynb => .ipynb_checkpoints/5. KakaoBrunch12M-checkpoint.ipynb} (96%) diff --git a/examples/jupyter-examples/.ipynb_checkpoints/0. Data Transform for Buffalo's input data-checkpoint.ipynb b/examples/jupyter-examples/.ipynb_checkpoints/0. Data Transform for Buffalo's input data-checkpoint.ipynb new file mode 100644 index 0000000..da63bd3 --- /dev/null +++ b/examples/jupyter-examples/.ipynb_checkpoints/0. 
Data Transform for Buffalo's input data-checkpoint.ipynb @@ -0,0 +1,313 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download the MovieLens 1M dataset (https://grouplens.org/datasets/movielens/)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: ml-1m.zip\n", + " creating: data/ml-1m/\n", + " inflating: data/ml-1m/movies.dat \n", + " inflating: data/ml-1m/ratings.dat \n", + " inflating: data/ml-1m/README \n", + " inflating: data/ml-1m/users.dat \n" + ] + } + ], + "source": [ + "!mkdir data\n", + "!wget -q http://www.grouplens.org/system/files/ml-1m.zip\n", + "!unzip -o ml-1m -d data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transform the ml-1m dataset into Matrix Market format" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are not familiar with the MM (Matrix Market) format, refer to [this](http://networkrepository.com/mtx-matrix-market-format.html)\n", + "\n", + "To learn more about how Buffalo handles data, check the [documentation on Buffalo's database](https://buffalo-recsys.readthedocs.io/en/latest/intro.html#database)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.io import mmwrite\n", + "from scipy.io import mmread\n", + "from scipy.sparse import csr_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "ratings = pd.read_csv(\"data/ml-1m/ratings.dat\", header=None, sep=\"::\", engine='python')\n", + "ratings.columns = [\"uid\", \"iid\", \"rating\", \"timestamp\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "movies = pd.read_csv('data/ml-1m/movies.dat', header=None, sep=\"::\", engine='python', encoding='latin-1')\n", + "movies.columns = ['iid', 'movie_name', 'genre']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Buffalo iids do not support strings that contain spaces or non-ASCII (UTF-8) characters.\n", + "\n", + "Therefore, we have to replace spaces and drop non-ASCII text."
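, + "\n", + "\n", + "For example, `Toy Story (1995)` becomes `Toy_Story_(1995)`."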
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_moviename(movie_name):\n", + " return movie_name.replace(' ', '_').encode('utf-8').decode('ascii', 'ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "iid_to_movie_name = dict(zip(movies.iid.tolist(), movies.movie_name.tolist()))\n", + "iid_to_movie_name = {iid: parse_moviename(movie_name) for (iid, movie_name) in iid_to_movie_name.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "uid_to_idx = {uid: idx for (idx, uid) in enumerate(ratings.uid.unique().tolist())}\n", + "iid_to_idx = {iid: idx for (idx, iid) in enumerate(ratings.iid.unique().tolist())}\n", + "idx_to_movie_name = {idx: iid_to_movie_name[iid] for (iid, idx) in iid_to_idx.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Examples of movie names\n", + "\n", + "[index 30] movie_name: Antz_(1998)\n", + "[index 31] movie_name: Girl,_Interrupted_(1999)\n", + "[index 32] movie_name: Hercules_(1997)\n", + "[index 33] movie_name: Aladdin_(1992)\n", + "[index 34] movie_name: Mulan_(1998)\n" + ] + } + ], + "source": [ + "print(\"Examples of movie names\\n\")\n", + "\n", + "for i in range(30, 35):\n", + " print(\"[index %d] movie_name: %s\" % (i, idx_to_movie_name[i]))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "row, col, dat = ratings.uid.tolist(), ratings.iid.tolist(), ratings.rating.tolist()\n", + "row = [uid_to_idx[r] for r in row]\n", + "col = [iid_to_idx[c] for c in col]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "train_matrix = csr_matrix((dat, (row,col)), shape=(1 + np.max(row), 1 + np.max(col)))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(6040, 3706)\n" + ] + } + ], + "source": [ + "print(train_matrix.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### To write the csr matrix in Matrix Market format, we use mmwrite (matrix market write)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "mmwrite('data/ml-1m/main', train_matrix)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"data/ml-1m/uid\", \"w\") as f:\n", + " for uid in uid_to_idx:\n", + " print(uid, file=f)\n", + "\n", + "with open(\"data/ml-1m/iid\", \"w\") as f:\n", + " for idx, movie_name in idx_to_movie_name.items():\n", + " print(movie_name, file=f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transform the ml-1m dataset into Stream format" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Stream file format used in Buffalo contains one line per user, with a space as the delimiter.\n", + "\n", + "Each line is the ordered list of items that the user interacted with (ordered by time).\n", + "\n", + "This is useful when the order between interactions matters (e.g., word2vec, Cofactor).\n", + "\n", + "See `2. Cofactor` or `3. skip-gram` for cases where Stream format data is used.\n", + "\n", + "To learn more about the Stream format, check the [documentation on Buffalo's database](https://buffalo-recsys.readthedocs.io/en/latest/intro.html#database)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "ratings_as_list = ratings.sort_values(by='timestamp').groupby('uid').iid.apply(list).reset_index()\n", + "uid = ratings_as_list.uid.tolist()\n", + "seen_iids = ratings_as_list.iid.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "seen_iids = [' '.join([iid_to_movie_name[iid] for iid in iids]) for iids in seen_iids]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Girl,_Interrupted_(1999) Titanic_(1997) Back_to_the_Future_(1985) Cinderella_(1950) Meet_Joe_Black_(1998) Last_Days_of_Disco,_The_(1998) Erin_Brockovich_(2000) To_Kill_a_Mockingbird_(1962) Christmas_Story,_A_(1983) Star_Wars:_Episode_IV_-_A_New_Hope_(1977) Wallace_&_Gromit:_The_Best_of_Aardman_Animation_(1996) One_Flew_Over_the_Cuckoo's_Nest_(1975) Wizard_of_Oz,_The_(1939) Fargo_(1996) Run_Lola_Run_(Lola_rennt)_(1998) Rain_Man_(1988) Saving_Private_Ryan_(1998) Awakenings_(1990) Gigi_(1958) Sound_of_Music,_The_(1965) Driving_Miss_Daisy_(1989) Mary_Poppins_(1964) Bambi_(1942) Apollo_13_(1995) E.T._the_Extra-Terrestrial_(1982) My_Fair_Lady_(1964) Ben-Hur_(1959) Big_(1988) Dead_Poets_Society_(1989) Sixth_Sense,_The_(1999) James_and_the_Giant_Peach_(1996) Ferris_Bueller's_Day_Off_(1986) Secret_Garden,_The_(1993) Toy_Story_2_(1999) Airplane!_(1980) Dumbo_(1941) Pleasantville_(1998) Princess_Bride,_The_(1987) Snow_White_and_the_Seven_Dwarfs_(1937) Miracle_on_34th_Street_(1947) Ponette_(1996) Schindler's_List_(1993) Close_Shave,_A_(1995) Beauty_and_the_Beast_(1991) Aladdin_(1992) Toy_Story_(1995) Tarzan_(1999) Hunchback_of_Notre_Dame,_The_(1996) Antz_(1998) Bug's_Life,_A_(1998) Mulan_(1998) Hercules_(1997) Pocahontas_(1995)\n" + ] + } + ], + "source": [ + "print(seen_iids[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"data/ml-1m/stream\", \"w\") as f:\n", + " for iid_list in seen_iids:\n", + " print(iid_list, file=f)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/jupyter-examples/.ipynb_checkpoints/1. Matrix Factorization-checkpoint.ipynb b/examples/jupyter-examples/.ipynb_checkpoints/1. Matrix Factorization-checkpoint.ipynb new file mode 100644 index 0000000..762e2da --- /dev/null +++ b/examples/jupyter-examples/.ipynb_checkpoints/1. Matrix Factorization-checkpoint.ipynb @@ -0,0 +1,647 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Matrix Factorization\n", + "In this notebook, we show how to run [ALS](http://yifanhu.net/PUB/cf.pdf) and [BPR-MF](https://arxiv.org/pdf/1205.2618.pdf) with Buffalo" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import buffalo\n", + "from buffalo import ALS, BPRMF\n", + "from buffalo import aux, log\n", + "from buffalo import ALSOption, BPRMFOption\n", + "from buffalo import MatrixMarketOptions\n", + "log.set_log_level(1) # set log level 3 or higher to check more information" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_TO_USE = \"ALS\"\n", + "# MODEL_TO_USE = \"BPR\"# un-comment this if you want to use BPR" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### To run a Buffalo model, you have to set two options.\n", + " - model option\n", + " - data option" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model Option" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "if MODEL_TO_USE == \"ALS\":\n", + " opt = ALSOption().get_default_option() \n", + "elif MODEL_TO_USE == \"BPR\":\n", + " opt = BPRMFOption().get_default_option()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may change other option values like this:\n", + "```\n", + " opt.key = val\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For example, one can set validation options." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "opt.evaluation_on_learning = True\n", + "opt.validation = aux.Option({'topk': 10})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`opt.validation = aux.Option({'topk': 10})` means we evaluate the model on the validation data with top-10 metrics\n", + "\n", + "`opt.evaluation_on_learning = True` makes the Buffalo model run evaluation during training\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Options are shown below" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'evaluation_on_learning': True,\n", + " 'compute_loss_on_training': True,\n", + " 'early_stopping_rounds': 0,\n", + " 'save_best': False,\n", + " 'evaluation_period': 1,\n", + " 'save_period': 10,\n", + " 'random_seed': 0,\n", + " 'validation': {'topk': 10},\n", + " 'adaptive_reg': False,\n", + " 'save_factors': False,\n", + " 'accelerator': False,\n", + " 'd': 20,\n", + " 'num_iters': 10,\n", + " 'num_workers': 1,\n", + " 'hyper_threads': 256,\n", + " 'num_cg_max_iters': 3,\n", + " 'reg_u': 0.1,\n", + " 'reg_i': 0.1,\n", + " 'alpha': 8,\n", + " 'optimizer': 'manual_cg',\n", + " 'cg_tolerance': 1e-10,\n", + " 'block_size': 32,\n", + " 'eps': 1e-10,\n", + " 'model_path': '',\n", + " 'data_opt': {}}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For a full description of the options, see `AlgoOption`, `ALSOption`, and `BPRMFOption` in `buffalo/algo/options.py`\n", + "\n", + "Note that each type of model has its own set of options" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Option" + ] + }, + { + "cell_type": "code", +
"execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "data_opt = MatrixMarketOptions().get_default_option()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similar to model option, data option can be set in this way.\n", + "```\n", + " data_opt.key = val\n", + "```\n", + "\n", + "\n", + "You must set `data_opt.input.main` option.\n", + "\n", + "This should be the path of input data(matrix market or stream)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "data_opt.input.main = 'data/ml-1m/main.mtx'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Additionally, we can set list of itemids, and list of userids also\n", + "\n", + "By doing so, you can query similar users/items or recommendations by itemids or userids." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "data_opt.input.iid = 'data/ml-1m/iid'\n", + "data_opt.input.uid = 'data/ml-1m/uid'" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'type': 'matrix_market',\n", + " 'input': {'main': 'data/ml-1m/main.mtx',\n", + " 'uid': 'data/ml-1m/uid',\n", + " 'iid': 'data/ml-1m/iid'},\n", + " 'data': {'internal_data_type': 'matrix',\n", + " 'validation': {'name': 'sample', 'p': 0.01, 'max_samples': 500},\n", + " 'batch_mb': 1024,\n", + " 'use_cache': False,\n", + " 'tmp_dir': '/tmp/',\n", + " 'path': './mm.h5py',\n", + " 'disk_based': False}}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_opt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Open Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### You can open data in two ways\n", + "- open data when initializing model\n", + "- open data directly\n", + "\n", + "There is no difference" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### open data when initializing model" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "model = ALS(ALSOption().get_default_option(), data_opt=data_opt)\n", + "del model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "it opens the data when loading model (indirect way)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### open data directly" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import buffalo" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = buffalo.data.load(data_opt)\n", + "data.create()\n", + "model = ALS(ALSOption().get_default_option(), data=data)\n", + "del data\n", + "del model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "it opens data dirctly, and passes the opened data to the model\n", + "afterwards, we will use opened data " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "data = buffalo.data.load(data_opt)\n", + "data.create()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "if MODEL_TO_USE == \"ALS\":\n", + " model = ALS(opt, 
data=data)\n", + "elif MODEL_TO_USE == \"BPR\":\n", + " model = BPRMF(opt, data=data)\n", + "model.initialize()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "val_res = model.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'train_loss': 0.2804447780030053,\n", + " 'val_ndcg': 0.053509737512824056,\n", + " 'val_map': 0.036605582307829496,\n", + " 'val_accuracy': 0.10280898876404494,\n", + " 'val_auc': 0.5500847197037205,\n", + " 'val_rmse': 2.9060066759494854,\n", + " 'val_error': 2.713486196756363}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "val_res" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Saving and Loading model" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mkdir: model: File exists\r\n" + ] + } + ], + "source": [ + "!mkdir model" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "\n", + "model.save(\"model/model-ml-1m\")\n", + "del model\n", + "if MODEL_TO_USE == \"ALS\":\n", + " model = ALS()\n", + "elif MODEL_TO_USE == \"BPR\":\n", + " model = BPRMF()\n", + "model.load(\"model/model-ml-1m\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Recommendation for users" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "for user 61, recommendations are \n", + "items ['Rules_of_Engagement_(2000)', 'Remember_the_Titans_(2000)', 'Skulls,_The_(2000)'].\n", + "\n", + "for user 62, recommendations are \n", + "items ['Midnight_in_the_Garden_of_Good_and_Evil_(1997)', 'Bonnie_and_Clyde_(1967)', 'Coming_Home_(1978)'].\n", + "\n", + "for user 63, recommendations are \n", + "items ['Eyes_Wide_Shut_(1999)', 'Summer_of_Sam_(1999)', 'Go_(1999)'].\n", + "\n", + "for user 64, recommendations are \n", + "items ['Jurassic_Park_(1993)', 'Braveheart_(1995)', 'Star_Wars:_Episode_VI_-_Return_of_the_Jedi_(1983)'].\n", + "\n", + "for user 65, recommendations are \n", + "items ['Air_Force_One_(1997)', 'Patriot,_The_(2000)', 'Backdraft_(1991)'].\n", + "\n", + "for user 66, recommendations are \n", + "items ['American_Beauty_(1999)', 'Star_Wars:_Episode_VI_-_Return_of_the_Jedi_(1983)', 'Braveheart_(1995)'].\n", + "\n", + "for user 67, recommendations are \n", + "items ['12_Angry_Men_(1957)', 'Grapes_of_Wrath,_The_(1940)', 'Bridge_on_the_River_Kwai,_The_(1957)'].\n", + "\n", + "for user 68, recommendations are \n", + "items ['Wrong_Trousers,_The_(1993)', 'Close_Shave,_A_(1995)', 'Grand_Day_Out,_A_(1992)'].\n", + "\n", + "for user 69, recommendations are \n", + "items ['Dead_Man_Walking_(1995)', 'Hamlet_(1996)', 'Malcolm_X_(1992)'].\n", + "\n" + ] + } + ], + "source": [ + "uids = [str(x) for x in range(61, 70)]\n", + "recommendation_result = model.topk_recommendation(uids, topk=3)\n", + "for uid, iids in recommendation_result.items():\n", + " print(f\"for user {uid}, recommendations are \", f\"\\nitems {iids}.\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Recommendation for users in given pools" + ] + }, + { + "cell_type": "code", + 
"execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "for user 1, recommendations are \n", + "items ['Shanghai_Noon_(2000)', 'Frequency_(2000)', 'Remember_the_Titans_(2000)'].\n", + "\n", + "for user 2, recommendations are \n", + "items ['Remember_the_Titans_(2000)', 'Rules_of_Engagement_(2000)', 'Frequency_(2000)'].\n", + "\n", + "for user 3, recommendations are \n", + "items ['Shanghai_Noon_(2000)', 'Frequency_(2000)', 'Remember_the_Titans_(2000)'].\n", + "\n", + "for user 4, recommendations are \n", + "items ['Shanghai_Noon_(2000)', 'Frequency_(2000)', 'Gone_in_60_Seconds_(2000)'].\n", + "\n" + ] + } + ], + "source": [ + "pool = ['Rules_of_Engagement_(2000)', \n", + " 'Remember_the_Titans_(2000)', \n", + " 'Skulls,_The_(2000)', \n", + " '28_Days_(2000)', \n", + " 'Frequency_(2000)', \n", + " 'Gone_in_60_Seconds_(2000)', \n", + " 'What_Lies_Beneath_(2000)', \n", + " 'Reindeer_Games_(2000)', \n", + " 'Final_Destination_(2000)', \n", + " 'Shanghai_Noon_(2000)']\n", + "uids = [str(x) for x in range(5)]\n", + "recommendation_result = model.topk_recommendation(uids, topk=3, pool=pool)\n", + "for uid, iids in recommendation_result.items():\n", + " print(f\"for user {uid}, recommendations are \", f\"\\nitems {iids}.\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recommendation results are chosen among items in given pool" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find Most similar items" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similar movies to Toy_Story_2_(1999)\n", + "01. 0.958 Toy_Story_(1995)\n", + "02. 0.957 Bug's_Life,_A_(1998)\n", + "03. 0.949 Shakespeare_in_Love_(1998)\n", + "04. 0.945 Being_John_Malkovich_(1999)\n", + "05. 0.935 Sixth_Sense,_The_(1999)\n" + ] + } + ], + "source": [ + "print('Similar movies to Toy_Story_2_(1999)')\n", + "similar_items = model.most_similar('Toy_Story_2_(1999)', 5)\n", + "for rank, (movie_name, score) in enumerate(similar_items):\n", + " print(f'{rank + 1:02d}. {score:.3f} {movie_name}')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find Most similar items given pool" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "01. 0.467 Shanghai_Noon_(2000)\n", + "02. 0.435 Frequency_(2000)\n", + "03. 0.354 Gone_in_60_Seconds_(2000)\n", + "04. 0.320 28_Days_(2000)\n", + "05. 0.259 What_Lies_Beneath_(2000)\n", + "06. 0.186 Final_Destination_(2000)\n" + ] + } + ], + "source": [ + "pool = ['Rules_of_Engagement_(2000)', \n", + " 'Remember_the_Titans_(2000)', \n", + " 'Skulls,_The_(2000)', \n", + " '28_Days_(2000)', \n", + " 'Frequency_(2000)', \n", + " 'Gone_in_60_Seconds_(2000)', \n", + " 'What_Lies_Beneath_(2000)', \n", + " 'Reindeer_Games_(2000)', \n", + " 'Final_Destination_(2000)', \n", + " 'Shanghai_Noon_(2000)']\n", + "similar_items = model.most_similar('Toy_Story_2_(1999)', 5, pool=pool)\n", + "for rank, (movie_name, score) in enumerate(similar_items):\n", + " print(f'{rank + 1:02d}. 
{score:.3f} {movie_name}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "vscode": { + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/jupyter-examples/.ipynb_checkpoints/2. Cofactor-checkpoint.ipynb b/examples/jupyter-examples/.ipynb_checkpoints/2. Cofactor-checkpoint.ipynb new file mode 100644 index 0000000..75846f0 --- /dev/null +++ b/examples/jupyter-examples/.ipynb_checkpoints/2. Cofactor-checkpoint.ipynb @@ -0,0 +1,401 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cofactor\n", + "\n", + "Liang's extension of the Alternating Least Squares algorithm. [Factorization Meets the Item Embedding: Regularizing Matrix Factorization with Item Co-occurrence](https://dl.acm.org/doi/10.1145/2959100.2959182)\n", + "\n", + "It co-factorizes both the user-item interaction matrix and the SPPMI matrix (a kind of item-item co-occurrence matrix) with a shared item matrix. The paper claims that the two matrices reveal different information, so exploiting both of them is helpful." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from buffalo import CFR, CFROption, StreamOptions\n", + "from buffalo import aux, log" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'evaluation_on_learning': True,\n", + " 'compute_loss_on_training': True,\n", + " 'early_stopping_rounds': 0,\n", + " 'save_best': False,\n", + " 'evaluation_period': 1,\n", + " 'save_period': 10,\n", + " 'random_seed': 0,\n", + " 'validation': {},\n", + " 'save_factors': False,\n", + " 'd': 20,\n", + " 'num_iters': 10,\n", + " 'num_workers': 1,\n", + " 'num_cg_max_iters': 3,\n", + " 'cg_tolerance': 1e-10,\n", + " 'eps': 1e-10,\n", + " 'reg_u': 0.1,\n", + " 'reg_i': 0.1,\n", + " 'reg_c': 0.1,\n", + " 'alpha': 8.0,\n", + " 'l': 1.0,\n", + " 'optimizer': 'manual_cg',\n", + " 'model_path': '',\n", + " 'data_opt': {}}" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opt = CFROption().get_default_option() # initialize default Cofactor option\n", + "opt # Check buffalo/algo/options.py to see further."
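, + "\n", + "# As with the other models, individual option fields can be overridden\n", + "# before training, e.g. (illustrative values):\n", + "# opt.num_workers = 4\n", + "# opt.reg_u = 0.06"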
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "data_opt = StreamOptions().get_default_option()\n", + "data_opt.data.sppmi = {\"windows\": 5, \"k\": 10}\n", + "data_opt.input.main = 'data/ml-1m/stream'\n", + "data_opt.input.uid = 'data/ml-1m/uid'\n", + "data_opt.input.iid = 'data/ml-1m/iid'\n", + "data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})\n", + "data_opt.data.path = './2-cfr.h5py'\n", + "data_opt.data.internal_data_type = 'matrix'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO ] 2023-01-19 14:15:45 [stream.py:279] Create database from stream data\n", + "[INFO ] 2023-01-19 14:15:45 [stream.py:103] gathering itemids from data/ml-1m/stream...\n", + "[INFO ] 2023-01-19 14:15:45 [stream.py:127] Found 3706 unique itemids\n", + "[INFO ] 2023-01-19 14:15:45 [stream.py:288] Creating working data...\n", + "[INFO ] 2023-01-19 14:15:47 [stream.py:296] Building data part...\n", + "[INFO ] 2023-01-19 14:15:47 [base.py:410] Building compressed triplets for rowwise...\n", + "[INFO ] 2023-01-19 14:15:47 [base.py:411] Preprocessing...\n", + "[INFO ] 2023-01-19 14:15:47 [base.py:414] In-memory Compressing ...\n", + "[INFO ] 2023-01-19 14:15:48 [base.py:294] Load triplet files. Total job files: 11\n", + "[INFO ] 2023-01-19 14:15:48 [base.py:444] Finished\n", + "[INFO ] 2023-01-19 14:15:48 [base.py:410] Building compressed triplets for colwise...\n", + "[INFO ] 2023-01-19 14:15:48 [base.py:411] Preprocessing...\n", + "[INFO ] 2023-01-19 14:15:48 [base.py:414] In-memory Compressing ...\n", + "[INFO ] 2023-01-19 14:15:48 [base.py:294] Load triplet files. 
Total job files: 11\n", + "[INFO ] 2023-01-19 14:15:48 [base.py:444] Finished\n", + "[INFO ] 2023-01-19 14:15:48 [stream.py:168] build sppmi (shift k: 10)\n", + "[INFO ] 2023-01-19 14:16:11 [stream.py:179] convert from /tmp/tmpub40v49r to /tmp/tmpwzynbnwf\n", + "[INFO ] 2023-01-19 14:16:11 [stream.py:182] sppmi nnz: 350162\n", + "[INFO ] 2023-01-19 14:16:11 [stream.py:186] Disk-based Compressing...\n", + "[INFO ] 2023-01-19 14:16:11 [base.py:339] Dividing into 20 chunks...\n", + "[INFO ] 2023-01-19 14:16:11 [base.py:349] Total job files: 20\n", + "[PROGRESS] 100.00% 0.0/0.0secs 2,290.91it/s\n", + "[INFO ] 2023-01-19 14:16:11 [stream.py:311] DB built on ./3-cfr.h5py\n", + "[INFO ] 2023-01-19 14:16:11 [cfr.py:59] CFR ({\n", + " \"evaluation_on_learning\": true,\n", + " \"compute_loss_on_training\": true,\n", + " \"early_stopping_rounds\": 0,\n", + " \"save_best\": false,\n", + " \"evaluation_period\": 1,\n", + " \"save_period\": 10,\n", + " \"random_seed\": 0,\n", + " \"validation\": {},\n", + " \"save_factors\": false,\n", + " \"d\": 20,\n", + " \"num_iters\": 10,\n", + " \"num_workers\": 1,\n", + " \"num_cg_max_iters\": 3,\n", + " \"cg_tolerance\": 1e-10,\n", + " \"eps\": 1e-10,\n", + " \"reg_u\": 0.1,\n", + " \"reg_i\": 0.1,\n", + " \"reg_c\": 0.1,\n", + " \"alpha\": 8.0,\n", + " \"l\": 1.0,\n", + " \"optimizer\": \"manual_cg\",\n", + " \"model_path\": \"\",\n", + " \"data_opt\": {}\n", + "})\n", + "[INFO ] 2023-01-19 14:16:11 [cfr.py:61] Stream Header(6040, 3706, 994169) Validation(6040 samples)\n" + ] + } + ], + "source": [ + "cofactor = CFR(opt, data_opt=data_opt)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "cofactor.initialize()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO ] 2023-01-19 14:16:15 [buffered_data.py:72] Set data buffer size as 67108864(minimum required batch size is 251).\n", + "[INFO ] 2023-01-19 14:16:15 [cfr.py:214] Iteration 1: Loss 0.000 Elapsed 0.098 secs\n", + "[INFO ] 2023-01-19 14:16:15 [cfr.py:214] Iteration 2: Loss 0.000 Elapsed 0.095 secs\n", + "[INFO ] 2023-01-19 14:16:15 [cfr.py:214] Iteration 3: Loss 0.000 Elapsed 0.093 secs\n", + "[INFO ] 2023-01-19 14:16:16 [cfr.py:214] Iteration 4: Loss 0.000 Elapsed 0.093 secs\n", + "[INFO ] 2023-01-19 14:16:16 [cfr.py:214] Iteration 5: Loss 0.000 Elapsed 0.091 secs\n", + "[INFO ] 2023-01-19 14:16:16 [cfr.py:214] Iteration 6: Loss 0.000 Elapsed 0.093 secs\n", + "[INFO ] 2023-01-19 14:16:16 [cfr.py:214] Iteration 7: Loss 0.000 Elapsed 0.095 secs\n", + "[INFO ] 2023-01-19 14:16:16 [cfr.py:214] Iteration 8: Loss 0.000 Elapsed 0.095 secs\n", + "[INFO ] 2023-01-19 14:16:16 [cfr.py:214] Iteration 9: Loss 0.000 Elapsed 0.092 secs\n", + "[INFO ] 2023-01-19 14:16:16 [cfr.py:214] Iteration 10: Loss 0.000 Elapsed 0.099 secs\n" + ] + }, + { + "data": { + "text/plain": [ + "{'train_loss': 0.0}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cofactor.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Recommendation for users" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "for user 61, recommendations are \n", + "items ['Patriot,_The_(2000)', 'Frequency_(2000)', 'Shanghai_Noon_(2000)'].\n", + "\n", + "for user 62, 
recommendations are \n", + "items ['2001:_A_Space_Odyssey_(1968)', 'Bonnie_and_Clyde_(1967)', 'Close_Encounters_of_the_Third_Kind_(1977)'].\n", + "\n", + "for user 63, recommendations are \n", + "items ['Blair_Witch_Project,_The_(1999)', 'Eyes_Wide_Shut_(1999)', 'Austin_Powers:_The_Spy_Who_Shagged_Me_(1999)'].\n", + "\n", + "for user 64, recommendations are \n", + "items ['Jurassic_Park_(1993)', 'Terminator_2:_Judgment_Day_(1991)', 'Star_Wars:_Episode_VI_-_Return_of_the_Jedi_(1983)'].\n", + "\n", + "for user 65, recommendations are \n", + "items ['Braveheart_(1995)', 'Saving_Private_Ryan_(1998)', 'Patriot,_The_(2000)'].\n", + "\n", + "for user 66, recommendations are \n", + "items ['Jurassic_Park_(1993)', 'Braveheart_(1995)', 'Patriot,_The_(2000)'].\n", + "\n", + "for user 67, recommendations are \n", + "items ['Bridge_on_the_River_Kwai,_The_(1957)', 'To_Kill_a_Mockingbird_(1962)', 'North_by_Northwest_(1959)'].\n", + "\n", + "for user 68, recommendations are \n", + "items ['Being_John_Malkovich_(1999)', 'American_Beauty_(1999)', 'Shakespeare_in_Love_(1998)'].\n", + "\n", + "for user 69, recommendations are \n", + "items ['Good_Will_Hunting_(1997)', 'Dead_Man_Walking_(1995)', 'Apollo_13_(1995)'].\n", + "\n" + ] + } + ], + "source": [ + "uids = [str(x) for x in range(61, 70)]\n", + "recommendation_result = cofactor.topk_recommendation(uids, topk=3)\n", + "for uid, iids in recommendation_result.items():\n", + " print(f\"for user {uid}, recommendations are \", f\"\\nitems {iids}.\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Recommendation for users in given pools" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "for user 1, recommendations are \n", + "items ['Frequency_(2000)', 'Remember_the_Titans_(2000)', 'Shanghai_Noon_(2000)'].\n", + "\n", + "for user 2, recommendations are \n", + "items ['Rules_of_Engagement_(2000)', 'Shanghai_Noon_(2000)', 'Remember_the_Titans_(2000)'].\n", + "\n", + "for user 3, recommendations are \n", + "items ['Shanghai_Noon_(2000)', '28_Days_(2000)', 'Gone_in_60_Seconds_(2000)'].\n", + "\n", + "for user 4, recommendations are \n", + "items ['Shanghai_Noon_(2000)', 'Skulls,_The_(2000)', 'Gone_in_60_Seconds_(2000)'].\n", + "\n" + ] + } + ], + "source": [ + "pool = ['Rules_of_Engagement_(2000)', \n", + " 'Remember_the_Titans_(2000)', \n", + " 'Skulls,_The_(2000)', \n", + " '28_Days_(2000)', \n", + " 'Frequency_(2000)', \n", + " 'Gone_in_60_Seconds_(2000)', \n", + " 'What_Lies_Beneath_(2000)', \n", + " 'Reindeer_Games_(2000)', \n", + " 'Final_Destination_(2000)', \n", + " 'Shanghai_Noon_(2000)']\n", + "uids = [str(x) for x in range(5)]\n", + "recommendation_result = cofactor.topk_recommendation(uids, topk=3, pool=pool)\n", + "for uid, iids in recommendation_result.items():\n", + " print(f\"for user {uid}, recommendations are \", f\"\\nitems {iids}.\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find Most similar items" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similar movies to Toy_Story_2_(1999) in similar items\n", + "[(\"Bug's_Life,_A_(1998)\", 0.9336268), ('Toy_Story_(1995)', 0.910489), ('Shakespeare_in_Love_(1998)', 0.8634493), ('Babe_(1995)', 0.8460558), ('Groundhog_Day_(1993)', 0.8172974), ('Being_John_Malkovich_(1999)', 0.80523443), ('Sixth_Sense,_The_(1999)', 
0.7978962), ('Galaxy_Quest_(1999)', 0.7975167), ('Election_(1999)', 0.79327524), ('South_Park:_Bigger,_Longer_and_Uncut_(1999)', 0.7679639)]\n", + "01. 0.934 Bug's_Life,_A_(1998)\n", + "02. 0.910 Toy_Story_(1995)\n", + "03. 0.863 Shakespeare_in_Love_(1998)\n", + "04. 0.846 Babe_(1995)\n", + "05. 0.817 Groundhog_Day_(1993)\n", + "06. 0.805 Being_John_Malkovich_(1999)\n", + "07. 0.798 Sixth_Sense,_The_(1999)\n", + "08. 0.798 Galaxy_Quest_(1999)\n", + "09. 0.793 Election_(1999)\n", + "10. 0.768 South_Park:_Bigger,_Longer_and_Uncut_(1999)\n" + ] + } + ], + "source": [ + "print('Similar movies to Toy_Story_2_(1999) in similar items')\n", + "similar_items = cofactor.most_similar('Toy_Story_2_(1999)', 10)\n", + "print(similar_items)\n", + "for rank, (movie_name, score) in enumerate(similar_items):\n", + " print(f'{rank + 1:02d}. {score:.3f} {movie_name}')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find Most similar items given pool" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "01. 0.371 Gone_in_60_Seconds_(2000)\n", + "02. 0.357 28_Days_(2000)\n", + "03. 0.335 Frequency_(2000)\n", + "04. 0.332 Shanghai_Noon_(2000)\n", + "05. 0.283 What_Lies_Beneath_(2000)\n", + "06. 0.231 Final_Destination_(2000)\n" + ] + } + ], + "source": [ + "pool = ['Rules_of_Engagement_(2000)', \n", + " 'Remember_the_Titans_(2000)', \n", + " 'Skulls,_The_(2000)', \n", + " '28_Days_(2000)', \n", + " 'Frequency_(2000)', \n", + " 'Gone_in_60_Seconds_(2000)', \n", + " 'What_Lies_Beneath_(2000)', \n", + " 'Reindeer_Games_(2000)', \n", + " 'Final_Destination_(2000)', \n", + " 'Shanghai_Noon_(2000)']\n", + "similar_items = cofactor.most_similar('Toy_Story_2_(1999)', 5, pool=pool)\n", + "for rank, (movie_name, score) in enumerate(similar_items):\n", + " print(f'{rank + 1:02d}. {score:.3f} {movie_name}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/jupyter-examples/.ipynb_checkpoints/3. skip-gram-checkpoint.ipynb b/examples/jupyter-examples/.ipynb_checkpoints/3. skip-gram-checkpoint.ipynb new file mode 100644 index 0000000..8a8eea8 --- /dev/null +++ b/examples/jupyter-examples/.ipynb_checkpoints/3. 
skip-gram-checkpoint.ipynb @@ -0,0 +1,341 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Buffalo's word2vec only supports the skip-gram algorithm with negative sampling (no hierarchical softmax)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from buffalo import W2V, W2VOption\n", + "from buffalo import StreamOptions\n", + "from buffalo import aux, log\n", + "log.set_log_level(1) # set log level 3 or higher to check more information" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'evaluation_on_learning': False,\n", + " 'compute_loss_on_training': True,\n", + " 'early_stopping_rounds': 0,\n", + " 'save_best': False,\n", + " 'evaluation_period': 1,\n", + " 'save_period': 10,\n", + " 'random_seed': 0,\n", + " 'validation': {},\n", + " 'num_workers': 8,\n", + " 'num_iters': 15,\n", + " 'd': 100,\n", + " 'window': 5,\n", + " 'min_count': 2,\n", + " 'sample': 0.001,\n", + " 'num_negative_samples': 5,\n", + " 'lr': 0.025,\n", + " 'min_lr': 0.0001,\n", + " 'model_path': '',\n", + " 'data_opt': {}}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opt = W2VOption().get_default_option() # initialize default Word2vec option\n", + "opt.num_iters = 15\n", + "opt.num_workers = 8\n", + "opt.d = 100\n", + "opt.min_count = 2\n", + "opt.num_negative_samples = 5\n", + "opt # Check buffalo/algo/options.py to see further." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "data_opt = StreamOptions().get_default_option()\n", + "data_opt.input.main = 'data/ml-1m/stream'\n", + "data_opt.input.iid = 'data/ml-1m/iid'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "w2v_model = W2V(opt, data_opt=data_opt)\n", + "w2v_model.initialize()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w2v_model.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Similar movies to Lion King" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similar movies to Lion_King,_The_(1994)\n", + "01. 0.792 Hunchback_of_Notre_Dame,_The_(1996)\n", + "02. 0.722 Mulan_(1998)\n", + "03. 0.721 Beauty_and_the_Beast_(1991)\n", + "04. 0.713 Hercules_(1997)\n", + "05. 0.688 Anastasia_(1997)\n" + ] + } + ], + "source": [ + "movie_name = \"Lion_King,_The_(1994)\"\n", + "print('Similar movies to', movie_name)\n", + "similar_items = w2v_model.most_similar(movie_name, 5)\n", + "for rank, (movie_name, score) in enumerate(similar_items):\n", + " print(f'{rank + 1:02d}. {score:.3f} {movie_name}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results are all Disney animations, as you might expect. word2vec works quite well in the recommendation domain."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Feature vectors of SF movies" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2 movie name: Star_Trek:_Generations_(1994) score: 0.7971603\n", + "3 movie name: Star_Trek_VI:_The_Undiscovered_Country_(1991) score: 0.7936294\n", + "4 movie name: Rocketeer,_The_(1991) score: 0.76695454\n", + "5 movie name: Star_Trek:_Insurrection_(1998) score: 0.71424824\n", + "6 movie name: Superman_II_(1980) score: 0.69101596\n", + "7 movie name: Star_Trek:_First_Contact_(1996) score: 0.6870025\n", + "8 movie name: Demolition_Man_(1993) score: 0.6859702\n", + "9 movie name: Deep_Impact_(1998) score: 0.67796385\n" + ] + } + ], + "source": [ + "sf_wv = w2v_model.get_weighted_feature(\n", + " {\n", + " 'Star_Wars:_Episode_IV_-_A_New_Hope_(1977)': 0.3,\n", + " 'Stargate_(1994)': 0.3,\n", + " 'Starship_Troopers_(1997)' : 0.3\n", + " }\n", + ")\n", + "movie_names_to_filter = [\n", + " 'Star_Wars:_Episode_IV_-_A_New_Hope_(1977)',\n", + " 'Stargate_(1994)',\n", + " 'Starship_Troopers_(1997)'\n", + "] \n", + "\n", + "for i, (movie_name, score) in enumerate(w2v_model.most_similar(sf_wv, 10)):\n", + " if movie_name in movie_names_to_filter:\n", + " continue\n", + " print(i, \"movie name:\", movie_name, \"score:\", score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Summing three SF movies, namely Star Wars, Stargate, and Starship Troopers, gives Star Trek!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Arithmetic among features" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "animation_wv = w2v_model.get_weighted_feature(\n", + " {\n", + " \"Bug's_Life,_A_(1998)\": 1,\n", + " 'Wallace_&_Gromit:_The_Best_of_Aardman_Animation_(1996)': 1,\n", + " 'Sleeping_Beauty_(1959)': 1,\n", + " 'Toy_Story_(1995)': 1,\n", + " 'South_Park:_Bigger,_Longer_and_Uncut_(1999)': 1,\n", + " 'Creature_Comforts_(1990)': 1,\n", + " 'Lion_King,_The_(1994)': 1,\n", + " 'Mulan_(1998)': 1\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we need to sum and normalize the vectors of the animation movies."
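, + "\n", + "\n", + "(Here we assume `get_weighted_feature` returns the L2-normalized weighted sum of the item vectors, roughly `v = sum(w_i * v_i) / norm(sum(w_i * v_i))`.)"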
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "drama_wv = w2v_model.get_weighted_feature(\n", + " {\n", + " 'Ben-Hur_(1959)': 0.3,\n", + " 'Kolya_(1996)' : 0.3,\n", + " 'Shall_We_Dance?_(Shall_We_Dansu?)_(1996)': 0.3\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we generate a weight vector for the drama genre by summing the vectors of drama movies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we combine the genre vectors; here we add the animation vector to the SF vector from the previous section to look for SF animations" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "wv = animation_wv + sf_wv" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 movie name: Metisse_(Caf_au_Lait)_(1993) score: 0.70666397\n", + "1 movie name: Stargate_(1994) score: 0.68651146\n", + "2 movie name: Mulan_(1998) score: 0.6690214\n", + "3 movie name: Callejn_de_los_milagros,_El_(1995) score: 0.65673596\n", + "4 movie name: Ghost_in_the_Shell_(Kokaku_kidotai)_(1995) score: 0.6557385\n", + "5 movie name: Star_Trek_VI:_The_Undiscovered_Country_(1991) score: 0.64872766\n", + "6 movie name: Starship_Troopers_(1997) score: 0.6440331\n", + "7 movie name: Star_Trek:_Generations_(1994) score: 0.640712\n", + "8 movie name: Lion_King,_The_(1994) score: 0.6243777\n", + "9 movie name: Rocketeer,_The_(1991) score: 0.6216627\n" + ] + } + ], + "source": [ + "movie_names_to_filter = [\n", + " 'Toy_Story_(1995)',\n", + " 'Ben-Hur_(1959)',\n", + " 'Kolya_(1996)',\n", + " 'Shall_We_Dance?_(Shall_We_Dansu?)_(1996)'\n", + "]\n", + "\n", + "for i, (movie_name, score) in enumerate(w2v_model.most_similar(wv, 10)):\n", + " if movie_name in movie_names_to_filter:\n", + " continue\n", + " print(i, \"movie name:\", movie_name, \"score:\", score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Among the high-scoring items, we only find \"Ghost_in_the_Shell_(Kokaku_kidotai)_(1995)\", which is an SF animation.\n", + "We conjecture that arithmetic operations do not work well in the item recommendation domain." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/jupyter-examples/.ipynb_checkpoints/4. Performance comparison over models-checkpoint.ipynb b/examples/jupyter-examples/.ipynb_checkpoints/4. Performance comparison over models-checkpoint.ipynb new file mode 100644 index 0000000..3364816 --- /dev/null +++ b/examples/jupyter-examples/.ipynb_checkpoints/4. 
Performance comparison over models-checkpoint.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparison of models on the MovieLens 1M dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "import buffalo\n", + "from buffalo import ALS, BPRMF, CFR, ALSOption, BPRMFOption, CFROption \n", + "from buffalo import MatrixMarketOptions, StreamOptions\n", + "from buffalo import aux, log\n", + "log.set_log_level(1) # set log level 3 or higher to check more information" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "als_opt = ALSOption().get_default_option() \n", + "als_opt.num_workers = 4\n", + "als_opt.validation = aux.Option({'topk': 10})\n", + "als_opt.reg_u = 0.06\n", + "als_opt.reg_i = 0.06\n", + "\n", + "bpr_opt = BPRMFOption().get_default_option()\n", + "bpr_opt.use_bias = False\n", + "bpr_opt.num_workers = 4\n", + "bpr_opt.validation = aux.Option({'topk': 10})\n", + "bpr_opt.reg_u = 0.02\n", + "bpr_opt.reg_j = bpr_opt.reg_i = 0.02\n", + "bpr_opt.reg_b = 0.1\n", + "\n", + "cfr_opt = CFROption().get_default_option()\n", + "cfr_opt.num_workers = 4\n", + "cfr_opt.validation = aux.Option({'topk': 10})\n", + "cfr_opt.reg_u = 0.06\n", + "cfr_opt.reg_i = 0.06\n", + "cfr_opt.reg_c = 0.05" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "data_opt = MatrixMarketOptions().get_default_option()\n", + "data_opt.input.main = 'data/ml-1m/main.mtx'\n", + "data_opt.input.iid = 'data/ml-1m/iid'\n", + "data_opt.input.uid = 'data/ml-1m/uid'\n", + "data_opt.data.path = '.4_mm.h5py'\n", + "data_opt.data.validation.p = 0.1\n", + "data_opt.data.validation.max_samples = 10000" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use validation, `opt.validation` must be set like this:\n", + "```python\n", + " option.validation = aux.Option({'topk': 10}) # which metric will be used\n", + " # if topk is set to 10, then NDCG@10 and MAP@10 are calculated\n", + " data_option.data.validation.p # fraction of the data used for validation\n", + " data_option.data.validation.max_samples # maximum number of validation samples\n", + "```\n", + "The total number of validation samples is set to $\min(\text{nnz} \cdot p, \text{max\_samples})$" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "mm_data = buffalo.data.load(data_opt)\n", + "mm_data.create()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "data_opt = StreamOptions().get_default_option()\n", + "data_opt.data.validation.name = \"sample\"\n", + "data_opt.data.sppmi = {\"windows\": 5, \"k\": 10}\n", + "data_opt.input.main = 'data/ml-1m/stream'\n", + "data_opt.input.uid = 'data/ml-1m/uid'\n", + "data_opt.input.iid = 'data/ml-1m/iid'\n", + "data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})\n", + "data_opt.data.path = '.4_stream.h5py'\n", + "data_opt.data.internal_data_type = \"matrix\"\n", + "data_opt.data.validation.p = 0.1\n", + "data_opt.data.validation.max_samples = 10000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stream_data = buffalo.data.load(data_opt)\n", + "stream_data.create()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"scrolled": true + }, + "outputs": [], + "source": [ + "als = ALS(als_opt, data=mm_data)\n", + "als.initialize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "bpr = BPRMF(bpr_opt, data=mm_data)\n", + "bpr.initialize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cfr = CFR(cfr_opt, data=stream_data)\n", + "cfr.initialize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "als_res = als.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bpr_res = bpr.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cfr_res = cfr.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cfr_res" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "als_res" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bpr_res" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "vscode": { + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/jupyter-examples/5. KakaoBrunch12M.ipynb b/examples/jupyter-examples/.ipynb_checkpoints/5. KakaoBrunch12M-checkpoint.ipynb similarity index 96% rename from examples/jupyter-examples/5. KakaoBrunch12M.ipynb rename to examples/jupyter-examples/.ipynb_checkpoints/5. KakaoBrunch12M-checkpoint.ipynb index 4a452e5..6b96f9e 100644 --- a/examples/jupyter-examples/5. KakaoBrunch12M.ipynb +++ b/examples/jupyter-examples/.ipynb_checkpoints/5. KakaoBrunch12M-checkpoint.ipynb @@ -17,12 +17,8 @@ "outputs": [], "source": [ "import buffalo.data\n", - "from buffalo.algo.als import ALS\n", - "from buffalo.algo.options import ALSOption\n", - "from buffalo.misc import aux\n", - "from buffalo.misc import log \n", - "from buffalo.data.mm import MatrixMarketOptions\n", - "\n", + "from buffalo import ALS, ALSOption, MatrixMarketOptions\n", + "from buffalo import aux, log \n", "log.set_log_level(1) # set log level 3 or higher to check more information" ] }, @@ -242,7 +238,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -256,7 +252,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/examples/jupyter-examples/0. Data Transform for Buffalo's input data.ipynb b/examples/jupyter-examples/0. Data Transform for Buffalo's input data.ipynb index e137e56..f2bb6fd 100644 --- a/examples/jupyter-examples/0. Data Transform for Buffalo's input data.ipynb +++ b/examples/jupyter-examples/0. 
Data Transform for Buffalo's input data.ipynb @@ -18,8 +18,8 @@ "name": "stdout", "output_type": "stream", "text": [ + "mkdir: data: File exists\n", "Archive: ml-1m.zip\n", - " creating: data/ml-1m/\n", " inflating: data/ml-1m/movies.dat \n", " inflating: data/ml-1m/ratings.dat \n", " inflating: data/ml-1m/README \n", @@ -78,7 +78,7 @@ "metadata": {}, "outputs": [], "source": [ - "movies = pd.read_csv('data/ml-1m/movies.dat', header=None, sep=\"::\", engine='python')\n", + "movies = pd.read_csv('data/ml-1m/movies.dat', header=None, sep=\"::\", engine='python', encoding='latin-1')\n", "movies.columns = ['iid', 'movie_name', 'genre']" ] }, @@ -291,7 +291,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -305,7 +305,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/examples/jupyter-examples/1. Matrix Factorization.ipynb b/examples/jupyter-examples/1. Matrix Factorization.ipynb index ba9f124..762e2da 100644 --- a/examples/jupyter-examples/1. Matrix Factorization.ipynb +++ b/examples/jupyter-examples/1. Matrix Factorization.ipynb @@ -14,13 +14,11 @@ "metadata": {}, "outputs": [], "source": [ - "from buffalo.algo.als import ALS\n", - "from buffalo.algo.bpr import BPRMF\n", - "from buffalo.misc import aux, log\n", - "from buffalo.algo.options import ALSOption, BPRMFOption\n", - "import buffalo.data\n", - "from buffalo.data.mm import MatrixMarketOptions\n", - "\n", + "import buffalo\n", + "from buffalo import ALS, BPRMF\n", + "from buffalo import aux, log\n", + "from buffalo import ALSOption, BPRMFOption\n", + "from buffalo import MatrixMarketOptions\n", "log.set_log_level(1) # set log level 3 or higher to check more information" ] }, @@ -31,7 +29,7 @@ "outputs": [], "source": [ "MODEL_TO_USE = \"ALS\"\n", - "MODEL_TO_USE = \"BPR\"# un-comment this if you want to use BPR" + "# MODEL_TO_USE = \"BPR\"# un-comment this if you want to use BPR" ] }, { @@ -119,28 +117,25 @@ " 'compute_loss_on_training': True,\n", " 'early_stopping_rounds': 0,\n", " 'save_best': False,\n", - " 'evaluation_period': 100,\n", + " 'evaluation_period': 1,\n", " 'save_period': 10,\n", " 'random_seed': 0,\n", " 'validation': {'topk': 10},\n", - " 'use_bias': True,\n", - " 'num_workers': 1,\n", - " 'num_iters': 100,\n", + " 'adaptive_reg': False,\n", + " 'save_factors': False,\n", + " 'accelerator': False,\n", " 'd': 20,\n", - " 'update_i': True,\n", - " 'update_j': True,\n", - " 'reg_u': 0.025,\n", - " 'reg_i': 0.025,\n", - " 'reg_j': 0.025,\n", - " 'reg_b': 0.025,\n", - " 'optimizer': 'sgd',\n", - " 'lr': 0.002,\n", - " 'min_lr': 0.0001,\n", - " 'beta1': 0.9,\n", - " 'beta2': 0.999,\n", - " 'per_coordinate_normalize': False,\n", - " 'num_negative_samples': 1,\n", - " 'sampling_power': 0.0,\n", + " 'num_iters': 10,\n", + " 'num_workers': 1,\n", + " 'hyper_threads': 256,\n", + " 'num_cg_max_iters': 3,\n", + " 'reg_u': 0.1,\n", + " 'reg_i': 0.1,\n", + " 'alpha': 8,\n", + " 'optimizer': 'manual_cg',\n", + " 'cg_tolerance': 1e-10,\n", + " 'block_size': 32,\n", + " 'eps': 1e-10,\n", " 'model_path': '',\n", " 'data_opt': {}}" ] @@ -239,7 +234,8 @@ " 'batch_mb': 1024,\n", " 'use_cache': False,\n", " 'tmp_dir': '/tmp/',\n", - " 'path': './mm.h5py'}}" + " 'path': './mm.h5py',\n", + " 'disk_based': False}}" ] }, "execution_count": 9, @@ -296,7 +292,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, 
"source": [ @@ -306,6 +301,15 @@ { "cell_type": "code", "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import buffalo" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "metadata": { "scrolled": true }, @@ -328,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -338,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "scrolled": true }, @@ -353,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": { "scrolled": true }, @@ -364,21 +368,22 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'train_loss': 0.0,\n", - " 'val_ndcg': 0.03604497128947639,\n", - " 'val_map': 0.023406525573192238,\n", - " 'val_accuracy': 0.07333333333333333,\n", - " 'val_rmse': 3.030244968098499,\n", - " 'val_error': 2.814815138220787}" + "{'train_loss': 0.2804447780030053,\n", + " 'val_ndcg': 0.053509737512824056,\n", + " 'val_map': 0.036605582307829496,\n", + " 'val_accuracy': 0.10280898876404494,\n", + " 'val_auc': 0.5500847197037205,\n", + " 'val_rmse': 2.9060066759494854,\n", + " 'val_error': 2.713486196756363}" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -396,12 +401,30 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mkdir: model: File exists\r\n" + ] + } + ], + "source": [ + "!mkdir model" + ] + }, + { + "cell_type": "code", + "execution_count": 18, "metadata": { "scrolled": true }, "outputs": [], "source": [ + "\n", "model.save(\"model/model-ml-1m\")\n", "del model\n", "if MODEL_TO_USE == \"ALS\":\n", @@ -420,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": { "scrolled": true }, @@ -430,31 +453,31 @@ "output_type": "stream", "text": [ "for user 61, recommendations are \n", - "items ['American_Beauty_(1999)', 'Star_Wars:_Episode_IV_-_A_New_Hope_(1977)', 'Star_Wars:_Episode_V_-_The_Empire_Strikes_Back_(1980)'].\n", + "items ['Rules_of_Engagement_(2000)', 'Remember_the_Titans_(2000)', 'Skulls,_The_(2000)'].\n", "\n", "for user 62, recommendations are \n", - "items ['American_Beauty_(1999)', 'Star_Wars:_Episode_IV_-_A_New_Hope_(1977)', 'Star_Wars:_Episode_V_-_The_Empire_Strikes_Back_(1980)'].\n", + "items ['Midnight_in_the_Garden_of_Good_and_Evil_(1997)', 'Bonnie_and_Clyde_(1967)', 'Coming_Home_(1978)'].\n", "\n", "for user 63, recommendations are \n", - "items ['American_Beauty_(1999)', 'Star_Wars:_Episode_IV_-_A_New_Hope_(1977)', 'Star_Wars:_Episode_V_-_The_Empire_Strikes_Back_(1980)'].\n", + "items ['Eyes_Wide_Shut_(1999)', 'Summer_of_Sam_(1999)', 'Go_(1999)'].\n", "\n", "for user 64, recommendations are \n", - "items ['American_Beauty_(1999)', 'Star_Wars:_Episode_IV_-_A_New_Hope_(1977)', 'Star_Wars:_Episode_V_-_The_Empire_Strikes_Back_(1980)'].\n", + "items ['Jurassic_Park_(1993)', 'Braveheart_(1995)', 'Star_Wars:_Episode_VI_-_Return_of_the_Jedi_(1983)'].\n", "\n", "for user 65, recommendations are \n", - "items ['American_Beauty_(1999)', 'Star_Wars:_Episode_IV_-_A_New_Hope_(1977)', 'Star_Wars:_Episode_V_-_The_Empire_Strikes_Back_(1980)'].\n", + "items ['Air_Force_One_(1997)', 'Patriot,_The_(2000)', 'Backdraft_(1991)'].\n", "\n", "for user 66, recommendations are \n", 
- "items ['American_Beauty_(1999)', 'Star_Wars:_Episode_IV_-_A_New_Hope_(1977)', 'Star_Wars:_Episode_V_-_The_Empire_Strikes_Back_(1980)'].\n", + "items ['American_Beauty_(1999)', 'Star_Wars:_Episode_VI_-_Return_of_the_Jedi_(1983)', 'Braveheart_(1995)'].\n", "\n", "for user 67, recommendations are \n", - "items ['American_Beauty_(1999)', 'Star_Wars:_Episode_IV_-_A_New_Hope_(1977)', 'Star_Wars:_Episode_V_-_The_Empire_Strikes_Back_(1980)'].\n", + "items ['12_Angry_Men_(1957)', 'Grapes_of_Wrath,_The_(1940)', 'Bridge_on_the_River_Kwai,_The_(1957)'].\n", "\n", "for user 68, recommendations are \n", - "items ['American_Beauty_(1999)', 'Star_Wars:_Episode_IV_-_A_New_Hope_(1977)', 'Star_Wars:_Episode_V_-_The_Empire_Strikes_Back_(1980)'].\n", + "items ['Wrong_Trousers,_The_(1993)', 'Close_Shave,_A_(1995)', 'Grand_Day_Out,_A_(1992)'].\n", "\n", "for user 69, recommendations are \n", - "items ['American_Beauty_(1999)', 'Star_Wars:_Episode_IV_-_A_New_Hope_(1977)', 'Star_Wars:_Episode_V_-_The_Empire_Strikes_Back_(1980)'].\n", + "items ['Dead_Man_Walking_(1995)', 'Hamlet_(1996)', 'Malcolm_X_(1992)'].\n", "\n" ] } @@ -475,7 +498,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -483,16 +506,16 @@ "output_type": "stream", "text": [ "for user 1, recommendations are \n", - "items ['Frequency_(2000)', 'Shanghai_Noon_(2000)', 'Gone_in_60_Seconds_(2000)'].\n", + "items ['Shanghai_Noon_(2000)', 'Frequency_(2000)', 'Remember_the_Titans_(2000)'].\n", "\n", "for user 2, recommendations are \n", - "items ['Frequency_(2000)', 'Shanghai_Noon_(2000)', 'Gone_in_60_Seconds_(2000)'].\n", + "items ['Remember_the_Titans_(2000)', 'Rules_of_Engagement_(2000)', 'Frequency_(2000)'].\n", "\n", "for user 3, recommendations are \n", - "items ['Frequency_(2000)', 'Shanghai_Noon_(2000)', 'Gone_in_60_Seconds_(2000)'].\n", + "items ['Shanghai_Noon_(2000)', 'Frequency_(2000)', 'Remember_the_Titans_(2000)'].\n", "\n", "for user 4, recommendations are \n", - "items ['Frequency_(2000)', 'Shanghai_Noon_(2000)', 'Gone_in_60_Seconds_(2000)'].\n", + "items ['Shanghai_Noon_(2000)', 'Frequency_(2000)', 'Gone_in_60_Seconds_(2000)'].\n", "\n" ] } @@ -530,7 +553,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -538,11 +561,11 @@ "output_type": "stream", "text": [ "Similar movies to Toy_Story_2_(1999)\n", - "01. 1.000 Bug's_Life,_A_(1998)\n", - "02. 1.000 Toy_Story_(1995)\n", - "03. 1.000 Gladiator_(2000)\n", - "04. 0.999 Galaxy_Quest_(1999)\n", - "05. 0.999 Sixth_Sense,_The_(1999)\n" + "01. 0.958 Toy_Story_(1995)\n", + "02. 0.957 Bug's_Life,_A_(1998)\n", + "03. 0.949 Shakespeare_in_Love_(1998)\n", + "04. 0.945 Being_John_Malkovich_(1999)\n", + "05. 0.935 Sixth_Sense,_The_(1999)\n" ] } ], @@ -562,19 +585,19 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "01. 0.996 Shanghai_Noon_(2000)\n", - "02. 0.982 Frequency_(2000)\n", - "03. 0.980 Gone_in_60_Seconds_(2000)\n", - "04. 0.980 Rules_of_Engagement_(2000)\n", - "05. 0.950 Reindeer_Games_(2000)\n", - "06. 0.948 Remember_the_Titans_(2000)\n" + "01. 0.467 Shanghai_Noon_(2000)\n", + "02. 0.435 Frequency_(2000)\n", + "03. 0.354 Gone_in_60_Seconds_(2000)\n", + "04. 0.320 28_Days_(2000)\n", + "05. 0.259 What_Lies_Beneath_(2000)\n", + "06. 
0.186 Final_Destination_(2000)\n" ] } ], @@ -597,7 +620,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -611,7 +634,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1 (default, Nov 8 2022, 08:56:14) \n[GCC 4.8.5 20150623 (Red Hat 4.8.5-44)]" + "version": "3.10.6" }, "vscode": { "interpreter": { diff --git a/examples/jupyter-examples/2. Cofactor.ipynb b/examples/jupyter-examples/2. Cofactor.ipynb index 87cb13c..91a9da5 100644 --- a/examples/jupyter-examples/2. Cofactor.ipynb +++ b/examples/jupyter-examples/2. Cofactor.ipynb @@ -17,11 +17,8 @@ "metadata": {}, "outputs": [], "source": [ - "from buffalo.algo.cfr import CFR\n", - "from buffalo.algo.options import CFROption\n", - "from buffalo.data.stream import StreamOptions\n", - "from buffalo.misc import aux\n", - "from buffalo.misc import log" + "from buffalo import CFR, CFROption, StreamOptions\n", + "from buffalo import aux, log" ] }, { @@ -79,7 +76,7 @@ "data_opt.input.uid = 'data/ml-1m/uid'\n", "data_opt.input.iid = 'data/ml-1m/iid'\n", "data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})\n", - "data_opt.data.path = './3-cfr.h5py'\n", + "data_opt.data.path = './2-cfr.h5py'\n", "data_opt.data.internal_data_type = 'matrix'" ] }, @@ -94,30 +91,30 @@ "name": "stderr", "output_type": "stream", "text": [ - "[INFO ] 2019-10-04 10:40:41 [stream.py:278] Create database from stream data\n", - "[INFO ] 2019-10-04 10:40:41 [stream.py:101] gathering itemids from data/ml-1m/stream...\n", - "[INFO ] 2019-10-04 10:40:41 [stream.py:125] Found 3706 unique itemids\n", - "[INFO ] 2019-10-04 10:40:41 [stream.py:287] Creating working data...\n", - "[INFO ] 2019-10-04 10:40:49 [stream.py:295] Building data part...\n", - "[INFO ] 2019-10-04 10:40:49 [base.py:362] Building compressed triplets for rowwise...\n", - "[INFO ] 2019-10-04 10:40:49 [base.py:363] Preprocessing...\n", - "[INFO ] 2019-10-04 10:40:49 [base.py:366] In-memory Compressing ...\n", - "[INFO ] 2019-10-04 10:40:50 [base.py:249] Load triplet files. Total job files: 7\n", - "[INFO ] 2019-10-04 10:40:50 [base.py:396] Finished\n", - "[INFO ] 2019-10-04 10:40:50 [base.py:362] Building compressed triplets for colwise...\n", - "[INFO ] 2019-10-04 10:40:50 [base.py:363] Preprocessing...\n", - "[INFO ] 2019-10-04 10:40:50 [base.py:366] In-memory Compressing ...\n", - "[INFO ] 2019-10-04 10:40:51 [base.py:249] Load triplet files. 
Total job files: 7\n", - "[INFO ] 2019-10-04 10:40:51 [base.py:396] Finished\n", - "[INFO ] 2019-10-04 10:40:51 [stream.py:166] build sppmi (shift k: 10)\n", - "[INFO ] 2019-10-04 10:41:01 [stream.py:177] convert from /tmp/tmp8ztbh60r to /tmp/tmpt4rxnat9\n", - "[INFO ] 2019-10-04 10:41:01 [stream.py:180] sppmi nnz: 350626\n", - "[INFO ] 2019-10-04 10:41:01 [stream.py:184] Disk-based Compressing...\n", - "[INFO ] 2019-10-04 10:41:01 [base.py:294] Dividing into 12 chunks...\n", - "[INFO ] 2019-10-04 10:41:01 [base.py:304] Total job files: 12\n", - "[PROGRESS] 100.00% 1.1/1.1secs 11.06it/s\n", - "[INFO ] 2019-10-04 10:41:03 [stream.py:312] DB built on ./3-cfr.h5py\n", - "[INFO ] 2019-10-04 10:41:03 [cfr.py:62] CFR ({\n", + "[INFO ] 2023-01-19 14:23:40 [stream.py:279] Create database from stream data\n", + "[INFO ] 2023-01-19 14:23:40 [stream.py:103] gathering itemids from data/ml-1m/stream...\n", + "[INFO ] 2023-01-19 14:23:40 [stream.py:127] Found 3706 unique itemids\n", + "[INFO ] 2023-01-19 14:23:40 [stream.py:288] Creating working data...\n", + "[INFO ] 2023-01-19 14:23:43 [stream.py:296] Building data part...\n", + "[INFO ] 2023-01-19 14:23:43 [base.py:410] Building compressed triplets for rowwise...\n", + "[INFO ] 2023-01-19 14:23:43 [base.py:411] Preprocessing...\n", + "[INFO ] 2023-01-19 14:23:43 [base.py:414] In-memory Compressing ...\n", + "[INFO ] 2023-01-19 14:23:43 [base.py:294] Load triplet files. Total job files: 11\n", + "[INFO ] 2023-01-19 14:23:43 [base.py:444] Finished\n", + "[INFO ] 2023-01-19 14:23:43 [base.py:410] Building compressed triplets for colwise...\n", + "[INFO ] 2023-01-19 14:23:43 [base.py:411] Preprocessing...\n", + "[INFO ] 2023-01-19 14:23:43 [base.py:414] In-memory Compressing ...\n", + "[INFO ] 2023-01-19 14:23:43 [base.py:294] Load triplet files. 
Total job files: 11\n", + "[INFO ] 2023-01-19 14:23:43 [base.py:444] Finished\n", + "[INFO ] 2023-01-19 14:23:43 [stream.py:168] build sppmi (shift k: 10)\n", + "[INFO ] 2023-01-19 14:24:06 [stream.py:179] convert from /tmp/tmpfmi51i06 to /tmp/tmpyaq01o42\n", + "[INFO ] 2023-01-19 14:24:06 [stream.py:182] sppmi nnz: 350162\n", + "[INFO ] 2023-01-19 14:24:06 [stream.py:186] Disk-based Compressing...\n", + "[INFO ] 2023-01-19 14:24:06 [base.py:339] Dividing into 20 chunks...\n", + "[INFO ] 2023-01-19 14:24:06 [base.py:349] Total job files: 20\n", + "[PROGRESS] 100.00% 0.0/0.0secs 2,337.05it/s\n", + "[INFO ] 2023-01-19 14:24:06 [stream.py:311] DB built on ./2-cfr.h5py\n", + "[INFO ] 2023-01-19 14:24:06 [cfr.py:59] CFR ({\n", " \"evaluation_on_learning\": true,\n", " \"compute_loss_on_training\": true,\n", " \"early_stopping_rounds\": 0,\n", @@ -142,7 +139,7 @@ " \"model_path\": \"\",\n", " \"data_opt\": {}\n", "})\n", - "[INFO ] 2019-10-04 10:41:03 [cfr.py:64] Stream Header(6040, 3706, 999709) Validation(500 samples)\n" + "[INFO ] 2023-01-19 14:24:06 [cfr.py:61] Stream Header(6040, 3706, 994169) Validation(6040 samples)\n" ] } ], @@ -168,17 +165,17 @@ "name": "stderr", "output_type": "stream", "text": [ - "[INFO ] 2019-10-04 10:41:03 [buffered_data.py:71] Set data buffer size as 67108864(minimum required batch size is 245).\n", - "[INFO ] 2019-10-04 10:41:03 [cfr.py:207] Iteration 1: Loss 0.000 Elapsed 0.518 secs\n", - "[INFO ] 2019-10-04 10:41:04 [cfr.py:207] Iteration 2: Loss 0.000 Elapsed 0.477 secs\n", - "[INFO ] 2019-10-04 10:41:04 [cfr.py:207] Iteration 3: Loss 0.000 Elapsed 0.486 secs\n", - "[INFO ] 2019-10-04 10:41:05 [cfr.py:207] Iteration 4: Loss 0.000 Elapsed 0.490 secs\n", - "[INFO ] 2019-10-04 10:41:05 [cfr.py:207] Iteration 5: Loss 0.000 Elapsed 0.489 secs\n", - "[INFO ] 2019-10-04 10:41:06 [cfr.py:207] Iteration 6: Loss 0.000 Elapsed 0.415 secs\n", - "[INFO ] 2019-10-04 10:41:06 [cfr.py:207] Iteration 7: Loss 0.000 Elapsed 0.267 secs\n", - "[INFO ] 2019-10-04 10:41:06 [cfr.py:207] Iteration 8: Loss 0.000 Elapsed 0.181 secs\n", - "[INFO ] 2019-10-04 10:41:06 [cfr.py:207] Iteration 9: Loss 0.000 Elapsed 0.320 secs\n", - "[INFO ] 2019-10-04 10:41:07 [cfr.py:207] Iteration 10: Loss 0.000 Elapsed 0.380 secs\n" + "[INFO ] 2023-01-19 14:24:06 [buffered_data.py:72] Set data buffer size as 67108864(minimum required batch size is 251).\n", + "[INFO ] 2023-01-19 14:24:06 [cfr.py:214] Iteration 1: Loss 0.000 Elapsed 0.095 secs\n", + "[INFO ] 2023-01-19 14:24:06 [cfr.py:214] Iteration 2: Loss 0.000 Elapsed 0.094 secs\n", + "[INFO ] 2023-01-19 14:24:07 [cfr.py:214] Iteration 3: Loss 0.000 Elapsed 0.093 secs\n", + "[INFO ] 2023-01-19 14:24:07 [cfr.py:214] Iteration 4: Loss 0.000 Elapsed 0.089 secs\n", + "[INFO ] 2023-01-19 14:24:07 [cfr.py:214] Iteration 5: Loss 0.000 Elapsed 0.090 secs\n", + "[INFO ] 2023-01-19 14:24:07 [cfr.py:214] Iteration 6: Loss 0.000 Elapsed 0.091 secs\n", + "[INFO ] 2023-01-19 14:24:07 [cfr.py:214] Iteration 7: Loss 0.000 Elapsed 0.090 secs\n", + "[INFO ] 2023-01-19 14:24:07 [cfr.py:214] Iteration 8: Loss 0.000 Elapsed 0.090 secs\n", + "[INFO ] 2023-01-19 14:24:07 [cfr.py:214] Iteration 9: Loss 0.000 Elapsed 0.092 secs\n", + "[INFO ] 2023-01-19 14:24:07 [cfr.py:214] Iteration 10: Loss 0.000 Elapsed 0.095 secs\n" ] }, { @@ -215,31 +212,31 @@ "output_type": "stream", "text": [ "for user 61, recommendations are \n", - "items ['Patriot,_The_(2000)', 'Perfect_Storm,_The_(2000)', 'Scary_Movie_(2000)'].\n", + "items ['Frequency_(2000)', 'Patriot,_The_(2000)', 
'Perfect_Storm,_The_(2000)'].\n", "\n", "for user 62, recommendations are \n", - "items ['Rear_Window_(1954)', 'Witness_(1985)', 'Chinatown_(1974)'].\n", + "items ['Witness_(1985)', '2001:_A_Space_Odyssey_(1968)', 'African_Queen,_The_(1951)'].\n", "\n", "for user 63, recommendations are \n", "items ['Austin_Powers:_The_Spy_Who_Shagged_Me_(1999)', 'Blair_Witch_Project,_The_(1999)', 'American_Pie_(1999)'].\n", "\n", "for user 64, recommendations are \n", - "items ['Jurassic_Park_(1993)', 'Terminator_2:_Judgment_Day_(1991)', 'American_Beauty_(1999)'].\n", + "items ['Jurassic_Park_(1993)', 'Terminator_2:_Judgment_Day_(1991)', 'Braveheart_(1995)'].\n", "\n", "for user 65, recommendations are \n", "items ['Braveheart_(1995)', 'Saving_Private_Ryan_(1998)', 'Jurassic_Park_(1993)'].\n", "\n", "for user 66, recommendations are \n", - "items ['Braveheart_(1995)', 'American_Beauty_(1999)', 'Airplane!_(1980)'].\n", + "items ['Braveheart_(1995)', 'American_Beauty_(1999)', 'Jurassic_Park_(1993)'].\n", "\n", "for user 67, recommendations are \n", "items ['Bridge_on_the_River_Kwai,_The_(1957)', 'To_Kill_a_Mockingbird_(1962)', 'Graduate,_The_(1967)'].\n", "\n", "for user 68, recommendations are \n", - "items ['Shakespeare_in_Love_(1998)', 'Groundhog_Day_(1993)', 'Toy_Story_2_(1999)'].\n", + "items ['Shakespeare_in_Love_(1998)', 'Groundhog_Day_(1993)', 'Being_John_Malkovich_(1999)'].\n", "\n", "for user 69, recommendations are \n", - "items ['Good_Will_Hunting_(1997)', 'Dead_Man_Walking_(1995)', 'Shawshank_Redemption,_The_(1994)'].\n", + "items ['Dead_Man_Walking_(1995)', 'Good_Will_Hunting_(1997)', 'Apollo_13_(1995)'].\n", "\n" ] } @@ -260,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -268,16 +265,16 @@ "output_type": "stream", "text": [ "for user 1, recommendations are \n", - "items ['Shanghai_Noon_(2000)', 'Frequency_(2000)', 'Remember_the_Titans_(2000)'].\n", + "items ['Frequency_(2000)', 'Shanghai_Noon_(2000)', '28_Days_(2000)'].\n", "\n", "for user 2, recommendations are \n", - "items ['Remember_the_Titans_(2000)', 'Shanghai_Noon_(2000)', 'Frequency_(2000)'].\n", + "items ['Remember_the_Titans_(2000)', 'Frequency_(2000)', 'Shanghai_Noon_(2000)'].\n", "\n", "for user 3, recommendations are \n", - "items ['Shanghai_Noon_(2000)', '28_Days_(2000)', 'Frequency_(2000)'].\n", + "items ['Shanghai_Noon_(2000)', 'Frequency_(2000)', 'Gone_in_60_Seconds_(2000)'].\n", "\n", "for user 4, recommendations are \n", - "items ['Shanghai_Noon_(2000)', 'Final_Destination_(2000)', 'Frequency_(2000)'].\n", + "items ['Shanghai_Noon_(2000)', 'Gone_in_60_Seconds_(2000)', 'Frequency_(2000)'].\n", "\n" ] } @@ -308,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -316,17 +313,17 @@ "output_type": "stream", "text": [ "Similar movies to Toy_Story_2_(1999) in similar items\n", - "[(\"Bug's_Life,_A_(1998)\", 0.93695074), ('Toy_Story_(1995)', 0.91278535), ('Babe_(1995)', 0.8598581), ('Shakespeare_in_Love_(1998)', 0.84673494), ('Being_John_Malkovich_(1999)', 0.83271587), ('Election_(1999)', 0.8022457), ('American_Beauty_(1999)', 0.788048), ('South_Park:_Bigger,_Longer_and_Uncut_(1999)', 0.77576375), ('Groundhog_Day_(1993)', 0.7618997), ('Aladdin_(1992)', 0.7471918)]\n", - "01. 0.937 Bug's_Life,_A_(1998)\n", - "02. 0.913 Toy_Story_(1995)\n", - "03. 0.860 Babe_(1995)\n", - "04. 0.847 Shakespeare_in_Love_(1998)\n", - "05. 0.833 Being_John_Malkovich_(1999)\n", - "06. 
0.802 Election_(1999)\n", - "07. 0.788 American_Beauty_(1999)\n", - "08. 0.776 South_Park:_Bigger,_Longer_and_Uncut_(1999)\n", - "09. 0.762 Groundhog_Day_(1993)\n", - "10. 0.747 Aladdin_(1992)\n" + "[(\"Bug's_Life,_A_(1998)\", 0.9459578), ('Toy_Story_(1995)', 0.9253026), ('Babe_(1995)', 0.88850766), ('Shakespeare_in_Love_(1998)', 0.8874263), ('Being_John_Malkovich_(1999)', 0.8751306), ('Election_(1999)', 0.83334714), ('American_Beauty_(1999)', 0.8240388), ('South_Park:_Bigger,_Longer_and_Uncut_(1999)', 0.81413776), ('Galaxy_Quest_(1999)', 0.80605197), ('Sixth_Sense,_The_(1999)', 0.8048809)]\n", + "01. 0.946 Bug's_Life,_A_(1998)\n", + "02. 0.925 Toy_Story_(1995)\n", + "03. 0.889 Babe_(1995)\n", + "04. 0.887 Shakespeare_in_Love_(1998)\n", + "05. 0.875 Being_John_Malkovich_(1999)\n", + "06. 0.833 Election_(1999)\n", + "07. 0.824 American_Beauty_(1999)\n", + "08. 0.814 South_Park:_Bigger,_Longer_and_Uncut_(1999)\n", + "09. 0.806 Galaxy_Quest_(1999)\n", + "10. 0.805 Sixth_Sense,_The_(1999)\n" ] } ], @@ -347,19 +344,19 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "01. 0.385 Shanghai_Noon_(2000)\n", - "02. 0.379 28_Days_(2000)\n", - "03. 0.364 Frequency_(2000)\n", - "04. 0.297 Gone_in_60_Seconds_(2000)\n", - "05. 0.224 Final_Destination_(2000)\n", - "06. 0.195 What_Lies_Beneath_(2000)\n" + "01. 0.443 Shanghai_Noon_(2000)\n", + "02. 0.419 Frequency_(2000)\n", + "03. 0.405 28_Days_(2000)\n", + "04. 0.394 Gone_in_60_Seconds_(2000)\n", + "05. 0.298 Final_Destination_(2000)\n", + "06. 0.257 What_Lies_Beneath_(2000)\n" ] } ], @@ -382,7 +379,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -396,7 +393,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/examples/jupyter-examples/3. skip-gram.ipynb b/examples/jupyter-examples/3. skip-gram.ipynb index 4d59d9e..1204336 100644 --- a/examples/jupyter-examples/3. skip-gram.ipynb +++ b/examples/jupyter-examples/3. skip-gram.ipynb @@ -13,11 +13,9 @@ "metadata": {}, "outputs": [], "source": [ - "from buffalo.algo.w2v import W2V\n", - "from buffalo.algo.options import W2VOption\n", - "from buffalo.data.stream import StreamOptions\n", - "from buffalo.misc import aux, log\n", - "from buffalo.misc.log import set_log_level\n", + "from buffalo import W2V, W2VOption\n", + "from buffalo import StreamOptions\n", + "from buffalo import aux, log\n", "log.set_log_level(1) # set log level 3 or higher to check more information" ] }, @@ -43,9 +41,9 @@ " 'window': 5,\n", " 'min_count': 2,\n", " 'sample': 0.001,\n", + " 'num_negative_samples': 5,\n", " 'lr': 0.025,\n", " 'min_lr': 0.0001,\n", - " 'num_negative_samples': 5,\n", " 'model_path': '',\n", " 'data_opt': {}}" ] @@ -125,11 +123,11 @@ "output_type": "stream", "text": [ "Similar movies to Lion_King,_The_(1994)\n", - "01. 0.765 Hunchback_of_Notre_Dame,_The_(1996)\n", - "02. 0.762 Mulan_(1998)\n", - "03. 0.760 Beauty_and_the_Beast_(1991)\n", - "04. 0.709 Sleeping_Beauty_(1959)\n", - "05. 0.685 Cinderella_(1950)\n" + "01. 0.784 Hunchback_of_Notre_Dame,_The_(1996)\n", + "02. 0.773 Beauty_and_the_Beast_(1991)\n", + "03. 0.726 Mulan_(1998)\n", + "04. 0.703 Sleeping_Beauty_(1959)\n", + "05. 
0.687 Dumbo_(1941)\n" ] } ], @@ -164,14 +162,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "2 movie name: Star_Trek:_Generations_(1994) score: 0.80435485\n", - "3 movie name: Star_Trek_VI:_The_Undiscovered_Country_(1991) score: 0.7985013\n", - "4 movie name: Rocketeer,_The_(1991) score: 0.75975585\n", - "5 movie name: Star_Trek:_First_Contact_(1996) score: 0.7055327\n", - "6 movie name: Star_Trek:_Insurrection_(1998) score: 0.70361114\n", - "7 movie name: Fifth_Element,_The_(1997) score: 0.6806074\n", - "8 movie name: Deep_Impact_(1998) score: 0.68037814\n", - "9 movie name: Superman_II_(1980) score: 0.6738094\n" + "2 movie name: Star_Trek:_Generations_(1994) score: 0.81956935\n", + "3 movie name: Star_Trek_VI:_The_Undiscovered_Country_(1991) score: 0.7973181\n", + "4 movie name: Rocketeer,_The_(1991) score: 0.7550323\n", + "5 movie name: Star_Trek:_First_Contact_(1996) score: 0.7185978\n", + "6 movie name: Demolition_Man_(1993) score: 0.7172408\n", + "7 movie name: Fifth_Element,_The_(1997) score: 0.70883423\n", + "8 movie name: Star_Trek:_Insurrection_(1998) score: 0.70751184\n", + "9 movie name: Superman_II_(1980) score: 0.683364\n" ] } ], @@ -283,16 +281,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "0 movie name: Metisse_(Caf_au_Lait)_(1993) score: 0.70677006\n", - "1 movie name: Mulan_(1998) score: 0.6815622\n", - "2 movie name: Ghost_in_the_Shell_(Kokaku_kidotai)_(1995) score: 0.6702216\n", - "3 movie name: Stargate_(1994) score: 0.66783875\n", - "4 movie name: Starship_Troopers_(1997) score: 0.6632842\n", - "5 movie name: Star_Trek_VI:_The_Undiscovered_Country_(1991) score: 0.6519709\n", - "6 movie name: Star_Trek:_Generations_(1994) score: 0.6481054\n", - "7 movie name: Callejn_de_los_milagros,_El_(1995) score: 0.6469138\n", - "8 movie name: Lion_King,_The_(1994) score: 0.63585776\n", - "9 movie name: Loser_(1991) score: 0.6239109\n" + "0 movie name: Metisse_(Caf_au_Lait)_(1993) score: 0.75229746\n", + "1 movie name: Stargate_(1994) score: 0.6703097\n", + "2 movie name: Ghost_in_the_Shell_(Kokaku_kidotai)_(1995) score: 0.6669427\n", + "3 movie name: Mulan_(1998) score: 0.66553444\n", + "4 movie name: Star_Trek:_Generations_(1994) score: 0.6484513\n", + "5 movie name: Star_Trek_VI:_The_Undiscovered_Country_(1991) score: 0.64739656\n", + "6 movie name: Starship_Troopers_(1997) score: 0.644105\n", + "7 movie name: Star_Trek:_First_Contact_(1996) score: 0.6387861\n", + "8 movie name: Lion_King,_The_(1994) score: 0.635358\n", + "9 movie name: Batman:_Mask_of_the_Phantasm_(1993) score: 0.6184186\n" ] } ], @@ -321,7 +319,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -335,7 +333,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/examples/jupyter-examples/4. Performance comparison over models.ipynb b/examples/jupyter-examples/4. Performance comparison over models.ipynb index 59521f0..c373bac 100644 --- a/examples/jupyter-examples/4. Performance comparison over models.ipynb +++ b/examples/jupyter-examples/4. 
Performance comparison over models.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -14,17 +13,10 @@ "metadata": {}, "outputs": [], "source": [ - "from buffalo.algo.als import ALS\n", - "from buffalo.algo.bpr import BPRMF\n", - "from buffalo.algo.cfr import CFR\n", - "from buffalo.algo.options import ALSOption\n", - "from buffalo.algo.options import BPRMFOption\n", - "from buffalo.algo.options import CFROption\n", - "import buffalo.data\n", - "from buffalo.data.mm import MatrixMarketOptions\n", - "from buffalo.data.stream import StreamOptions\n", - "from buffalo.misc import aux\n", - "from buffalo.misc import log\n", + "import buffalo\n", + "from buffalo import ALS, BPRMF, CFR, ALSOption, BPRMFOption, CFROption \n", + "from buffalo import MatrixMarketOptions, StreamOptions\n", + "from buffalo import aux, log\n", "log.set_log_level(1) # set log level 3 or higher to check more information" ] }, @@ -196,11 +188,12 @@ "data": { "text/plain": [ "{'train_loss': 0.0,\n", - " 'vali_ndcg': 0.10699514576320421,\n", - " 'vali_map': 0.06417434804366856,\n", - " 'vali_accuracy': 0.16763943065105189,\n", - " 'vali_rmse': 0.3810662058383934,\n", - " 'vali_error': 0.2963493851780891}" + " 'vali_ndcg': 0.11313609555680584,\n", + " 'vali_map': 0.07248239162498113,\n", + " 'vali_accuracy': 0.1669767849519524,\n", + " 'vali_auc': 0.5822383884347648,\n", + " 'vali_rmse': 0.38118396912795977,\n", + " 'vali_error': 0.2968087188065052}" ] }, "execution_count": 13, @@ -220,12 +213,13 @@ { "data": { "text/plain": [ - "{'train_loss': 0.2806028804154141,\n", - " 'val_ndcg': 0.08137682343853958,\n", - " 'val_map': 0.04811094610256136,\n", - " 'val_accuracy': 0.13730048306770012,\n", - " 'val_rmse': 2.9210035418392146,\n", - " 'val_error': 2.7068358163237574}" + "{'train_loss': 0.28126777717438733,\n", + " 'val_ndcg': 0.08033160943235802,\n", + " 'val_map': 0.047727582529763245,\n", + " 'val_accuracy': 0.13283373263163062,\n", + " 'val_auc': 0.5651281155357816,\n", + " 'val_rmse': 2.945920371384216,\n", + " 'val_error': 2.7345076943099498}" ] }, "execution_count": 14, @@ -246,11 +240,12 @@ "data": { "text/plain": [ "{'train_loss': 0.0,\n", - " 'val_ndcg': 0.05350434864011935,\n", - " 'val_map': 0.032103983029096554,\n", - " 'val_accuracy': 0.07715742633439586,\n", - " 'val_rmse': 2.788766983419126,\n", - " 'val_error': 2.520328412902355}" + " 'val_ndcg': 0.0516402307624793,\n", + " 'val_map': 0.029940895687196878,\n", + " 'val_accuracy': 0.07631531715372068,\n", + " 'val_auc': 0.5368478182027646,\n", + " 'val_rmse': 2.726413081239296,\n", + " 'val_error': 2.4062163526296616}" ] }, "execution_count": 15, @@ -265,7 +260,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -279,7 +274,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1 (default, Nov 8 2022, 08:56:14) \n[GCC 4.8.5 20150623 (Red Hat 4.8.5-44)]" + "version": "3.10.6" }, "vscode": { "interpreter": {
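Each `train()` call in this notebook returns its final metrics dict, and the diff above adds an AUC entry to all three. One quirk visible in the outputs: ALS reports `vali_`-prefixed keys while BPR and CFR use `val_`. A small sketch, assuming the `als_res`/`bpr_res`/`cfr_res` names from the earlier cells, that normalizes the prefixes so the three models tabulate side by side:

import pandas as pd

def normalize(metrics):
    # ALS uses 'vali_'-prefixed keys, BPR/CFR 'val_'; line the rows up.
    return {k.replace('vali_', 'val_'): v for k, v in metrics.items()}

report = pd.DataFrame({
    'ALS': normalize(als_res),
    'BPR': normalize(bpr_res),
    'CFR': normalize(cfr_res),
})
print(report.round(4))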