.

kakao · Jan 19, 2023 · 54c97fd · 54c97fd
1 parent 0e4ea30
commit 54c97fd
Show file tree

Hide file tree

Showing 11 changed files with 2,164 additions and 217 deletions.
diff --git a/...r-examples/.ipynb_checkpoints/0. Data Transform for Buffalo's input data-checkpoint.ipynb b/...r-examples/.ipynb_checkpoints/0. Data Transform for Buffalo's input data-checkpoint.ipynb
@@ -0,0 +1,313 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Download movielens 1M dataset(https://grouplens.org/datasets/movielens/)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Archive:  ml-1m.zip\n",
+      "   creating: data/ml-1m/\n",
+      "  inflating: data/ml-1m/movies.dat   \n",
+      "  inflating: data/ml-1m/ratings.dat  \n",
+      "  inflating: data/ml-1m/README       \n",
+      "  inflating: data/ml-1m/users.dat    \n"
+     ]
+    }
+   ],
+   "source": [
+    "!mkdir data\n",
+    "!wget -q http://www.grouplens.org/system/files/ml-1m.zip ./data\n",
+    "!unzip -o ml-1m -d data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Transform ml-1m dataset into Matrix Market Form"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you are not familiar with mm(matrix market) format, refer [this](http://networkrepository.com/mtx-matrix-market-format.html)\n",
+    "\n",
+    "If you need to know further on how buffalo handle data, check [Documentation on database of Buffalo](https://buffalo-recsys.readthedocs.io/en/latest/intro.html#database)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from scipy.io import mmwrite\n",
+    "from scipy.io import mmread\n",
+    "from scipy.sparse import csr_matrix"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ratings = pd.read_csv(\"data/ml-1m/ratings.dat\", header=None, sep=\"::\", engine='python')\n",
+    "ratings.columns = [\"uid\", \"iid\", \"rating\", \"timestamp\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movies = pd.read_csv('data/ml-1m/movies.dat', header=None, sep=\"::\", engine='python', encoding='latin-1')\n",
+    "movies.columns = ['iid', 'movie_name', 'genre']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "buffalo iid does not support string with utf-8 encoding and having spaces.\n",
+    "\n",
+    "Therefore, we have to replace spaces and utf-8 text."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def parse_moviename(movie_name):\n",
+    "    return movie_name.replace(' ', '_').encode('utf-8').decode('ascii', 'ignore')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "iid_to_movie_name = dict(zip(movies.iid.tolist(), movies.movie_name.tolist()))\n",
+    "iid_to_movie_name = {iid: parse_moviename(movie_name) for (iid, movie_name) in iid_to_movie_name.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "uid_to_idx = {uid: idx for (idx, uid) in enumerate(ratings.uid.unique().tolist())}\n",
+    "iid_to_idx = {iid: idx for (idx, iid) in enumerate(ratings.iid.unique().tolist())}\n",
+    "idx_to_movie_name = {idx:iid_to_movie_name[iid] for (iid, idx) in iid_to_idx.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Examples of movie names\n",
+      "\n",
+      "[index 30] movie_name: Antz_(1998)\n",
+      "[index 31] movie_name: Girl,_Interrupted_(1999)\n",
+      "[index 32] movie_name: Hercules_(1997)\n",
+      "[index 33] movie_name: Aladdin_(1992)\n",
+      "[index 34] movie_name: Mulan_(1998)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Examples of movie names\\n\")\n",
+    "\n",
+    "for i in range(30, 35):\n",
+    "    print(\"[index %d] movie_name: %s\" % (i, idx_to_movie_name[i]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "row, col, dat = ratings.uid.tolist(), ratings.iid.tolist(), ratings.rating.tolist()\n",
+    "row = [uid_to_idx[r] for r in row]\n",
+    "col = [iid_to_idx[c] for c in col]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_matrix = csr_matrix((dat, (row,col)), shape=(1 + np.max(row), 1 + np.max(col)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(6040, 3706)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(train_matrix.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### To transform csr matrix into matrix market format easily, we use mmwrite (matrix market write)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mmwrite('data/ml-1m/main', train_matrix)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"data/ml-1m/uid\", \"w\") as f:\n",
+    "    for uid in uid_to_idx:\n",
+    "        print(uid, file=f)\n",
+    "\n",
+    "with open(\"data/ml-1m/iid\", \"w\") as f:\n",
+    "    for iid, movie_name in idx_to_movie_name.items():\n",
+    "        print(movie_name, file=f)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Transform ml-1m dataset into Stream format"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Stream file format used in buffalo contains lines lists, having space as delimiter.\n",
+    "\n",
+    "One line is ordered list of items that each user interacted (ordered by time)\n",
+    "\n",
+    "This is useful when the order between interactions are considered(e.g., word2vec, Cofactor).\n",
+    "\n",
+    "See `2. Cofactor` or `3. Word2vec` to see the case where Stream format data is used\n",
+    "\n",
+    "If you need to know further on Stream format data, check [Documentation on database of Buffalo](https://buffalo-recsys.readthedocs.io/en/latest/intro.html#database)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ratings_as_list = ratings.sort_values(by='timestamp').groupby('uid').iid.apply(list).reset_index()\n",
+    "uid = ratings_as_list.uid.tolist()\n",
+    "seen_iids = ratings_as_list.iid.tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "seen_iids = [' '.join([iid_to_movie_name[iid] for iid in iids]) for iids in seen_iids]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Girl,_Interrupted_(1999) Titanic_(1997) Back_to_the_Future_(1985) Cinderella_(1950) Meet_Joe_Black_(1998) Last_Days_of_Disco,_The_(1998) Erin_Brockovich_(2000) To_Kill_a_Mockingbird_(1962) Christmas_Story,_A_(1983) Star_Wars:_Episode_IV_-_A_New_Hope_(1977) Wallace_&_Gromit:_The_Best_of_Aardman_Animation_(1996) One_Flew_Over_the_Cuckoo's_Nest_(1975) Wizard_of_Oz,_The_(1939) Fargo_(1996) Run_Lola_Run_(Lola_rennt)_(1998) Rain_Man_(1988) Saving_Private_Ryan_(1998) Awakenings_(1990) Gigi_(1958) Sound_of_Music,_The_(1965) Driving_Miss_Daisy_(1989) Mary_Poppins_(1964) Bambi_(1942) Apollo_13_(1995) E.T._the_Extra-Terrestrial_(1982) My_Fair_Lady_(1964) Ben-Hur_(1959) Big_(1988) Dead_Poets_Society_(1989) Sixth_Sense,_The_(1999) James_and_the_Giant_Peach_(1996) Ferris_Bueller's_Day_Off_(1986) Secret_Garden,_The_(1993) Toy_Story_2_(1999) Airplane!_(1980) Dumbo_(1941) Pleasantville_(1998) Princess_Bride,_The_(1987) Snow_White_and_the_Seven_Dwarfs_(1937) Miracle_on_34th_Street_(1947) Ponette_(1996) Schindler's_List_(1993) Close_Shave,_A_(1995) Beauty_and_the_Beast_(1991) Aladdin_(1992) Toy_Story_(1995) Tarzan_(1999) Hunchback_of_Notre_Dame,_The_(1996) Antz_(1998) Bug's_Life,_A_(1998) Mulan_(1998) Hercules_(1997) Pocahontas_(1995)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(seen_iids[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"data/ml-1m/stream\", \"w\") as f:\n",
+    "    for iid_list in seen_iids:\n",
+    "        print(iid_list, file=f)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}