diff --git a/examples/usecases/entertainment-with-pretrained-embeddings.ipynb b/examples/usecases/entertainment-with-pretrained-embeddings.ipynb new file mode 100644 index 0000000000..5bc2295aa0 --- /dev/null +++ b/examples/usecases/entertainment-with-pretrained-embeddings.ipynb @@ -0,0 +1,358 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a556f660", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2022 NVIDIA Corporation. All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ==============================================================================" + ] + }, + { + "cell_type": "markdown", + "id": "697d1452", + "metadata": {}, + "source": [ + "\n", + "\n", + "# Training with pretrained embeddings\n", + "\n", + "## Overview\n", + "\n", + "In this use case we will consider how we might train with pretrained embeddings.\n", + "\n", + "Pretrained embeddings can allow our model to include information from additional modalities (for instance, we might want to grab CNN descriptors of product images). They can also come from other models that we train on our data. 
For example, we might train a word2vec model on the sequence of purchased items by a customer and want to include this information in our retrieval or ranking model.\n", + "\n", + "The use cases are many, but this particular example will focus on the technical aspects of working with pretrained embeddings.\n", + "\n", + "We will use a synthetic version of the MovieLens 100k dataset and emulate a scenario where we would have a pretrained embedding for each of the movies in the dataset.\n", + "\n", + "### Learning objectives\n", + "\n", + "- Training with pretrained embeddings\n", + "- Understanding [the Schema file](https://github.com/NVIDIA-Merlin/core/blob/main/merlin/schema/schema.py)" + ] + }, + { + "cell_type": "markdown", + "id": "1cccd005", + "metadata": {}, + "source": [ + "## Downloading and preparing the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e8c63b03", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-06-23 04:16:53.178231: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:952] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2022-06-23 04:16:53.178616: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:952] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2022-06-23 04:16:53.178776: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:952] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2022-06-23 04:16:53.200029: I tensorflow/core/platform/cpu_feature_guard.cc:152] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "To enable them in other operations, 
rebuild TensorFlow with the appropriate compiler flags.\n", + "2022-06-23 04:16:53.201040: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:952] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2022-06-23 04:16:53.201216: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:952] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2022-06-23 04:16:53.201355: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:952] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2022-06-23 04:16:53.875404: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:952] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2022-06-23 04:16:53.875589: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:952] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2022-06-23 04:16:53.875729: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:952] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2022-06-23 04:16:53.875855: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24576 MB memory: -> device: 0, name: Quadro RTX 8000, pci bus id: 0000:08:00.0, compute capability: 7.5\n" + ] + } + ], + "source": [ + "import merlin.models.tf as mm\n", + "from merlin.schema.tags import Tags\n", + "import tensorflow as tf\n", + "from merlin.models.tf.prediction_tasks.classification import BinaryClassificationTask\n", + "from merlin.models.tf.blocks import *\n", + "from merlin.datasets.synthetic import 
generate_data\n", + "\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e16e1ec1", + "metadata": {}, + "outputs": [], + "source": [ + "train = generate_data('movielens-100k', num_rows=100_000)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e5400a9a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'rating_binary'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_column = train.schema.select_by_tag(Tags.TARGET).column_names[1]\n", + "target_column" + ] + }, + { + "cell_type": "markdown", + "id": "35fdb65b", + "metadata": {}, + "source": [ + "The schema holds vital information about our dataset. We can extract the embedding table size for the `movieId` column from it." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3955ab52", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1680.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.schema['movieId'].properties['embedding_sizes']['cardinality']" + ] + }, + { + "cell_type": "markdown", + "id": "d60873da", + "metadata": {}, + "source": [ + "From the schema, we can tell that the cardinality of `movieId` is 1680. Index 0 will be used in case an unknown `movieId` is encountered.\n", + "\n", + "In order to accommodate this, we initialize our embedding table of dimensionality 1681 by 64." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cc919e80", + "metadata": {}, + "outputs": [], + "source": [ + "pretrained_movie_embs = np.random.random((1681, 64))" + ] + }, + { + "cell_type": "markdown", + "id": "a5beff74", + "metadata": {}, + "source": [ + "This is only a mock up embedding table. In reality, this is where we would pass our embeddings from another model.\n", + "\n", + "The dimensionality of each embedding, that of 64, is arbitrary. 
We could have specified some other value here, though generally multiples of 8 tend to work well.\n", + "\n", + "Let us now feed our embedding table into our model." + ] + }, + { + "cell_type": "markdown", + "id": "f575b14b", + "metadata": {}, + "source": [ + "## Building the model" + ] + }, + { + "cell_type": "markdown", + "id": "8536b88b", + "metadata": {}, + "source": [ + "We now have everything we need to construct a model and train on our custom embeddings. In order to do so, we will leverage the `TensorInitializer`." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "22af1945", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-06-23 04:16:54.257546: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n" + ] + } + ], + "source": [ + "model = mm.DCNModel(\n", + " train.schema,\n", + " depth=2,\n", + " deep_block=mm.MLPBlock([64, 32]),\n", + " prediction_tasks=mm.BinaryClassificationTask(target_column),\n", + " embedding_options=mm.EmbeddingOptions(\n", + " embeddings_initializers={\n", + " \"movieId\": mm.TensorInitializer(pretrained_movie_embs),\n", + " }\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "73bdd67d", + "metadata": {}, + "source": [ + "We could have created the model without passing anything for the `embedding_options`. The model would still be able to infer how to construct itself (what should be the dimensionality of the input layer and so on) from the information contained in the schema.\n", + "\n", + "Passing a `TensorInitializer` as part of the `embedding_options` tells our model to use our embedding table (`pretrained_movie_embs`) for that particular column of our dataset (`movieId`) as opposed to the model randomly initializing a brand new embedding matrix. 
For categorical columns where we do not provide this information, the model will go with the standard initialization logic, which is to create an embedding table of appropriate size and perform random preinitialization." + ] + }, + { + "cell_type": "markdown", + "id": "dbb0df47", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "markdown", + "id": "f4cdb0c1", + "metadata": {}, + "source": [ + "We train our model with `AUC` as our metric.\n", + "\n", + "As we use synthetic data, the AUC score will not improve significantly." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0b96fc50", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 8.02 ms, sys: 24 µs, total: 8.04 ms\n", + "Wall time: 7.49 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "opt = tf.keras.optimizers.Adagrad(learning_rate=1e-1)\n", + "model.compile(optimizer=opt, run_eagerly=False, metrics=[tf.keras.metrics.AUC()])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f9d78213", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n", + "98/98 [==============================] - 3s 5ms/step - loss: 0.6938 - auc: 0.5015 - regularization_loss: 0.0000e+00\n", + "Epoch 2/10\n", + "98/98 [==============================] - 0s 5ms/step - loss: 0.6932 - auc: 0.5059 - regularization_loss: 0.0000e+00\n", + "Epoch 3/10\n", + "98/98 [==============================] - 0s 5ms/step - loss: 0.6930 - auc: 0.5089 - regularization_loss: 0.0000e+00\n", + "Epoch 4/10\n", + "98/98 [==============================] - 0s 5ms/step - loss: 0.6929 - auc: 0.5123 - regularization_loss: 0.0000e+00\n", + "Epoch 5/10\n", + "98/98 [==============================] - 0s 5ms/step - loss: 0.6929 - auc: 0.5118 - regularization_loss: 0.0000e+00\n", + "Epoch 6/10\n", + "98/98 [==============================] - 0s 5ms/step - loss: 0.6928 - auc: 0.5137 - 
regularization_loss: 0.0000e+00\n", + "Epoch 7/10\n", + "98/98 [==============================] - 1s 5ms/step - loss: 0.6928 - auc: 0.5149 - regularization_loss: 0.0000e+00\n", + "Epoch 8/10\n", + "98/98 [==============================] - 0s 5ms/step - loss: 0.6926 - auc: 0.5176 - regularization_loss: 0.0000e+00\n", + "Epoch 9/10\n", + "98/98 [==============================] - 0s 5ms/step - loss: 0.6926 - auc: 0.5177 - regularization_loss: 0.0000e+00\n", + "Epoch 10/10\n", + "98/98 [==============================] - 0s 5ms/step - loss: 0.6926 - auc: 0.5181 - regularization_loss: 0.0000e+00\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.fit(train, batch_size=1024, epochs=10)" + ] + }, + { + "cell_type": "markdown", + "id": "bb96ac21", + "metadata": {}, + "source": [ + "The model trains and we have utilized pretrained embeddings." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/unit/tf/examples/test_usecase_pretrained_embeddings.py b/tests/unit/tf/examples/test_usecase_pretrained_embeddings.py new file mode 100644 index 0000000000..f250eaba9d --- /dev/null +++ b/tests/unit/tf/examples/test_usecase_pretrained_embeddings.py @@ -0,0 +1,12 @@ +from testbook import testbook + +from tests.conftest import REPO_ROOT + + +@testbook( + REPO_ROOT / "examples/usecases/entertainment-with-pretrained-embeddings.ipynb", execute=False +) +def test_usecase_pretrained_embeddings(tb): + tb.execute() + model = tb.ref("model") + assert 
set(model.history.history.keys()) == {"auc", "loss", "regularization_loss"}