diff --git a/.flake8 b/.flake8
index ff7cd897..6c1ac61b 100644
--- a/.flake8
+++ b/.flake8
@@ -1,2 +1,4 @@
 [flake8]
-exclude = tmp.py, tests/
\ No newline at end of file
+ignore = E203
+exclude = tmp.py, tests/
+max-line-length = 120
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 283349bf..9fff4fe3 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -13,7 +13,16 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9]
+        include:
+          - python-version: '3.7'
+            tf-version: '2.7'
+          - python-version: '3.7'
+            tf-version: '2.11'
+          - python-version: '3.10'
+            # Python 3.10 only supports TF >= 2.8
+            tf-version: '2.8'
+          - python-version: '3.10'
+            tf-version: '2.11'
 
     steps:
     - uses: actions/checkout@v2
@@ -26,9 +35,13 @@ jobs:
         python -m pip install --upgrade pip
         pip install coveralls
 
-    - name: Install package
+    - name: Install dev packages
       run: |
-        pip install ".[tensorflow,dev]"
+        pip install ".[dev]"
+
+    - name: Install TF package
+      run: |
+        pip install tensorflow==${{ matrix.tf-version }}
 
     - name: Lint with flake8
       run: |
diff --git a/.gitignore b/.gitignore
index 724f9026..9dc42d92 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,6 @@ release.sh
 benchmark/supervised/datasets/
 benchmark/supervised/models/
 datasets/
-models/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/examples/README.md b/examples/README.md
index 44940da7..7263543f 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -6,4 +6,5 @@
 | [Hello World](./supervised_hello_world.ipynb) | Supervised | Train and use an image similarity model to find similar looking MNIST digits |
 | [Self-Supervised Learning](./unsupervised_hello_world.ipynb) | Unsupervised | Train an image model using the SimSiam based self-supervised contrastive learning. |
 | [visualization](./supervised/visualization.ipynb) | Supervised | Train an image similarity model on the Stanford Dogs dataset using Evaluation Callbacks and the interactive visualizer |
-| [Sampler IO Cookbook](./sampler_io_cookbook.ipynb) | Utils | Examples demonstrating how to use the various in memory batch samplers.
+| [Sampler IO Cookbook](./sampler_io_cookbook.ipynb) | Utils | Examples demonstrating how to use the various in memory batch samplers. |
+| [CLIP finetuning](./multimodal_example.ipynb) | Supervised | Finetune CLIP on the artic-dataset using multiple negatives ranking loss. |
diff --git a/examples/multimodal_example.ipynb b/examples/multimodal_example.ipynb
new file mode 100644
index 00000000..b0c324fa
--- /dev/null
+++ b/examples/multimodal_example.ipynb
@@ -0,0 +1,1672 @@
+{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "008b0470", + "metadata": {}, + "source": [ + "Copyright 2023 abhisharsinha and The TensorFlow Similarity Authors."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31b6f6d3", + "metadata": {}, + "outputs": [], + "source": [ + "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5e22169f", + "metadata": {}, + "source": [ + "# TensorFlow Similarity MultiModal Example" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "252467c5", + "metadata": { + "id": "252467c5" + }, + "source": [ + "Imagine that you're running an online art gallery with thousands of paintings. You've allowed your customers to search for paintings by artists or painting titles, but what if they want to find paintings based on their descriptions? Unfortunately, not all paintings come with descriptions, and manually writing them would require a massive amount of effort and slow down the digitization process. So, what if we could search for paintings by analyzing their images, without having to label them manually? For instance, if a customer searches for a painting of sea waves in a Japanese style, they would expect to see something like this." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5b54b1cb", + "metadata": { + "id": "5b54b1cb" + }, + "source": [ + "" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2ad1a349", + "metadata": { + "id": "2ad1a349" + }, + "source": [ + "### Notebook goal\n", + "\n", + "We will be training an image encoder model that indexes images, as well as a text encoder model that encodes our search queries. With these models, we can locate an image by finding the nearest image vector to our text vector. To accomplish this, we will fine-tune the CLIP model, which was trained on a large dataset and has demonstrated impressive zero-shot performance. Our approach will be to use a pre-trained model as a baseline, and attempt to improve its performance on our task through fine-tuning. To accomplish this, we will use a dataset from the [Art Institute of Chicago](https://www.artic.edu/), which contains images of artworks along with metadata such as the title, artist, alt_text, and description. 
We will use the descriptions, as they are the most suitable for our specific problem.\"" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "d27ce8a3", + "metadata": {}, + "outputs": [], + "source": [ + "import gc\n", + "import os\n", + "import textwrap\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm import tqdm\n", + "from matplotlib import pyplot as plt\n", + "# from tabulate import tabulate\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# INFO messages are not printed.\n", + "# This must be run before loading other modules.\n", + "os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"1\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7188af19", + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "083b3d18", + "metadata": {}, + "outputs": [], + "source": [ + "# install TF similarity if needed\n", + "try:\n", + " import tensorflow_similarity as tfsim # main package\n", + "except ModuleNotFoundError:\n", + " !pip install tensorflow_similarity\n", + " import tensorflow_similarity as tfsim" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "044843d7", + "metadata": {}, + "outputs": [], + "source": [ + "# Install Transformer deps from hugging face\n", + "try:\n", + " from transformers import TFCLIPTextModel, TFCLIPVisionModel, CLIPTokenizer, TFCLIPModel\n", + "except ModuleNotFoundError:\n", + " !pip install transformers\n", + " from transformers import TFCLIPTextModel, TFCLIPVisionModel, CLIPTokenizer, TFCLIPModel" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2355313f", + "metadata": {}, + "outputs": [], + "source": [ + "tfsim.utils.tf_cap_memory()\n", + "# Clear out any old model state.\n", + "gc.collect()\n", + "tf.keras.backend.clear_session()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e6924a49", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TensorFlow: 2.11.0\n", + "TensorFlow Similarity 0.17.0.dev18\n" + ] + } + ], + "source": [ + "print(\"TensorFlow:\", tf.__version__)\n", + "print(\"TensorFlow Similarity\", tfsim.__version__)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "818ce1c6", + "metadata": { + "id": "818ce1c6" + }, + "outputs": [], + "source": [ + "N_CPU = os.cpu_count()\n", + "IMG_SIZE = 224\n", + "BATCH_SIZE = 64\n", + "COLOR_CHANNELS = 3\n", + "N_TOKENS = 77\n", + "DATA_DIR = \"multi_modal_datasets\"\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "JJRebGa4pdMu", + "metadata": { + "id": "JJRebGa4pdMu" + }, + "source": [ + "## Data preparation" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9f8a8952", + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(DATA_DIR):\n", + " os.makedirs(DATA_DIR)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "jWsG-qx3Aj5Q", + "metadata": { + "id": "jWsG-qx3Aj5Q" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File ‘multi_modal_datasets/artworks.csv’ already there; not retrieving.\n", + "\n", + "File ‘multi_modal_datasets/artic-dataset.zip’ already there; not retrieving.\n", + "\n", + "Archive: multi_modal_datasets/artic-dataset.zip\n" + ] + } + ], + "source": [ + "!wget -nc https://huggingface.co/datasets/abhishars/artic-dataset/resolve/main/artworks.csv -P {DATA_DIR}\n", + "!wget 
-nc https://storage.googleapis.com/mys-released-models/gsoc/artic-dataset.zip -P {DATA_DIR}\n", + "!unzip -n {DATA_DIR}/artic-dataset.zip -d {DATA_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3c46d48c", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3c46d48c", + "outputId": "72fcf867-4212-441f-b747-9816ac984220" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 37649 unique image ids\n" + ] + } + ], + "source": [ + "# Get the set of unique image ids in the unzipped dir\n", + "ARTIC_IMAGE_DIR = os.path.join(DATA_DIR, \"artic-dataset\")\n", + "image_ids = [os.path.splitext(fn)[0] for fn in os.listdir(ARTIC_IMAGE_DIR)]\n", + "image_ids = set(image_ids)\n", + "\n", + "print(f\"There are {len(image_ids)} unique image ids\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d3bbb2ec", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 754 + }, + "id": "d3bbb2ec", + "outputId": "d507b727-4384-4c00-bf87-7787cff6152a" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | art_id | \n", + "image_id | \n", + "alt_text | \n", + "description | \n", + "full_description | \n", + "artist | \n", + "title | \n", + "
---|---|---|---|---|---|---|---|
0 | \n", + "8101 | \n", + "c24feb4e-d7f5-791e-58ee-5db1a40d0a0d | \n", + "A work made of cotton, plain weave; drawnwork ... | \n", + "This apron mimics lace in its play of transluc... | \n", + "This apron mimics lace in its play of transluc... | \n", + "NaN | \n", + "Apron | \n", + "
1 | \n", + "11460 | \n", + "b84f047e-f871-48c5-6bbf-618731650105 | \n", + "A work made of engraving in black on ivory lai... | \n", + "The ', <a href='https://www.artic.edu/artists/... | \n", + "The ', <a href='https://www.artic.edu/artists/... | \n", + "Master of the E-Series Tarocchi | \n", + "Philosophy, plate 28 from Arts and Sciences | \n", + "
2 | \n", + "21550 | \n", + "40694d77-c9d7-d861-e201-4228e99316e7 | \n", + "A work made of lithograph in black on white wo... | \n", + "A favorite of Daumier’s, this print is a play ... | \n", + "A favorite of Daumier’s, this print is a play ... | \n", + "Honoré-Victorin Daumier | \n", + "Sight, plate 39 from Types Parisiens | \n", + "
3 | \n", + "25374 | \n", + "329b85e4-2865-1281-feeb-5c0ab47e500e | \n", + "A work made of etching in black on ivory laid ... | \n", + "This etching might represent two related prove... | \n", + "This etching might represent two related prove... | \n", + "Pieter Bruegel, the elder | \n", + "The Hare Hunters | \n", + "