From cb2ad0d9ea62900dd745c0481130e89c94c849fc Mon Sep 17 00:00:00 2001 From: Boris Feld Date: Thu, 4 Jul 2024 17:15:26 +0200 Subject: [PATCH] Rename mosaicml composer directory to composer and add notebook example --- .github/workflows/test-examples.yml | 3 +- .../mosaicml-getting-started/README.md | 0 .../mosaicml-getting-started.py | 0 .../mosaicml-getting-started/requirements.txt | 0 .../composer/notebooks/comet_composer.ipynb | 1338 +++++++++++++++++ 5 files changed, 1340 insertions(+), 1 deletion(-) rename integrations/model-training/{mosaicml => composer}/mosaicml-getting-started/README.md (100%) rename integrations/model-training/{mosaicml => composer}/mosaicml-getting-started/mosaicml-getting-started.py (100%) rename integrations/model-training/{mosaicml => composer}/mosaicml-getting-started/requirements.txt (100%) create mode 100644 integrations/model-training/composer/notebooks/comet_composer.ipynb diff --git a/.github/workflows/test-examples.yml b/.github/workflows/test-examples.yml index 7167c84..d096801 100644 --- a/.github/workflows/test-examples.yml +++ b/.github/workflows/test-examples.yml @@ -21,6 +21,7 @@ jobs: - integrations/model-evaluation/gradio/notebooks/Gradio_and_Comet.ipynb - integrations/model-evaluation/gradio/notebooks/Logging_Model_Inferences_with_Comet_and_Gradio.ipynb - integrations/model-optimization/ray-tune/notebooks/Comet_and_Ray.ipynb + - integrations/model-training/composer/notebooks/comet_composer.ipynb - integrations/model-training/fastai/notebooks/fastai_hello_world.ipynb - integrations/model-training/hugging_face/notebooks/Comet_with_Hugging_Face_Trainer.ipynb - integrations/model-training/keras/notebooks/Comet_with_Keras.ipynb @@ -103,11 +104,11 @@ jobs: example: - {script: "integrations/model-evaluation/shap/shap-hello-world/shap-hello-world.py", arg: ""} - {script: "integrations/model-optimization/optuna/optuna-hello-world/optuna-hello-world.py", arg: ""} + - {script: 
"integrations/model-training/composer/mosaicml-getting-started/mosaicml-getting-started.py", arg: ""} - {script: "integrations/model-training/fastai/fastai-hello-world/fastai_hello_world.py", arg: ""} - {script: "integrations/model-training/hugging_face/transformers-distilbert-fine-tuning/transformers-distilbert-fine-tuning.py", arg: ""} - {script: "integrations/model-training/keras/keras-mnist-dnn/keras-mnist-dnn.py", arg: ""} - {script: "integrations/model-training/mlflow/mlflow-hello-world/mlflow-hello-world.py", arg: "run"} - - {script: "integrations/model-training/mosaicml/mosaicml-getting-started/mosaicml-getting-started.py", arg: ""} - {script: "integrations/model-training/pytorch-lightning/pytorch-lightning-optimizer/pytorch-lightning-optimizer.py", arg: ""} - {script: "integrations/model-training/pytorch/pytorch-mnist/pytorch-mnist-example.py", arg: ""} - {script: "integrations/model-training/pytorch/pytorch-rich-logging/pytorch-rich-logging-example.py", arg: ""} diff --git a/integrations/model-training/mosaicml/mosaicml-getting-started/README.md b/integrations/model-training/composer/mosaicml-getting-started/README.md similarity index 100% rename from integrations/model-training/mosaicml/mosaicml-getting-started/README.md rename to integrations/model-training/composer/mosaicml-getting-started/README.md diff --git a/integrations/model-training/mosaicml/mosaicml-getting-started/mosaicml-getting-started.py b/integrations/model-training/composer/mosaicml-getting-started/mosaicml-getting-started.py similarity index 100% rename from integrations/model-training/mosaicml/mosaicml-getting-started/mosaicml-getting-started.py rename to integrations/model-training/composer/mosaicml-getting-started/mosaicml-getting-started.py diff --git a/integrations/model-training/mosaicml/mosaicml-getting-started/requirements.txt b/integrations/model-training/composer/mosaicml-getting-started/requirements.txt similarity index 100% rename from 
integrations/model-training/mosaicml/mosaicml-getting-started/requirements.txt rename to integrations/model-training/composer/mosaicml-getting-started/requirements.txt diff --git a/integrations/model-training/composer/notebooks/comet_composer.ipynb b/integrations/model-training/composer/notebooks/comet_composer.ipynb new file mode 100644 index 0000000..f894a0b --- /dev/null +++ b/integrations/model-training/composer/notebooks/comet_composer.ipynb @@ -0,0 +1,1338 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xKSEXlINHv63" + }, + "source": [ + "[Comet](https://www.comet.com/site/products/ml-experiment-tracking/) is an MLOps Platform that is designed to help Data Scientists and Teams build better models faster! Comet provides tooling to track, explain, manage, and monitor your models in a single place! It works with Jupyter Notebooks and Scripts and most importantly it's 100% free to get started!\n", + "\n", + "[Composer](https://github.com/mosaicml/composer/tree/dev) is an open-source deep learning training library by [MosaicML](https://www.mosaicml.com/). Built on top of PyTorch, the Composer library makes it easier to implement distributed training workflows on large-scale clusters.\n", + "\n", + "Instrument Composer with Comet to start managing experiments, create dataset versions and track hyperparameters for faster and easier reproducibility and collaboration.\n", + "\n", + "[Find more information about our integration with Composer](https://www.comet.com/docs/v2/integrations/ml-frameworks/composer/).\n", + "\n", + "Curious about how Comet can help you build better models, faster? Find out more about [Comet](https://www.comet.com/site/products/ml-experiment-tracking/) and our [other integrations](https://www.comet.com/docs/v2/integrations/overview/).\n", + "\n", + "Get a preview for what's to come. 
Check out a completed experiment created from this notebook [here](TODO).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YZJS1h5pHv64" + }, + "source": [ + "# Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "OwaG-avGHv66" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting composer>=0.16.1\n", + " Downloading composer-0.23.5-py3-none-any.whl.metadata (28 kB)\n", + "Collecting comet_ml>=3.33.10\n", + " Using cached comet_ml-3.43.2-py3-none-any.whl.metadata (3.9 kB)\n", + "Requirement already satisfied: pyyaml<7,>=6.0 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from composer>=0.16.1) (6.0.1)\n", + "Collecting tqdm<5,>=4.62.3 (from composer>=0.16.1)\n", + " Using cached tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)\n", + "Collecting torchmetrics<1.3.3,>=0.10.0 (from composer>=0.16.1)\n", + " Using cached torchmetrics-1.3.2-py3-none-any.whl.metadata (19 kB)\n", + "Collecting torch-optimizer<0.4,>=0.3.0 (from composer>=0.16.1)\n", + " Using cached torch_optimizer-0.3.0-py3-none-any.whl.metadata (55 kB)\n", + "Collecting torchvision<0.18.2,>=0.13.1 (from composer>=0.16.1)\n", + " Using cached torchvision-0.18.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.6 kB)\n", + "Collecting torch<2.3.2,>=2.1.2 (from composer>=0.16.1)\n", + " Using cached torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)\n", + "Requirement already satisfied: requests<3,>=2.26.0 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from composer>=0.16.1) (2.32.3)\n", + "Requirement already satisfied: numpy<2.1.0,>=1.21.5 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from composer>=0.16.1) (2.0.0)\n", + "Requirement already satisfied: psutil<7,>=5.8.0 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from 
composer>=0.16.1) (6.0.0)\n", + "Collecting coolname<3,>=1.1.0 (from composer>=0.16.1)\n", + " Using cached coolname-2.2.0-py2.py3-none-any.whl.metadata (6.2 kB)\n", + "Collecting tabulate==0.9.0 (from composer>=0.16.1)\n", + " Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)\n", + "Collecting py-cpuinfo<10,>=8.0.0 (from composer>=0.16.1)\n", + " Using cached py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)\n", + "Requirement already satisfied: packaging<24.2,>=21.3.0 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from composer>=0.16.1) (24.1)\n", + "Collecting importlib-metadata<7,>=5.0.0 (from composer>=0.16.1)\n", + " Using cached importlib_metadata-6.11.0-py3-none-any.whl.metadata (4.9 kB)\n", + "Collecting mosaicml-cli<0.7,>=0.5.25 (from composer>=0.16.1)\n", + " Downloading mosaicml_cli-0.6.37-py3-none-any.whl.metadata (4.9 kB)\n", + "Requirement already satisfied: pillow<11,>=10.3.0 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from composer>=0.16.1) (10.4.0)\n", + "Collecting everett<3.2.0,>=1.0.1 (from everett[ini]<3.2.0,>=1.0.1->comet_ml>=3.33.10)\n", + " Using cached everett-3.1.0-py2.py3-none-any.whl.metadata (17 kB)\n", + "Requirement already satisfied: jsonschema!=3.1.0,>=2.6.0 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from comet_ml>=3.33.10) (4.22.0)\n", + "Collecting python-box<7.0.0 (from comet_ml>=3.33.10)\n", + " Using cached python_box-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.8 kB)\n", + "Collecting requests-toolbelt>=0.8.0 (from comet_ml>=3.33.10)\n", + " Using cached requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)\n", + "Collecting semantic-version>=2.8.0 (from comet_ml>=3.33.10)\n", + " Using cached semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)\n", + "Collecting sentry-sdk>=1.1.0 (from comet_ml>=3.33.10)\n", + " Using cached 
sentry_sdk-2.7.1-py2.py3-none-any.whl.metadata (14 kB)\n", + "Requirement already satisfied: simplejson in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from comet_ml>=3.33.10) (3.19.2)\n", + "Requirement already satisfied: urllib3>=1.21.1 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from comet_ml>=3.33.10) (2.2.2)\n", + "Collecting wrapt>=1.11.2 (from comet_ml>=3.33.10)\n", + " Using cached wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n", + "Collecting wurlitzer>=1.0.2 (from comet_ml>=3.33.10)\n", + " Using cached wurlitzer-3.1.1-py3-none-any.whl.metadata (2.5 kB)\n", + "Collecting dulwich!=0.20.33,>=0.20.6 (from comet_ml>=3.33.10)\n", + " Using cached dulwich-0.22.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)\n", + "Collecting rich>=13.3.2 (from comet_ml>=3.33.10)\n", + " Using cached rich-13.7.1-py3-none-any.whl.metadata (18 kB)\n", + "Collecting configobj (from everett[ini]<3.2.0,>=1.0.1->comet_ml>=3.33.10)\n", + " Using cached configobj-5.0.8-py2.py3-none-any.whl.metadata (3.4 kB)\n", + "Collecting zipp>=0.5 (from importlib-metadata<7,>=5.0.0->composer>=0.16.1)\n", + " Using cached zipp-3.19.2-py3-none-any.whl.metadata (3.6 kB)\n", + "Requirement already satisfied: attrs>=22.2.0 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from jsonschema!=3.1.0,>=2.6.0->comet_ml>=3.33.10) (23.2.0)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from jsonschema!=3.1.0,>=2.6.0->comet_ml>=3.33.10) (2023.12.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from jsonschema!=3.1.0,>=2.6.0->comet_ml>=3.33.10) (0.35.1)\n", + "Requirement already 
satisfied: rpds-py>=0.7.1 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from jsonschema!=3.1.0,>=2.6.0->comet_ml>=3.33.10) (0.18.1)\n", + "Collecting argcomplete>=2.0.0 (from mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Using cached argcomplete-3.4.0-py3-none-any.whl.metadata (16 kB)\n", + "Requirement already satisfied: arrow>=1.2.2 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1) (1.3.0)\n", + "Collecting backoff>=2.2.1 (from mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Using cached backoff-2.2.1-py3-none-any.whl.metadata (14 kB)\n", + "Collecting gql>=3.4.0 (from gql[websockets]>=3.4.0->mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Using cached gql-3.5.0-py2.py3-none-any.whl.metadata (9.2 kB)\n", + "Requirement already satisfied: prompt-toolkit>=3.0.29 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1) (3.0.47)\n", + "Collecting protobuf>=3.20.0 (from mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Downloading protobuf-5.27.2-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)\n", + "Collecting questionary>=1.10.0 (from mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Using cached questionary-2.0.1-py3-none-any.whl.metadata (5.4 kB)\n", + "Collecting ruamel.yaml>=0.17.21 (from mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Using cached ruamel.yaml-0.18.6-py3-none-any.whl.metadata (23 kB)\n", + "Requirement already satisfied: typing-extensions>=4.0.1 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1) (4.12.2)\n", + "Collecting validators>=0.20.0 (from mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Downloading validators-0.30.0-py3-none-any.whl.metadata (3.8 kB)\n", + "Collecting termcolor>=1.1.0 (from 
mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Using cached termcolor-2.4.0-py3-none-any.whl.metadata (6.1 kB)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from requests<3,>=2.26.0->composer>=0.16.1) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from requests<3,>=2.26.0->composer>=0.16.1) (3.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from requests<3,>=2.26.0->composer>=0.16.1) (2024.7.4)\n", + "Collecting markdown-it-py>=2.2.0 (from rich>=13.3.2->comet_ml>=3.33.10)\n", + " Using cached markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from rich>=13.3.2->comet_ml>=3.33.10) (2.18.0)\n", + "Collecting filelock (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)\n", + "Collecting sympy (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached sympy-1.12.1-py3-none-any.whl.metadata (12 kB)\n", + "Collecting networkx (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached networkx-3.3-py3-none-any.whl.metadata (5.1 kB)\n", + "Requirement already satisfied: jinja2 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from torch<2.3.2,>=2.1.2->composer>=0.16.1) (3.1.4)\n", + "Collecting fsspec (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)\n", + "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 
kB)\n", + "Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", + "Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", + "Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", + "Collecting nvidia-cublas-cu12==12.1.3.1 (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", + "Collecting nvidia-cufft-cu12==11.0.2.54 (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", + "Collecting nvidia-curand-cu12==10.3.2.106 (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", + "Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", + "Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", + "Collecting nvidia-nccl-cu12==2.20.5 (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)\n", + "Collecting nvidia-nvtx-cu12==12.1.105 (from torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.7 kB)\n", + "Collecting triton==2.3.1 (from 
torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached triton-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)\n", + "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", + "Collecting pytorch-ranger>=0.1.1 (from torch-optimizer<0.4,>=0.3.0->composer>=0.16.1)\n", + " Using cached pytorch_ranger-0.1.1-py3-none-any.whl.metadata (509 bytes)\n", + "Collecting lightning-utilities>=0.8.0 (from torchmetrics<1.3.3,>=0.10.0->composer>=0.16.1)\n", + " Downloading lightning_utilities-0.11.3.post0-py3-none-any.whl.metadata (4.7 kB)\n", + "Requirement already satisfied: python-dateutil>=2.7.0 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from arrow>=1.2.2->mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1) (2.9.0.post0)\n", + "Requirement already satisfied: types-python-dateutil>=2.8.10 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from arrow>=1.2.2->mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1) (2.9.0.20240316)\n", + "Collecting graphql-core<3.3,>=3.2 (from gql>=3.4.0->gql[websockets]>=3.4.0->mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Using cached graphql_core-3.2.3-py3-none-any.whl.metadata (10 kB)\n", + "Collecting yarl<2.0,>=1.6 (from gql>=3.4.0->gql[websockets]>=3.4.0->mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Using cached yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)\n", + "Requirement already satisfied: anyio<5,>=3.0 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from gql>=3.4.0->gql[websockets]>=3.4.0->mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1) (4.4.0)\n", + "Collecting websockets<12,>=10 (from gql[websockets]>=3.4.0->mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Using cached 
websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n", + "Requirement already satisfied: setuptools in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from lightning-utilities>=0.8.0->torchmetrics<1.3.3,>=0.10.0->composer>=0.16.1) (70.0.0)\n", + "Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=13.3.2->comet_ml>=3.33.10)\n", + " Using cached mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)\n", + "Requirement already satisfied: wcwidth in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from prompt-toolkit>=3.0.29->mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1) (0.2.13)\n", + "Collecting prompt-toolkit>=3.0.29 (from mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Using cached prompt_toolkit-3.0.36-py3-none-any.whl.metadata (7.0 kB)\n", + "Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.21->mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Using cached ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (2.2 kB)\n", + "Requirement already satisfied: six in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from configobj->everett[ini]<3.2.0,>=1.0.1->comet_ml>=3.33.10) (1.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from jinja2->torch<2.3.2,>=2.1.2->composer>=0.16.1) (2.1.5)\n", + "Collecting mpmath<1.4.0,>=1.1.0 (from sympy->torch<2.3.2,>=2.1.2->composer>=0.16.1)\n", + " Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)\n", + "Requirement already satisfied: sniffio>=1.1 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from anyio<5,>=3.0->gql>=3.4.0->gql[websockets]>=3.4.0->mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1) (1.3.1)\n", + "Requirement already 
satisfied: exceptiongroup>=1.0.2 in /home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/lib/python3.10/site-packages (from anyio<5,>=3.0->gql>=3.4.0->gql[websockets]>=3.4.0->mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1) (1.2.1)\n", + "Collecting multidict>=4.0 (from yarl<2.0,>=1.6->gql>=3.4.0->gql[websockets]>=3.4.0->mosaicml-cli<0.7,>=0.5.25->composer>=0.16.1)\n", + " Using cached multidict-6.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)\n", + "Downloading composer-0.23.5-py3-none-any.whl (598 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m598.4/598.4 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m0m eta \u001b[36m0:00:01\u001b[0m:01\u001b[0m\n", + "\u001b[?25hUsing cached tabulate-0.9.0-py3-none-any.whl (35 kB)\n", + "Using cached comet_ml-3.43.2-py3-none-any.whl (677 kB)\n", + "Using cached coolname-2.2.0-py2.py3-none-any.whl (37 kB)\n", + "Using cached dulwich-0.22.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (979 kB)\n", + "Using cached everett-3.1.0-py2.py3-none-any.whl (35 kB)\n", + "Using cached importlib_metadata-6.11.0-py3-none-any.whl (23 kB)\n", + "Downloading mosaicml_cli-0.6.37-py3-none-any.whl (274 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m274.5/274.5 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hUsing cached py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)\n", + "Using cached python_box-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)\n", + "Using cached requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)\n", + "Using cached rich-13.7.1-py3-none-any.whl (240 kB)\n", + "Using cached semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n", + "Using cached sentry_sdk-2.7.1-py2.py3-none-any.whl (300 kB)\n", + "Using cached torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)\n", + "Using cached 
nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n", + "Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n", + "Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n", + "Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n", + "Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n", + "Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n", + "Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n", + "Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n", + "Using cached nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n", + "Using cached nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)\n", + "Using cached nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n", + "Using cached triton-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (168.1 MB)\n", + "Using cached torch_optimizer-0.3.0-py3-none-any.whl (61 kB)\n", + "Using cached torchmetrics-1.3.2-py3-none-any.whl (841 kB)\n", + "Using cached torchvision-0.18.1-cp310-cp310-manylinux1_x86_64.whl (7.0 MB)\n", + "Using cached tqdm-4.66.4-py3-none-any.whl (78 kB)\n", + "Using cached wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (80 kB)\n", + "Using cached wurlitzer-3.1.1-py3-none-any.whl (8.6 kB)\n", + "Using cached argcomplete-3.4.0-py3-none-any.whl (42 kB)\n", + "Using cached backoff-2.2.1-py3-none-any.whl (15 kB)\n", + "Using cached gql-3.5.0-py2.py3-none-any.whl (74 kB)\n", + "Downloading lightning_utilities-0.11.3.post0-py3-none-any.whl (26 kB)\n", + "Using cached markdown_it_py-3.0.0-py3-none-any.whl (87 kB)\n", + "Downloading protobuf-5.27.2-cp38-abi3-manylinux2014_x86_64.whl (309 kB)\n", + "\u001b[2K 
\u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m309.3/309.3 kB\u001b[0m \u001b[31m39.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hUsing cached pytorch_ranger-0.1.1-py3-none-any.whl (14 kB)\n", + "Using cached questionary-2.0.1-py3-none-any.whl (34 kB)\n", + "Using cached prompt_toolkit-3.0.36-py3-none-any.whl (386 kB)\n", + "Using cached ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)\n", + "Using cached termcolor-2.4.0-py3-none-any.whl (7.7 kB)\n", + "Downloading validators-0.30.0-py3-none-any.whl (42 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.4/42.4 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hUsing cached zipp-3.19.2-py3-none-any.whl (9.0 kB)\n", + "Using cached configobj-5.0.8-py2.py3-none-any.whl (36 kB)\n", + "Using cached filelock-3.15.4-py3-none-any.whl (16 kB)\n", + "Using cached fsspec-2024.6.1-py3-none-any.whl (177 kB)\n", + "Using cached networkx-3.3-py3-none-any.whl (1.7 MB)\n", + "Using cached sympy-1.12.1-py3-none-any.whl (5.7 MB)\n", + "Using cached graphql_core-3.2.3-py3-none-any.whl (202 kB)\n", + "Using cached mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n", + "Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)\n", + "Using cached ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (526 kB)\n", + "Using cached websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)\n", + "Using cached yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (301 kB)\n", + "Using cached nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl (21.3 MB)\n", + "Using cached multidict-6.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (124 kB)\n", + "Installing collected packages: py-cpuinfo, mpmath, everett, coolname, zipp, wurlitzer, wrapt, websockets, 
validators, tqdm, termcolor, tabulate, sympy, sentry-sdk, semantic-version, ruamel.yaml.clib, python-box, protobuf, prompt-toolkit, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, networkx, multidict, mdurl, lightning-utilities, graphql-core, fsspec, filelock, dulwich, configobj, backoff, argcomplete, yarl, triton, ruamel.yaml, requests-toolbelt, questionary, nvidia-cusparse-cu12, nvidia-cudnn-cu12, markdown-it-py, importlib-metadata, rich, nvidia-cusolver-cu12, gql, torch, comet_ml, torchvision, torchmetrics, pytorch-ranger, mosaicml-cli, torch-optimizer, composer\n", + " Attempting uninstall: prompt-toolkit\n", + " Found existing installation: prompt_toolkit 3.0.47\n", + " Uninstalling prompt_toolkit-3.0.47:\n", + " Successfully uninstalled prompt_toolkit-3.0.47\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "ipython 8.26.0 requires prompt-toolkit<3.1.0,>=3.0.41, but you have prompt-toolkit 3.0.36 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed argcomplete-3.4.0 backoff-2.2.1 comet_ml-3.43.2 composer-0.23.5 configobj-5.0.8 coolname-2.2.0 dulwich-0.22.1 everett-3.1.0 filelock-3.15.4 fsspec-2024.6.1 gql-3.5.0 graphql-core-3.2.3 importlib-metadata-6.11.0 lightning-utilities-0.11.3.post0 markdown-it-py-3.0.0 mdurl-0.1.2 mosaicml-cli-0.6.37 mpmath-1.3.0 multidict-6.0.5 networkx-3.3 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.5.82 nvidia-nvtx-cu12-12.1.105 prompt-toolkit-3.0.36 protobuf-5.27.2 py-cpuinfo-9.0.0 python-box-6.1.0 pytorch-ranger-0.1.1 questionary-2.0.1 requests-toolbelt-1.0.0 rich-13.7.1 ruamel.yaml-0.18.6 ruamel.yaml.clib-0.2.8 semantic-version-2.10.0 sentry-sdk-2.7.1 sympy-1.12.1 tabulate-0.9.0 termcolor-2.4.0 torch-2.3.1 torch-optimizer-0.3.0 torchmetrics-1.3.2 torchvision-0.18.1 tqdm-4.66.4 triton-2.3.1 validators-0.30.0 websockets-11.0.3 wrapt-1.16.0 wurlitzer-3.1.1 yarl-1.9.4 zipp-3.19.2\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.1.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/home/lothiraldan/.virtualenvs/tempenv-0634243152b7b/bin/python -m pip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install \"composer>=0.16.1\" 
\"comet_ml>=3.33.10\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4_xqF2RNHv67" + }, + "source": [ + "# Initialize Comet" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import comet_ml\n", + "\n", + "comet_ml.init(project_name=\"comet-example-mosaicml-getting-started-notebook\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "ZyMHDdQ7Hv68" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import time\n", + "\n", + "import torch\n", + "import torch.utils.data\n", + "\n", + "import composer\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from torchvision import datasets, transforms\n", + "from composer.loggers import CometMLLogger\n", + "\n", + "torch.manual_seed(42) # For replicability" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ORxmVNIyHv69" + }, + "source": [ + "# Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "gNzmNKSbHv69" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 170498071/170498071 [00:06<00:00, 25502895.17it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./data/cifar-10-python.tar.gz to ./data\n", + "Files already downloaded and verified\n" + ] + } + ], + "source": [ + 
"data_directory = \"./data\"\n", + "\n", + "# Normalization constants\n", + "mean = (0.507, 0.487, 0.441)\n", + "std = (0.267, 0.256, 0.276)\n", + "\n", + "batch_size = 1024\n", + "\n", + "cifar10_transforms = transforms.Compose(\n", + " [transforms.ToTensor(), transforms.Normalize(mean, std)]\n", + ")\n", + "\n", + "train_dataset = datasets.CIFAR10(\n", + " data_directory, train=True, download=True, transform=cifar10_transforms\n", + ")\n", + "test_dataset = datasets.CIFAR10(\n", + " data_directory, train=False, download=True, transform=cifar10_transforms\n", + ")\n", + "\n", + "# Our train and test dataloaders are PyTorch DataLoader objects!\n", + "train_dataloader = torch.utils.data.DataLoader(\n", + " train_dataset, batch_size=batch_size, shuffle=True\n", + ")\n", + "test_dataloader = torch.utils.data.DataLoader(\n", + " test_dataset, batch_size=batch_size, shuffle=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v7y7awUZHv6-" + }, + "source": [ + "# Define Model" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "dXO5w7SMHv6_" + }, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from composer.models import ComposerClassifier\n", + "\n", + "\n", + "class Block(nn.Module):\n", + " \"\"\"A ResNet block.\"\"\"\n", + "\n", + " def __init__(self, f_in: int, f_out: int, downsample: bool = False):\n", + " super(Block, self).__init__()\n", + "\n", + " stride = 2 if downsample else 1\n", + " self.conv1 = nn.Conv2d(\n", + " f_in, f_out, kernel_size=3, stride=stride, padding=1, bias=False\n", + " )\n", + " self.bn1 = nn.BatchNorm2d(f_out)\n", + " self.conv2 = nn.Conv2d(\n", + " f_out, f_out, kernel_size=3, stride=1, padding=1, bias=False\n", + " )\n", + " self.bn2 = nn.BatchNorm2d(f_out)\n", + " self.relu = nn.ReLU(inplace=True)\n", + "\n", + " # No parameters for shortcut connections.\n", + " if downsample or f_in != f_out:\n", + " 
self.shortcut = nn.Sequential(\n", + " nn.Conv2d(f_in, f_out, kernel_size=1, stride=2, bias=False),\n", + " nn.BatchNorm2d(f_out),\n", + " )\n", + " else:\n", + " self.shortcut = nn.Sequential()\n", + "\n", + " def forward(self, x: torch.Tensor):\n", + " out = self.relu(self.bn1(self.conv1(x)))\n", + " out = self.bn2(self.conv2(out))\n", + " out += self.shortcut(x)\n", + " return self.relu(out)\n", + "\n", + "\n", + "class ResNetCIFAR(nn.Module):\n", + " \"\"\"A residual neural network as originally designed for CIFAR-10.\"\"\"\n", + "\n", + " def __init__(self, outputs: int = 10):\n", + " super(ResNetCIFAR, self).__init__()\n", + "\n", + " depth = 56\n", + " width = 16\n", + " num_blocks = (depth - 2) // 6\n", + "\n", + " plan = [(width, num_blocks), (2 * width, num_blocks), (4 * width, num_blocks)]\n", + "\n", + " self.num_classes = outputs\n", + "\n", + " # Initial convolution.\n", + " current_filters = plan[0][0]\n", + " self.conv = nn.Conv2d(\n", + " 3, current_filters, kernel_size=3, stride=1, padding=1, bias=False\n", + " )\n", + " self.bn = nn.BatchNorm2d(current_filters)\n", + " self.relu = nn.ReLU(inplace=True)\n", + "\n", + " # The subsequent blocks of the ResNet.\n", + " blocks = []\n", + " for segment_index, (filters, num_blocks) in enumerate(plan):\n", + " for block_index in range(num_blocks):\n", + " downsample = segment_index > 0 and block_index == 0\n", + " blocks.append(Block(current_filters, filters, downsample))\n", + " current_filters = filters\n", + "\n", + " self.blocks = nn.Sequential(*blocks)\n", + "\n", + " # Final fc layer. 
Size = number of filters in last segment.\n", + " self.fc = nn.Linear(plan[-1][0], outputs)\n", + " self.criterion = nn.CrossEntropyLoss()\n", + "\n", + " def forward(self, x: torch.Tensor):\n", + " out = self.relu(self.bn(self.conv(x)))\n", + " out = self.blocks(out)\n", + " out = F.avg_pool2d(out, out.size()[3])\n", + " out = out.view(out.size(0), -1)\n", + " out = self.fc(out)\n", + " return out\n", + "\n", + "\n", + "model = ComposerClassifier(module=ResNetCIFAR(), num_classes=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y0U11Dl4Hv7A" + }, + "source": [ + "# Optimizer and Scheduler" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "2FeZ5zvwHv7A" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using a high value of `weight_decay=0.002` for the `DecoupledSGDW` optimizer. Are you sure you want to do this? Your model's weights will be multiplied by 0.998 on every step!\n" + ] + } + ], + "source": [ + "optimizer = composer.optim.DecoupledSGDW(\n", + " model.parameters(), # Model parameters to update\n", + " lr=0.05, # Peak learning rate\n", + " momentum=0.9,\n", + " weight_decay=2.0e-3, # If this looks large, it's because it's not scaled by the LR as in non-decoupled weight decay\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P0HrODeWHv7B" + }, + "source": [ + "We'll assume this is being run on Colab, which means training for hundreds of epochs would take a very long time. Instead we'll train our baseline model for three epochs. The first epoch will be linear warmup, followed by two epochs of constant LR.
We achieve this by instantiating a `LinearWithWarmupScheduler` class.\n", + "\n", + "**Note**: Composer provides a handful of different [schedulers][schedulers] to help customize your training!\n", + "\n", + "[schedulers]: https://docs.mosaicml.com/projects/composer/en/stable/trainer/schedulers.html" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "rFYJ6aq4Hv7B" + }, + "outputs": [], + "source": [ + "lr_scheduler = composer.optim.LinearWithWarmupScheduler(\n", + " t_warmup=\"1ep\", # Warm up over 1 epoch\n", + " alpha_i=1.0, # Flat LR schedule achieved by having alpha_i == alpha_f\n", + " alpha_f=1.0,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AL1-0ou8Hv7B" + }, + "source": [ + "# Logging to Comet" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "iDmoWdoDHv7B" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[1;38;5;214mCOMET WARNING:\u001b[0m As you are running in a Jupyter environment, you will need to call `experiment.end()` when finished to ensure all metrics and code are logged before exiting.\n", + "\u001b[1;38;5;39mCOMET INFO:\u001b[0m Experiment is live on comet.com https://www.comet.com/lothiraldan/comet-example-mosaicml-getting-started/a10fc0bc760a4b41adbaa173fb313763\n", + "\n" + ] + } + ], + "source": [ + "# \"baseline\" = no algorithms (which is what we're doing now)\n", + "logger_for_baseline = CometMLLogger()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mgjj3zJkHv7C" + }, + "source": [ + "# Train a Baseline Model" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "eN4u6t49Hv7C" + }, + "outputs": [], + "source": [ + "train_epochs = (\n", + " \"3ep\" # Train for 3 epochs because we're assuming Colab environment and hardware\n", + ")\n", + "device = \"gpu\" if torch.cuda.is_available() else \"cpu\" # select the device\n", + "\n", + "trainer = 
composer.trainer.Trainer(\n", + " model=model,\n", + " train_dataloader=train_dataloader,\n", + " eval_dataloader=test_dataloader,\n", + " max_duration=train_epochs,\n", + " optimizers=optimizer,\n", + " schedulers=lr_scheduler,\n", + " device=device,\n", + " loggers=logger_for_baseline,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zcgAbdKzHv7C" + }, + "source": [ + "We train and measure the training time below." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "Ft3IyHMnHv7D" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "******************************\n", + "Config:\n", + "composer_commit_hash: None\n", + "composer_version: 0.23.5\n", + "node_name: unknown because NODENAME environment variable not set\n", + "num_cpus_per_node: 1\n", + "num_nodes: 1\n", + "rank_zero_seed: 3481102868\n", + "\n", + "******************************\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ba035fe365a04135a09b4bc6e6efb4b0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "train Epoch 0: 0%|| 0/49 [00:00" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KkQp_qJPHv7E" + }, + "source": [ + "One of the things we're most excited about at MosaicML is our arsenal of speed-up [algorithms][algorithms]. We used these algorithms to [speed up training of ResNet-50 on ImageNet by up to 7.6x][explorer]. Let's try applying a few algorithms to make our ResNet-56 more efficient.\n", + "\n", + "Before we jump in, here's a quick primer on Composer speed-up algorithms. Each one is implemented as an `Algorithm` class, which basically just adds some structure that controls what happens when the algorithm is applied and when in the training loop it should be applied. 
Adding a particular algorithm into the training loop is as simple as creating an instance of it (using args/kwargs to set any hyperparameters) and passing it to the `Trainer` during initialization. We'll see that in action below...\n", + "\n", + "For our first algorithm here, let's start with [Label Smoothing][label_smoothing], which serves as a form of regularization by interpolating between the target distribution and another distribution that usually has higher entropy.\n", + "\n", + "[algorithms]: https://docs.mosaicml.com/projects/composer/en/stable/trainer/algorithms.html\n", + "[explorer]: https://app.mosaicml.com/explorer/imagenet\n", + "[label_smoothing]: https://docs.mosaicml.com/projects/composer/en/stable/method_cards/label_smoothing.html" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "aQdpLWcMHv7E" + }, + "outputs": [], + "source": [ + "label_smoothing = composer.algorithms.LabelSmoothing(\n", + " 0.1\n", + ") # We're creating an instance of the LabelSmoothing algorithm class" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oRvNhLP_Hv7E" + }, + "source": [ + "Let's also use [BlurPool][blurpool], which increases accuracy by applying a spatial low-pass filter before the pool in max pooling and whenever using a strided convolution.\n", + "\n", + "[blurpool]: https://docs.mosaicml.com/projects/composer/en/stable/method_cards/blurpool.html" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "t-NcUB1-Hv7F" + }, + "outputs": [], + "source": [ + "blurpool = composer.algorithms.BlurPool(\n", + " replace_convs=True, # Blur before convs\n", + " replace_maxpools=True, # Blur before max-pools\n", + " blur_first=True, # Blur before conv/max-pool\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ehzs9JQ4Hv7F" + }, + "source": [ + "Our final algorithm in our improved training recipe is [Progressive Image Resizing][progressive_image_resizing]. 
Progressive Image Resizing initially shrinks the size of training images and slowly scales them back to their full size over the course of training. It increases throughput during the early phase of training, when the network may learn coarse-grained features that do not require the details lost by reducing image resolution.\n", + "\n", + "[progressive_image_resizing]: https://docs.mosaicml.com/projects/composer/en/stable/method_cards/progressive_resizing.html" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "_SoKO96oHv7F" + }, + "outputs": [], + "source": [ + "prog_resize = composer.algorithms.ProgressiveResizing(\n", + " initial_scale=0.6, # Size of images at the beginning of training = .6 * default image size\n", + " finetune_fraction=0.34, # Train on default size images for 0.34 of total training time.\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3VdfcA73Hv7F" + }, + "source": [ + "We'll assemble all our algorithms into a list to pass to our trainer." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "ueiSJ2rtHv7G" + }, + "outputs": [], + "source": [ + "algorithms = [label_smoothing, blurpool, prog_resize]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "V4wesG49Hv7G" + }, + "source": [ + "Now let's instantiate our model, optimizer, logger, and trainer again. No need to instantiate our scheduler again because it's stateless!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "kfM9asQKHv7G" + }, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from composer.models import ComposerClassifier\n", + "\n", + "\n", + "class Block(nn.Module):\n", + " \"\"\"A ResNet block.\"\"\"\n", + "\n", + " def __init__(self, f_in: int, f_out: int, downsample: bool = False):\n", + " super(Block, self).__init__()\n", + "\n", + " stride = 2 if downsample else 1\n", + " self.conv1 = nn.Conv2d(\n", + " f_in, f_out, kernel_size=3, stride=stride, padding=1, bias=False\n", + " )\n", + " self.bn1 = nn.BatchNorm2d(f_out)\n", + " self.conv2 = nn.Conv2d(\n", + " f_out, f_out, kernel_size=3, stride=1, padding=1, bias=False\n", + " )\n", + " self.bn2 = nn.BatchNorm2d(f_out)\n", + " self.relu = nn.ReLU(inplace=True)\n", + "\n", + " # No parameters for shortcut connections.\n", + " if downsample or f_in != f_out:\n", + " self.shortcut = nn.Sequential(\n", + " nn.Conv2d(f_in, f_out, kernel_size=1, stride=2, bias=False),\n", + " nn.BatchNorm2d(f_out),\n", + " )\n", + " else:\n", + " self.shortcut = nn.Sequential()\n", + "\n", + " def forward(self, x: torch.Tensor):\n", + " out = self.relu(self.bn1(self.conv1(x)))\n", + " out = self.bn2(self.conv2(out))\n", + " out += self.shortcut(x)\n", + " return self.relu(out)\n", + "\n", + "\n", + "class ResNetCIFAR(nn.Module):\n", + " \"\"\"A residual neural network as originally designed for CIFAR-10.\"\"\"\n", + "\n", + " def __init__(self, outputs: int = 10):\n", + " super(ResNetCIFAR, self).__init__()\n", + "\n", + " depth = 56\n", + " width = 16\n", + " num_blocks = (depth - 2) // 6\n", + "\n", + " plan = [(width, num_blocks), (2 * width, num_blocks), (4 * width, num_blocks)]\n", + "\n", + " self.num_classes = outputs\n", + "\n", + " # Initial convolution.\n", + " current_filters = plan[0][0]\n", + " self.conv = nn.Conv2d(\n", + " 3, current_filters, kernel_size=3, stride=1, 
padding=1, bias=False\n", + " )\n", + " self.bn = nn.BatchNorm2d(current_filters)\n", + " self.relu = nn.ReLU(inplace=True)\n", + "\n", + " # The subsequent blocks of the ResNet.\n", + " blocks = []\n", + " for segment_index, (filters, num_blocks) in enumerate(plan):\n", + " for block_index in range(num_blocks):\n", + " downsample = segment_index > 0 and block_index == 0\n", + " blocks.append(Block(current_filters, filters, downsample))\n", + " current_filters = filters\n", + "\n", + " self.blocks = nn.Sequential(*blocks)\n", + "\n", + " # Final fc layer. Size = number of filters in last segment.\n", + " self.fc = nn.Linear(plan[-1][0], outputs)\n", + " self.criterion = nn.CrossEntropyLoss()\n", + "\n", + " def forward(self, x: torch.Tensor):\n", + " out = self.relu(self.bn(self.conv(x)))\n", + " out = self.blocks(out)\n", + " out = F.avg_pool2d(out, out.size()[3])\n", + " out = out.view(out.size(0), -1)\n", + " out = self.fc(out)\n", + " return out\n", + "\n", + "\n", + "model = ComposerClassifier(module=ResNetCIFAR(), num_classes=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "id": "hTjHl8nPHv7H" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[1;38;5;214mCOMET WARNING:\u001b[0m As you are running in a Jupyter environment, you will need to call `experiment.end()` when finished to ensure all metrics and code are logged before exiting.\n", + "\u001b[1;38;5;39mCOMET INFO:\u001b[0m Experiment is live on comet.com https://www.comet.com/lothiraldan/comet-example-mosaicml-getting-started/2da0b5f6b86f450b811be4c672fb9f48\n", + "\n", + "You are using a high value of `weight_decay=0.002` for the `DecoupledSGDW` optimizer. Are you sure you want to do this? 
Your model's weights will be multiplied by 0.998 on every step!\n" + ] + } + ], + "source": [ + "logger_for_algorithm_run = CometMLLogger()\n", + "\n", + "optimizer = composer.optim.DecoupledSGDW(\n", + " model.parameters(), lr=0.05, momentum=0.9, weight_decay=2.0e-3\n", + ")\n", + "\n", + "trainer = composer.trainer.Trainer(\n", + " model=model,\n", + " train_dataloader=train_dataloader,\n", + " eval_dataloader=test_dataloader,\n", + " max_duration=train_epochs,\n", + " optimizers=optimizer,\n", + " schedulers=lr_scheduler,\n", + " device=device,\n", + " loggers=logger_for_algorithm_run,\n", + " algorithms=algorithms, # Adding algorithms this time!\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9SIxIQmCHv7H" + }, + "source": [ + "And let's get training!" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "id": "1ulP2b0FHv7H" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "******************************\n", + "Config:\n", + "blurpool/num_blurconv_layers: 4\n", + "blurpool/num_blurpool_layers: 0\n", + "composer_commit_hash: None\n", + "composer_version: 0.23.5\n", + "enabled_algorithms/BlurPool: true\n", + "enabled_algorithms/LabelSmoothing: true\n", + "enabled_algorithms/ProgressiveResizing: true\n", + "node_name: unknown because NODENAME environment variable not set\n", + "num_cpus_per_node: 1\n", + "num_nodes: 1\n", + "rank_zero_seed: 4104564262\n", + "\n", + "******************************\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3104881ca91144738114705f27cd9d0a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "train Epoch 0: 0%|| 0/49 [00:00